From e8453d397e2ecf71522e7a920c27b4045d927e00 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 5 Feb 2025 15:06:18 +0100 Subject: [PATCH 01/11] first attempt at enabling two ways to submit/receive jobs --- README.md | 14 +++++++ app.cfg.example | 13 ++++++- eessi_bot_event_handler.py | 5 +++ eessi_bot_job_manager.py | 80 ++++++++++++++++++++++++++++---------- tasks/build.py | 23 ++++++++++- tools/config.py | 9 +++++ 6 files changed, 120 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 157e5e7..3ddf081 100644 --- a/README.md +++ b/README.md @@ -375,6 +375,20 @@ package repositories. Typically these settings are set in the prologue of a Slurm job. However, when entering the [EESSI compatibility layer](https://www.eessi.io/docs/compatibility_layer), most environment settings are cleared. Hence, they need to be set again at a later stage. +``` +job_handover_protocol = hold_release +``` +The `job_handover_protocol` setting defines which method is used to handover a +job from the event handler to the job manager. Values are + - `hold_release` (job is submitted with `--hold`, job manager removes the hold + with `scontrol release`) + - `delayed_begin` (job is submitted with `--begin=now+(5 * poll_interval)` and + any `--hold` is removed from the submission parameters); see setting + `poll_interval` further below; this is useful if the + bot account cannot run `scontrol release` to remove the hold of the job; + also, the status update in the PR comment of the job is extended by noting + the `EligibleTime` + ``` job_name = JOB_NAME ``` diff --git a/app.cfg.example b/app.cfg.example index 1958121..5ceec8c 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -88,6 +88,17 @@ container_cachedir = PATH_TO_SHARED_DIRECTORY # http_proxy = http://PROXY_DNS:3128/ # https_proxy = http://PROXY_DNS:3128/ +# The job_handover_protocol setting defines which method is used to handover a +# job from the event handler to the job manager. 
Values are +# - hold_release (job is submitted with '--hold', job manager removes the hold +# with 'scontrol release') +# - delayed_begin (job is submitted with '--begin=now+(5 * poll_interval)' and +# any '--hold' is removed from the submission parameters); this is useful if the +# bot account cannot run 'scontrol release' to remove the hold of the job; +# also, the status update in the PR comment of the job is extended by noting +# the 'EligibleTime' +job_handover_protocol = hold_release + # Used to give all jobs of a bot instance the same name. Can be used to allow # multiple bot instances running on the same Slurm cluster. job_name = prod @@ -253,7 +264,7 @@ with_accelerator =  and accelerator `{accelerator}` [new_job_comments] -awaits_launch = job awaits launch by Slurm scheduler +awaits_launch = job awaits launch by Slurm scheduler{extra_info} [running_job_comments] running_job = job `{job_id}` is running diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index f5b05d1..445f982 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -56,6 +56,7 @@ # config.BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS, # optional # config.BUILDENV_SETTING_HTTPS_PROXY, # optional # config.BUILDENV_SETTING_HTTP_PROXY, # optional + config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL, # required config.BUILDENV_SETTING_JOB_NAME, # required config.BUILDENV_SETTING_JOBS_BASE_DIR, # required # config.BUILDENV_SETTING_LOAD_MODULES, # optional @@ -92,6 +93,10 @@ config.GITHUB_SETTING_APP_NAME, # required config.GITHUB_SETTING_INSTALLATION_ID, # required config.GITHUB_SETTING_PRIVATE_KEY], # required + # the poll interval setting is required for the alternative job handover + # protocol (delayed_begin) + config.SECTION_JOB_MANAGER: [ + config.JOB_MANAGER_SETTING_POLL_INTERVAL], # required config.SECTION_REPO_TARGETS: [ config.REPO_TARGETS_SETTING_REPO_TARGET_MAP, # required config.REPO_TARGETS_SETTING_REPOS_CFG_DIR], # required diff --git 
a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index bb0c6dd..fffd179 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -51,6 +51,7 @@ # settings that are required in 'app.cfg' REQUIRED_CONFIG = { config.SECTION_BUILDENV: [ + config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL, # required config.BUILDENV_SETTING_JOB_NAME], # required config.SECTION_FINISHED_JOB_COMMENTS: [ config.FINISHED_JOB_COMMENTS_SETTING_JOB_RESULT_UNKNOWN_FMT, # required @@ -91,6 +92,9 @@ def __init__(self): self.job_name = buildenv_cfg.get(config.BUILDENV_SETTING_JOB_NAME) if self.job_name and len(self.job_name) < 3: raise Exception(f"job name ({self.job_name}) is shorter than 3 characters") + self.job_handover_protocol = buildenv_cfg.get(config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL) + if self.job_handover_protocol not in config.JOB_HANDOVER_PROTOCOLS_SET: + raise Exception(f"job handover protocol ({self.job_handover_protocol}) is unknown") def get_current_jobs(self): """ @@ -256,6 +260,25 @@ def determine_finished_jobs(self, known_jobs, current_jobs): return finished_jobs + def parse_scontrol_show_job_output(self, output): + """ + The output of 'scontrol --oneliner show job' is a list of key=value pairs + separated by whitespaces. 
+ + Args: + output (string): the output of the scontrol command + + Returns: + (dict): Returns a dictionary of the key-value pairs + """ + job_info = {} + stripped_output = output.strip() + for pair in stripped_output.split(): + key, value = pair.split('=', 1) + job_info[key] = value + + return job_info + def process_new_job(self, new_job): """ Process a new job by verifying that it is a bot job and if so @@ -283,19 +306,20 @@ def process_new_job(self, new_job): log_file=self.logfile, ) - # parse output of 'scontrol_cmd' to determine the job's working - # directory - match = re.search(r".* WorkDir=(\S+) .*", - str(scontrol_output)) - if match: + # parse output of 'scontrol_cmd' + job_info = parse_scontrol_show_job_output(str(scontrol_output)) + + # check if job_info contains 'WorkDir', if not we cannot process the job + # further + if 'WorkDir' in job_info: log( "process_new_job(): work dir of job %s: '%s'" - % (job_id, match.group(1)), + % (job_id, job_info['WorkDir']), self.logfile, ) job_metadata_path = "%s/_bot_job%s.metadata" % ( - match.group(1), + job_info['WorkDir'], job_id, ) @@ -313,21 +337,34 @@ def process_new_job(self, new_job): symlink_source = os.path.join(self.submitted_jobs_dir, job_id) log( "process_new_job(): create a symlink: %s -> %s" - % (symlink_source, match.group(1)), + % (symlink_source, job_info['WorkDir']), self.logfile, ) - os.symlink(match.group(1), symlink_source) - - release_cmd = "%s release %s" % ( - self.scontrol_command, - job_id, - ) + os.symlink(job_info['WorkDir'], symlink_source) + + # handle different job handover protocols + # *_HOLD_RELEASE: job was submitted with '--hold' and shall be + # released with 'scontrol release JOB_ID' + # *_DELAYED_BEGIN: job was submitted with '--begin=now+SOMEDELAY', + # no extra action is needed + job_status = '' + extra_info = '' + if self.job_handover_protocol == config.JOB_HANDOVER_HOLD_RELEASE: + release_cmd = "%s release %s" % ( + self.scontrol_command, + job_id, + ) - release_output, 
release_err, release_exitcode = run_cmd( - release_cmd, - "process_new_job(): scontrol command", - log_file=self.logfile, - ) + release_output, release_err, release_exitcode = run_cmd( + release_cmd, + "process_new_job(): scontrol command", + log_file=self.logfile, + ) + job_status = 'released' + extra_info = '' + elif self.job_handover_protocol == config.JOB_HANDOVER_DELAYED_BEGIN: + job_status = 'received' + extra_info = " (eligible to start from {job_info['EligibleTime'})" # update PR defined by repo and pr_number stored in the job's # metadata file @@ -356,8 +393,9 @@ def process_new_job(self, new_job): if "comment_id" in new_job: new_job_comments_cfg = config.read_config()[config.SECTION_NEW_JOB_COMMENTS] dt = datetime.now(timezone.utc) - update = "\n|%s|released|" % dt.strftime("%b %d %X %Z %Y") - update += f"{new_job_comments_cfg[config.NEW_JOB_COMMENTS_SETTING_AWAITS_LAUNCH]}|" + update = "\n|%s|%s|" % (dt.strftime("%b %d %X %Z %Y"), job_status) + description_col_fmt = new_job_comments_cfg[config.NEW_JOB_COMMENTS_SETTING_AWAITS_LAUNCH] + update += f"{description_col_fmt.format(extra_info=extra_info)}|" update_comment(new_job["comment_id"], pr, update) else: log( diff --git a/tasks/build.py b/tasks/build.py index 0ddcf61..70f857a 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -102,9 +102,28 @@ def get_build_env_cfg(cfg): log(f"{fn}(): submit_command '{submit_command}'") config_data[config.BUILDENV_SETTING_SUBMIT_COMMAND] = submit_command + job_handover_protocol = buildenv.get(config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL) slurm_params = buildenv.get(config.BUILDENV_SETTING_SLURM_PARAMS) - # always submit jobs with hold set, so job manager can release them - slurm_params += ' --hold' + if job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_HOLD_RELEASE: + # always submit jobs with hold set, so job manager can release them + slurm_params += ' --hold' + elif job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_DELAYED_BEGIN: + # alternative method to 
submit without '--hold' and + # '--begin=now+5*poll_interval' instead + # 1. remove '--hold' if any + # 2. add '--begin=now+5*poll_interval' + slurm_params = slurm_params.replace('--hold', '') + job_manger_cfg = cfg[config.SECTION_JOB_MANAGER] + poll_interval = int(job_manger_cfg.get(config.JOB_MANAGER_SETTING_POLL_INTERVAL)) + slurm_params += f' --begin=now+{5 * poll_interval}' + else: + slurm_params += ' --hold' + log( + f"{fn}(): unknown job handover protocol in app.cfg" + f" ('{config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL} = {job_handover_protocol}');" + f" added '--hold' as default" + ) + log(f"{fn}(): slurm_params '{slurm_params}'") config_data[config.BUILDENV_SETTING_SLURM_PARAMS] = slurm_params diff --git a/tools/config.py b/tools/config.py index 60554be..a40af38 100644 --- a/tools/config.py +++ b/tools/config.py @@ -46,6 +46,7 @@ BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS = 'cvmfs_customizations' BUILDENV_SETTING_HTTPS_PROXY = 'https_proxy' BUILDENV_SETTING_HTTP_PROXY = 'http_proxy' +BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL = 'job_handover_protocol' BUILDENV_SETTING_JOB_NAME = 'job_name' BUILDENV_SETTING_JOBS_BASE_DIR = 'jobs_base_dir' BUILDENV_SETTING_LOAD_MODULES = 'load_modules' @@ -114,6 +115,14 @@ CLEAN_UP_SETTING_TRASH_BIN_ROOT_DIR = 'trash_bin_dir' CLEAN_UP_SETTING_MOVED_JOB_DIRS_COMMENT = 'moved_job_dirs_comment' +# definition of values +JOB_HANDOVER_PROTOCOL_DELAYED_BEGIN = 'job_handover_protocol:delayed_begin' +JOB_HANDOVER_PROTOCOL_HOLD_RELEASE = 'job_handover_protocol:hold_release' +JOB_HANDOVER_PROTOCOLS_SET = { + JOB_HANDOVER_PROTOCOL_DELAYED_BEGIN, + JOB_HANDOVER_PROTOCOL_HOLD_RELEASE +} + def read_config(path='app.cfg'): """ From 7cc10df34600a1bd0055ece300f6d25eb7ab709d Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 5 Feb 2025 15:18:23 +0100 Subject: [PATCH 02/11] fix hound issue --- eessi_bot_job_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 
fffd179..9d596d0 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -307,7 +307,7 @@ def process_new_job(self, new_job): ) # parse output of 'scontrol_cmd' - job_info = parse_scontrol_show_job_output(str(scontrol_output)) + job_info = self.parse_scontrol_show_job_output(str(scontrol_output)) # check if job_info contains 'WorkDir', if not we cannot process the job # further From 9f67d490cf9863ac426df1a0c5f51eb0ce5a0334 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 6 Feb 2025 20:20:51 +0100 Subject: [PATCH 03/11] fix job handover protocol values --- tools/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/config.py b/tools/config.py index a40af38..12c2930 100644 --- a/tools/config.py +++ b/tools/config.py @@ -116,8 +116,8 @@ CLEAN_UP_SETTING_MOVED_JOB_DIRS_COMMENT = 'moved_job_dirs_comment' # definition of values -JOB_HANDOVER_PROTOCOL_DELAYED_BEGIN = 'job_handover_protocol:delayed_begin' -JOB_HANDOVER_PROTOCOL_HOLD_RELEASE = 'job_handover_protocol:hold_release' +JOB_HANDOVER_PROTOCOL_DELAYED_BEGIN = 'delayed_begin' +JOB_HANDOVER_PROTOCOL_HOLD_RELEASE = 'hold_release' JOB_HANDOVER_PROTOCOLS_SET = { JOB_HANDOVER_PROTOCOL_DELAYED_BEGIN, JOB_HANDOVER_PROTOCOL_HOLD_RELEASE From 1898e18783c94d04c783a160288bf75ca501d08d Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 6 Feb 2025 21:31:52 +0100 Subject: [PATCH 04/11] update test data to include job_handover_protocol setting --- tests/test_app.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_app.cfg b/tests/test_app.cfg index 43e11bf..a3b2379 100644 --- a/tests/test_app.cfg +++ b/tests/test_app.cfg @@ -12,6 +12,7 @@ # sample config file for tests (some functions run config.read_config() # which reads app.cfg by default) [buildenv] +job_handover_protocol = hold_release [job_manager] From 782a86235fcb9757b38a0a7dc339edf61320f184 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 7 Feb 2025 19:56:11 +0100 Subject: [PATCH 05/11] 
use correct constant for handover protocols --- eessi_bot_job_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 9d596d0..4fcf9af 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -349,7 +349,7 @@ def process_new_job(self, new_job): # no extra action is needed job_status = '' extra_info = '' - if self.job_handover_protocol == config.JOB_HANDOVER_HOLD_RELEASE: + if self.job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_HOLD_RELEASE: release_cmd = "%s release %s" % ( self.scontrol_command, job_id, @@ -362,7 +362,7 @@ def process_new_job(self, new_job): ) job_status = 'released' extra_info = '' - elif self.job_handover_protocol == config.JOB_HANDOVER_DELAYED_BEGIN: + elif self.job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_DELAYED_BEGIN: job_status = 'received' extra_info = " (eligible to start from {job_info['EligibleTime'})" From ddfb729f8cae0219e3eb21b924e074150270a73a Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 12 Feb 2025 06:27:49 +0100 Subject: [PATCH 06/11] add setting to define the job delay factor --- README.md | 10 ++++++++++ app.cfg.example | 8 ++++++++ eessi_bot_event_handler.py | 1 + tasks/build.py | 8 +++++--- tools/config.py | 1 + 5 files changed, 25 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3ddf081..518b00f 100644 --- a/README.md +++ b/README.md @@ -375,6 +375,16 @@ package repositories. Typically these settings are set in the prologue of a Slurm job. However, when entering the [EESSI compatibility layer](https://www.eessi.io/docs/compatibility_layer), most environment settings are cleared. Hence, they need to be set again at a later stage. 
+``` +job_delay_begin_factor = 2 +``` +The `job_delay_begin_factor` setting defines how many times the `poll_interval` a +job's begin (EligibleTime) from now should be delayed if the handover protocol +is set to `delayed_begin` (see setting `job_handover_protocol`). That is, if +the `job_delay_begin_factor` is set to five (5) the delay time is calculated as +5 * `poll_interval`. The event handler would use 2 as the default value when +submitting jobs. + ``` job_handover_protocol = hold_release ``` diff --git a/app.cfg.example b/app.cfg.example index 5ceec8c..3cdb3b1 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -88,6 +88,14 @@ container_cachedir = PATH_TO_SHARED_DIRECTORY # http_proxy = http://PROXY_DNS:3128/ # https_proxy = http://PROXY_DNS:3128/ +# The job_delay_begin_factor setting defines how many times the poll_interval a +# job's begin (EligibleTime) from now should be delayed if the handover protocol +# is set to `delayed_begin` (see setting `job_handover_protocol`). That is, if +# the job_delay_begin_factor is set to five (5) the delay time is calculated as +# 5 * poll_interval. The event handler would use 2 as the default factor when +# submitting jobs. +job_delay_begin_factor = 2 + # The job_handover_protocol setting defines which method is used to handover a # job from the event handler to the job manager. 
Values are # - hold_release (job is submitted with '--hold', job manager removes the hold diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index 445f982..9422bb7 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -56,6 +56,7 @@ # config.BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS, # optional # config.BUILDENV_SETTING_HTTPS_PROXY, # optional # config.BUILDENV_SETTING_HTTP_PROXY, # optional + # config.BUILDENV_SETTING_JOB_DELAY_BEGIN_FACTOR, # optional (default: 2) config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL, # required config.BUILDENV_SETTING_JOB_NAME, # required config.BUILDENV_SETTING_JOBS_BASE_DIR, # required diff --git a/tasks/build.py b/tasks/build.py index 70f857a..a497a6a 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -109,13 +109,15 @@ def get_build_env_cfg(cfg): slurm_params += ' --hold' elif job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_DELAYED_BEGIN: # alternative method to submit without '--hold' and - # '--begin=now+5*poll_interval' instead + # '--begin=now+factor*poll_interval' instead # 1. remove '--hold' if any - # 2. add '--begin=now+5*poll_interval' + # 2. 
add '--begin=now+factor*poll_interval' + # factor defined by setting 'job_delay_begin_factor' (default: 2) slurm_params = slurm_params.replace('--hold', '') job_manger_cfg = cfg[config.SECTION_JOB_MANAGER] poll_interval = int(job_manger_cfg.get(config.JOB_MANAGER_SETTING_POLL_INTERVAL)) - slurm_params += f' --begin=now+{5 * poll_interval}' + job_delay_begin_factor = buildenv.get(config.BUILDENV_SETTING_JOB_DELAY_BEGIN_FACTOR, 2) + slurm_params += f' --begin=now+{job_delay_begin_factor * poll_interval}' else: slurm_params += ' --hold' log( diff --git a/tools/config.py b/tools/config.py index 12c2930..2d49755 100644 --- a/tools/config.py +++ b/tools/config.py @@ -46,6 +46,7 @@ BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS = 'cvmfs_customizations' BUILDENV_SETTING_HTTPS_PROXY = 'https_proxy' BUILDENV_SETTING_HTTP_PROXY = 'http_proxy' +BUILDENV_SETTING_JOB_DELAY_BEGIN_FACTOR = 'job_delay_begin_factor' BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL = 'job_handover_protocol' BUILDENV_SETTING_JOB_NAME = 'job_name' BUILDENV_SETTING_JOBS_BASE_DIR = 'jobs_base_dir' From dc5c2892acf9be7ad8e0f8a89c14921e55249690 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 12 Feb 2025 09:23:12 +0100 Subject: [PATCH 07/11] enable different messages for different job handover protocols --- README.md | 18 ++++++++++++++ app.cfg.example | 2 ++ eessi_bot_event_handler.py | 4 ++- tasks/build.py | 50 +++++++++++++++++++++++++++++--------- tools/config.py | 3 +++ 5 files changed, 64 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 518b00f..cd94e61 100644 --- a/README.md +++ b/README.md @@ -678,12 +678,30 @@ scontrol_command = /usr/bin/scontrol #### `[submitted_job_comments]` section The `[submitted_job_comments]` section specifies templates for messages about newly submitted jobs. 
+ +DEPRECATED setting (use `awaits_release_delayed_begin_msg` and/or `awaits_release_hold_release_msg`) ``` awaits_release = job id `{job_id}` awaits release by job manager ``` `awaits_release` is used to provide a status update of a job (shown as a row in the job's status table). +``` +awaits_release_delayed_begin_msg = job id `{job_id}` will be eligible to start in about {delay_seconds} seconds +``` +`awaits_release_delayed_begin_msg` is used when the `job_handover_protocol` is +set to `delayed_begin`. Note, both `{job_id}` and `{delay_seconds}` need to be +present in the value or the event handler will throw an exception when formatting +the update of the PR comment corresponding to the job. + +``` +awaits_release_hold_release_msg = job id `{job_id}` awaits release by job manager +``` +`awaits_release_hold_release_msg` is used when the `job_handover_protocol` is +set to `hold_release`. Note, `{job_id}` needs to be present in the value or the +event handler will throw an exception when formatting the update of the PR +comment corresponding to the job. + ``` initial_comment = New job on instance `{app_name}` for architecture `{arch_name}`{accelerator_spec} for repository `{repo_id}` in job dir `{symlink}` ``` diff --git a/app.cfg.example b/app.cfg.example index 3cdb3b1..6941444 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -267,6 +267,8 @@ scontrol_command = /usr/bin/scontrol # information. 
[submitted_job_comments] awaits_release = job id `{job_id}` awaits release by job manager +awaits_release_delayed_begin_msg = job id `{job_id}` will be eligible to start in about {delay_seconds} seconds +awaits_release_hold_release_msg = job id `{job_id}` awaits release by job manager initial_comment = New job on instance `{app_name}` for CPU micro-architecture `{arch_name}`{accelerator_spec} for repository `{repo_id}` in job dir `{symlink}` with_accelerator =  and accelerator `{accelerator}` diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index 9422bb7..2c22a67 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -103,7 +103,9 @@ config.REPO_TARGETS_SETTING_REPOS_CFG_DIR], # required config.SECTION_SUBMITTED_JOB_COMMENTS: [ config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT, # required - config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE, # required + # config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE, # optional + config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_DELAYED_BEGIN_MSG, # required + config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_HOLD_RELEASE_MSG, # required config.SUBMITTED_JOB_COMMENTS_SETTING_WITH_ACCELERATOR], # required } diff --git a/tasks/build.py b/tasks/build.py index a497a6a..02c6b3c 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -914,18 +914,44 @@ def create_pr_comment(job, job_id, app_name, pr, gh, symlink): dt = datetime.now(timezone.utc) # construct initial job comment - job_comment = (f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT]}" - f"\n|date|job status|comment|\n" - f"|----------|----------|------------------------|\n" - f"|{dt.strftime('%b %d %X %Z %Y')}|" - f"submitted|" - f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE]}|").format( - app_name=app_name, - arch_name=arch_name, - symlink=symlink, - repo_id=job.repo_id, - job_id=job_id, - accelerator_spec=accelerator_spec_str) + buildenv = 
config.read_config()[config.SECTION_BUILDENV] + job_handover_protocol = buildenv.get(config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL) + if job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_DELAYED_BEGIN: + release_msg_string = config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_DELAYED_BEGIN_MSG + release_comment_template = submitted_job_comments_cfg[release_msg_string] + # calculate delay from poll_interval and delay_factor + job_manager_cfg = config.read_config()[config.SECTION_JOB_MANAGER] + poll_interval = job_manager_cfg.get(config.JOB_MANAGER_SETTING_POLL_INTERVAL) + delay_factor = buildenv.get(config.BUILDENV_SETTING_JOB_DELAY_BEGIN_FACTOR, 2) + eligible_in_seconds = poll_interval * delay_factor + job_comment = (f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT]}" + f"\n|date|job status|comment|\n" + f"|----------|----------|------------------------|\n" + f"|{dt.strftime('%b %d %X %Z %Y')}|" + f"submitted|" + f"{release_comment_template}|").format( + app_name=app_name, + arch_name=arch_name, + symlink=symlink, + repo_id=job.repo_id, + job_id=job_id, + delayed_seconds=eligible_in_seconds, + accelerator_spec=accelerator_spec_str) + else: + release_msg_string = config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_HOLD_RELEASE_MSG + release_comment_template = submitted_job_comments_cfg[release_msg_string] + job_comment = (f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT]}" + f"\n|date|job status|comment|\n" + f"|----------|----------|------------------------|\n" + f"|{dt.strftime('%b %d %X %Z %Y')}|" + f"submitted|" + f"{release_comment_template}|").format( + app_name=app_name, + arch_name=arch_name, + symlink=symlink, + repo_id=job.repo_id, + job_id=job_id, + accelerator_spec=accelerator_spec_str) # create comment to pull request repo_name = pr.base.repo.full_name diff --git a/tools/config.py b/tools/config.py index 2d49755..993fce1 100644 --- a/tools/config.py +++ b/tools/config.py @@ 
-108,7 +108,10 @@ RUNNING_JOB_COMMENTS_SETTING_RUNNING_JOB = 'running_job' SECTION_SUBMITTED_JOB_COMMENTS = 'submitted_job_comments' +# SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE is DEPRECATED SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE = 'awaits_release' +SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_DELAYED_BEGIN_MSG = 'awaits_release_delayed_begin_msg' +SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_HOLD_RELEASE_MSG = 'awaits_release_hold_release_msg' SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT = 'initial_comment' SUBMITTED_JOB_COMMENTS_SETTING_WITH_ACCELERATOR = 'with_accelerator' From 250291a18be9e73c90f69e6f4ed50f3e1aa64eaf Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 12 Feb 2025 09:30:34 +0100 Subject: [PATCH 08/11] make the hound happy --- eessi_bot_event_handler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index 2c22a67..a627c61 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -104,8 +104,8 @@ config.SECTION_SUBMITTED_JOB_COMMENTS: [ config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT, # required # config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE, # optional - config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_DELAYED_BEGIN_MSG, # required - config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_HOLD_RELEASE_MSG, # required + config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_DELAYED_BEGIN_MSG, # required + config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_HOLD_RELEASE_MSG, # required config.SUBMITTED_JOB_COMMENTS_SETTING_WITH_ACCELERATOR], # required } From 809a6020c58e27c6cea5d3dcb53dc3fad5fb68a7 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 13 Feb 2025 07:11:37 +0100 Subject: [PATCH 09/11] add required settings to test_app.cfg --- tests/test_app.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_app.cfg b/tests/test_app.cfg index a3b2379..84161ba 100644 --- a/tests/test_app.cfg +++ 
b/tests/test_app.cfg @@ -19,6 +19,8 @@ job_handover_protocol = hold_release # variable 'comment' under 'submitted_job_comments' should not be changed as there are regular expression patterns matching it [submitted_job_comments] awaits_release = job id `{job_id}` awaits release by job manager +awaits_release_delayed_begin_msg = job id `{job_id}` will be eligible to start in about {delay_seconds} seconds +awaits_release_hold_release_msg = job id `{job_id}` awaits release by job manager initial_comment = New job on instance `{app_name}` for CPU micro-architecture `{arch_name}`{accelerator_spec} for repository `{repo_id}` in job dir `{symlink}` with_accelerator =  and accelerator `{accelerator}` From b07c9b3222573119d42a0958d0954f066f3dd136 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 13 Feb 2025 20:56:08 +0100 Subject: [PATCH 10/11] don't delay jobs to 2095 :) --- tasks/build.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tasks/build.py b/tasks/build.py index 02c6b3c..8f966f1 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -116,7 +116,7 @@ def get_build_env_cfg(cfg): slurm_params = slurm_params.replace('--hold', '') job_manger_cfg = cfg[config.SECTION_JOB_MANAGER] poll_interval = int(job_manger_cfg.get(config.JOB_MANAGER_SETTING_POLL_INTERVAL)) - job_delay_begin_factor = buildenv.get(config.BUILDENV_SETTING_JOB_DELAY_BEGIN_FACTOR, 2) + job_delay_begin_factor = float(buildenv.get(config.BUILDENV_SETTING_JOB_DELAY_BEGIN_FACTOR, 2)) slurm_params += f' --begin=now+{job_delay_begin_factor * poll_interval}' else: slurm_params += ' --hold' @@ -921,8 +921,8 @@ def create_pr_comment(job, job_id, app_name, pr, gh, symlink): release_comment_template = submitted_job_comments_cfg[release_msg_string] # calculate delay from poll_interval and delay_factor job_manager_cfg = config.read_config()[config.SECTION_JOB_MANAGER] - poll_interval = job_manager_cfg.get(config.JOB_MANAGER_SETTING_POLL_INTERVAL) - delay_factor = 
buildenv.get(config.BUILDENV_SETTING_JOB_DELAY_BEGIN_FACTOR, 2) + poll_interval = int(job_manager_cfg.get(config.JOB_MANAGER_SETTING_POLL_INTERVAL)) + delay_factor = float(buildenv.get(config.BUILDENV_SETTING_JOB_DELAY_BEGIN_FACTOR, 2)) eligible_in_seconds = poll_interval * delay_factor job_comment = (f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT]}" f"\n|date|job status|comment|\n" @@ -935,7 +935,7 @@ def create_pr_comment(job, job_id, app_name, pr, gh, symlink): symlink=symlink, repo_id=job.repo_id, job_id=job_id, - delayed_seconds=eligible_in_seconds, + delay_seconds=eligible_in_seconds, accelerator_spec=accelerator_spec_str) else: release_msg_string = config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_HOLD_RELEASE_MSG From 156c963b2db032e65b84fa49eaa0d8e79787221d Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 13 Feb 2025 21:25:15 +0100 Subject: [PATCH 11/11] Slurm expects ints --- tasks/build.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tasks/build.py b/tasks/build.py index 8f966f1..3b1ee83 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -117,7 +117,7 @@ def get_build_env_cfg(cfg): job_manger_cfg = cfg[config.SECTION_JOB_MANAGER] poll_interval = int(job_manger_cfg.get(config.JOB_MANAGER_SETTING_POLL_INTERVAL)) job_delay_begin_factor = float(buildenv.get(config.BUILDENV_SETTING_JOB_DELAY_BEGIN_FACTOR, 2)) - slurm_params += f' --begin=now+{job_delay_begin_factor * poll_interval}' + slurm_params += f' --begin=now+{int(job_delay_begin_factor * poll_interval)}' else: slurm_params += ' --hold' log( @@ -923,7 +923,7 @@ def create_pr_comment(job, job_id, app_name, pr, gh, symlink): job_manager_cfg = config.read_config()[config.SECTION_JOB_MANAGER] poll_interval = int(job_manager_cfg.get(config.JOB_MANAGER_SETTING_POLL_INTERVAL)) delay_factor = float(buildenv.get(config.BUILDENV_SETTING_JOB_DELAY_BEGIN_FACTOR, 2)) - eligible_in_seconds = poll_interval * delay_factor + eligible_in_seconds = 
int(poll_interval * delay_factor) job_comment = (f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT]}" f"\n|date|job status|comment|\n" f"|----------|----------|------------------------|\n"