diff --git a/atlas/lib/idds/atlas/workflow/atlasactuatorwork.py b/atlas/lib/idds/atlas/workflow/atlasactuatorwork.py
index 8db8de40..ac34775a 100644
--- a/atlas/lib/idds/atlas/workflow/atlasactuatorwork.py
+++ b/atlas/lib/idds/atlas/workflow/atlasactuatorwork.py
@@ -30,7 +30,8 @@ class ATLASActuatorWork(ATLASCondorWork):
     def __init__(self, executable=None, arguments=None, parameters=None, setup=None,
                  work_tag='actuating', exec_type='local', sandbox=None, work_id=None, name=None,
-                 primary_input_collection=None, other_input_collections=None,
+                 primary_input_collection=None, other_input_collections=None, input_collections=None,
+                 primary_output_collection=None, other_output_collections=None,
                  output_collections=None, log_collections=None,
                  logger=None,
                  workload_id=None,
@@ -61,6 +62,9 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None,
                                                exec_type=exec_type, sandbox=sandbox, work_id=work_id,
                                                primary_input_collection=primary_input_collection,
                                                other_input_collections=other_input_collections,
+                                               primary_output_collection=primary_output_collection,
+                                               other_output_collections=other_output_collections,
+                                               input_collections=input_collections,
                                                output_collections=output_collections,
                                                log_collections=log_collections,
                                                logger=logger,
@@ -160,7 +164,7 @@ def poll_external_collection(self, coll):
 
     def get_input_collections(self):
         # return [self.primary_input_collection] + self.other_input_collections
-        colls = [self.primary_input_collection] + self.other_input_collections
+        colls = [self._primary_input_collection] + self._other_input_collections
         for coll_int_id in colls:
             coll = self.collections[coll_int_id]
             coll = self.poll_external_collection(coll)
@@ -173,7 +177,7 @@ def get_input_contents(self):
         """
         try:
             ret_files = []
-            coll = self.collections[self.primary_input_collection]
+            coll = self.collections[self._primary_input_collection]
             ret_file = {'coll_id': coll['coll_id'],
                         'scope': coll['scope'],
                         'name': coll['name'],
@@ -222,8 +226,8 @@ def get_new_input_output_maps(self, mapped_input_output_maps={}):
                 new_inputs.append(ip)
 
         # to avoid cheking new inputs if there are no new inputs anymore
-        if (not new_inputs and 'status' in self.collections[self.primary_input_collection]
-                and self.collections[self.primary_input_collection]['status'] in [CollectionStatus.Closed]):  # noqa: W503
+        if (not new_inputs and 'status' in self.collections[self._primary_input_collection]
+                and self.collections[self._primary_input_collection]['status'] in [CollectionStatus.Closed]):  # noqa: W503
             self.set_has_new_inputs(False)
         else:
             mapped_keys = mapped_input_output_maps.keys()
@@ -233,7 +237,7 @@ def get_new_input_output_maps(self, mapped_input_output_maps={}):
                 next_key = 1
             for ip in new_inputs:
                 out_ip = copy.deepcopy(ip)
-                out_ip['coll_id'] = self.collections[self.output_collections[0]]['coll_id']
+                out_ip['coll_id'] = self.collections[self._primary_output_collection]['coll_id']
                 new_input_output_maps[next_key] = {'inputs': [ip],
                                                    'outputs': [out_ip]}
                 next_key += 1
@@ -342,7 +346,7 @@ def generate_processing_script_sandbox(self, processing):
         script += 'base_sandbox="$(basename -- $sandbox)"\n'
         script += 'tar xzf $base_sandbox\n'
 
-        dataset = self.collections[self.primary_input_collection]
+        dataset = self.collections[self._primary_input_collection]
         script += 'rucio download %s:%s\n' % (dataset['scope'], dataset['name'])
         script += 'chmod +x %s\n' % str(self.executable)
         script += "echo '%s' '%s'\n" % (str(self.executable), str(arguments))
diff --git a/atlas/lib/idds/atlas/workflow/atlascondorwork.py b/atlas/lib/idds/atlas/workflow/atlascondorwork.py
index b1d70d81..93075486 100644
--- a/atlas/lib/idds/atlas/workflow/atlascondorwork.py
+++ b/atlas/lib/idds/atlas/workflow/atlascondorwork.py
@@ -18,7 +18,8 @@ class ATLASCondorWork(Work):
     def __init__(self, executable=None, arguments=None, parameters=None, setup=None,
                  work_type=None, work_tag='hpo', exec_type='local', sandbox=None, work_id=None,
-                 primary_input_collection=None, other_input_collections=None,
+                 primary_input_collection=None, other_input_collections=None, input_collections=None,
+                 primary_output_collection=None, other_output_collections=None,
                  output_collections=None, log_collections=None,
                  agent_attributes=None,
                  logger=None):
@@ -43,6 +44,9 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None,
                                              exec_type=exec_type, sandbox=sandbox, work_id=work_id,
                                              primary_input_collection=primary_input_collection,
                                              other_input_collections=other_input_collections,
+                                             primary_output_collection=primary_output_collection,
+                                             other_output_collections=other_output_collections,
+                                             input_collections=input_collections,
                                              output_collections=output_collections,
                                              log_collections=log_collections,
                                              agent_attributes=agent_attributes,
diff --git a/atlas/lib/idds/atlas/workflow/atlashpowork.py b/atlas/lib/idds/atlas/workflow/atlashpowork.py
index 11513b23..640b433c 100644
--- a/atlas/lib/idds/atlas/workflow/atlashpowork.py
+++ b/atlas/lib/idds/atlas/workflow/atlashpowork.py
@@ -6,7 +6,7 @@
 # http://www.apache.org/licenses/LICENSE-2.0OA
 #
 # Authors:
-# - Wen Guan, , 2020
+# - Wen Guan, , 2020 - 2022
 
 import copy
 import datetime
@@ -124,6 +124,8 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None,
         if agent_attributes:
             self.set_agent_attributes(agent_attributes)
 
+        self.logger = self.get_logger()
+
     def set_agent_attributes(self, attrs, req_attributes=None):
         self.agent_attributes = attrs
 
@@ -191,12 +193,13 @@ def poll_external_collection(self, coll):
             self.logger.error(traceback.format_exc())
             raise exceptions.IDDSException('%s: %s' % (str(ex), traceback.format_exc()))
 
-    def get_input_collections(self):
+    def get_input_collections(self, poll_externel=False):
         # return [self.primary_input_collection] + self.other_input_collections
-        colls = [self.primary_input_collection] + self.other_input_collections
+        colls = [self._primary_input_collection] + self._other_input_collections
         for coll_int_id in colls:
             coll = self.collections[coll_int_id]
-            coll = self.poll_external_collection(coll)
+            if poll_externel:
+                coll = self.poll_external_collection(coll)
             self.collections[coll_int_id] = coll
         return super(ATLASHPOWork, self).get_input_collections()
 
@@ -212,7 +215,7 @@ def get_input_contents(self, point_index=1):
             return []
 
         ret_files = []
-        coll = self.collections[self.primary_input_collection]
+        coll = self.collections[self._primary_input_collection]
 
         if self.max_points and (self.max_points - self.finished_points < self.num_points_per_iteration):
             self.points_to_generate = self.max_points - self.finished_points
@@ -273,46 +276,11 @@ def get_new_input_output_maps(self, mapped_input_output_maps={}):
         New inputs which are not yet mapped to outputs.
 
         :param mapped_input_output_maps: Inputs that are already mapped.
- """ - unfinished_mapped = self.get_unfinished_points(mapped_input_output_maps) - self.unfinished_points = unfinished_mapped - mapped_inputs = self.get_mapped_inputs(mapped_input_output_maps) - mapped_inputs_scope_name = [ip['scope'] + ":" + ip['name'] for ip in mapped_inputs] - mapped_keys = mapped_input_output_maps.keys() - if mapped_keys: - next_key = max(mapped_keys) + 1 - else: - next_key = 1 - - inputs = self.get_input_contents(point_index=next_key) - - new_inputs = [] - new_input_output_maps = {} - for ip in inputs: - ip_scope_name = ip['scope'] + ":" + ip['name'] - if ip_scope_name not in mapped_inputs_scope_name: - new_inputs.append(ip) - - # to avoid cheking new inputs if there are no new inputs anymore - if (not new_inputs and self.collections[self.primary_input_collection] - and self.collections[self.primary_input_collection].status in [CollectionStatus.Closed]): # noqa: W503 - self.set_has_new_inputs(False) - else: - for ip in new_inputs: - out_ip = copy.deepcopy(ip) - ip['status'] = ContentStatus.Available - ip['substatus'] = ContentStatus.Available - out_ip['coll_id'] = self.collections[self.output_collections[0]].coll_id - new_input_output_maps[next_key] = {'inputs': [ip], - 'outputs': [out_ip], - 'inputs_dependency': [], - 'logs': []} - next_key += 1 - - self.unfinished_points = self.unfinished_points + len(new_inputs) + :returns new_input_output_maps as dict. + """ - return new_input_output_maps + return {} def generate_points(self): active_processing = self.get_processing(None, without_creating=True) @@ -366,8 +334,8 @@ def get_processing(self, input_output_maps, without_creating=False): if self.active_processings: return self.processings[self.active_processings[0]] else: - # if not without_creating: - # return self.create_processing(input_output_maps) + if not without_creating: + return self.create_processing(input_output_maps) pass return None @@ -390,21 +358,6 @@ def get_status_statistics(self, registered_input_output_maps): self.status_statistics = status_statistics return status_statistics - """ - def syn_collection_status(self): - input_collections = self.get_input_collections() - output_collections = self.get_output_collections() - # log_collections = self.get_log_collections() - - for input_collection in input_collections: - input_collection['total_files'] = self.finished_points + self.unfinished_points - input_collection['processed_files'] = self.finished_points + self.unfinished_points - - for output_collection in output_collections: - output_collection['total_files'] = self.finished_points + self.unfinished_points - output_collection['processed_files'] = self.finished_points - """ - def syn_work_status(self, registered_input_output_maps, all_updates_flushed=True, output_statistics={}, to_release_input_contents=[]): super(ATLASHPOWork, self).syn_work_status(registered_input_output_maps) self.get_status_statistics(registered_input_output_maps) @@ -640,14 +593,18 @@ def submit_processing(self, processing): if proc.external_id: # if 'job_id' in processing['processing_metadata']: pass + return True, None, None else: job_id, errors = self.submit_condor_processing(processing) # processing['processing_metadata']['job_id'] = job_id # processing['processing_metadata']['errors'] = errors - proc.external_id = job_id if job_id: + proc.external_id = job_id proc.submitted_at = datetime.datetime.utcnow() - proc.errors = errors + return True, None, None + else: + proc.errors = errors + return False, None, errors def parse_processing_outputs(self, processing): request_id = 
processing['request_id'] @@ -678,7 +635,6 @@ def poll_processing(self, processing): proc = processing['processing_metadata']['processing'] job_status, job_err_msg = self.poll_condor_job_status(processing, proc.external_id) processing_outputs = None - reset_expired_at = False if job_status in [ProcessingStatus.Finished]: job_outputs, parser_errors = self.parse_processing_outputs(processing) if job_outputs: @@ -690,36 +646,9 @@ def poll_processing(self, processing): processing_err = parser_errors elif job_status in [ProcessingStatus.Failed]: processing_status = job_status - processing_err = job_err_msg - elif self.toexpire: - processing_status = ProcessingStatus.Expired - processing_err = "The processing is expired" - elif job_status in [ProcessingStatus.Cancelled]: - processing_status = job_status - processing_err = job_err_msg - elif self.tocancel: - self.cancelled_processings.append(proc.internal_id) - processing_status = ProcessingStatus.Cancelled - processing_outputs = None - processing_err = 'Cancelled' - elif self.tosuspend: - self.suspended_processings.append(proc.internal_id) - processing_status = ProcessingStatus.Suspended - processing_outputs = None - processing_err = 'Suspend' - elif self.toresume: - # self.old_processings.append(processing['processing_metadata']['internal_id']) - # self.active_processings.clear() - # self.active_processings.remove(processing['processing_metadata']['internal_id']) - processing['processing_metadata']['resuming_at'] = datetime.datetime.utcnow() - processing_status = ProcessingStatus.Running - reset_expired_at = True - processing_outputs = None - processing_err = None - else: processing_status = job_status processing_err = job_err_msg - return processing_status, processing_outputs, processing_err, reset_expired_at + return processing_status, processing_outputs, processing_err except Exception as ex: self.logger.error("processing_id %s exception: %s, %s" % (processing['processing_id'], str(ex), traceback.format_exc())) proc.retries += 1 @@ -727,25 +656,98 @@ def poll_processing(self, processing): processing_status = ProcessingStatus.Failed else: processing_status = ProcessingStatus.Running - return processing_status, None, None, False + return processing_status, None, None - def poll_processing_updates(self, processing, input_output_maps): - processing_status, processing_outputs, processing_err, reset_expired_at = self.poll_processing(processing) + def generate_new_input_output_maps(self, input_output_maps, points): + unfinished_mapped = self.get_unfinished_points(input_output_maps) + self.unfinished_points = unfinished_mapped - # processing_metadata = processing['processing_metadata'] - # if not processing_metadata: - # processing_metadata = {} - # processing_metadata['errors'] = processing_err - proc = processing['processing_metadata']['processing'] - proc.errors = processing_err + mapped_keys = input_output_maps.keys() + if mapped_keys: + next_key = max(mapped_keys) + 1 + else: + next_key = 1 - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': processing_status, - 'processing_metadata': processing['processing_metadata'], - 'output_metadata': processing_outputs}} + coll = self.collections[self._primary_input_collection] + new_input_output_maps = {} + loss = None + for point in points: + content = {'coll_id': coll.coll_id, + 'scope': coll.scope, + 'name': str(next_key), + 'bytes': 0, + 'adler32': None, + 'min_id': 0, + 'max_id': 0, + 'path': json.dumps((point, loss)), + 'content_type': ContentType.File, + 
'content_metadata': {'events': 0}} + out_content = copy.deepcopy(content) + content['status'] = ContentStatus.Available + content['substatus'] = ContentStatus.Available + out_content['coll_id'] = self.collections[self._primary_output_collection].coll_id + new_input_output_maps[next_key] = {'inputs': [content], + 'outputs': [out_content], + 'inputs_dependency': [], + 'logs': []} + next_key += 1 + return new_input_output_maps - if reset_expired_at: - update_processing['parameters']['expired_at'] = None - processing['expired_at'] = None + def poll_processing_updates(self, processing, input_output_maps, log_prefix=''): + proc = processing['processing_metadata']['processing'] + new_input_output_maps = {} updated_contents = [] - return update_processing, updated_contents, {} + if proc.external_id: + processing_status, processing_outputs, processing_err = self.poll_processing(processing) + + proc.errors = processing_err + + if processing_status in [ProcessingStatus.Finished]: + proc.external_id = None + if processing_outputs: + self.logger.info(log_prefix + "Processing finished with outputs") + points = processing_outputs + new_input_output_maps = self.generate_new_input_output_maps(input_output_maps, points) + proc.old_external_id.append(proc.external_id) + processing_status = ProcessingStatus.Running + else: + self.status = WorkStatus.Finished + self.logger.info(log_prefix + "Processing finished and output is empty (output: %s)" % str(processing_outputs)) + elif processing_status in [ProcessingStatus.Failed, ProcessingStatus.Cancelled]: + proc.external_id = None + proc.retries += 1 + if proc.retries <= 3: + self.logger.warn(log_prefix + "Processing terminated (status: %s, retries: %s), retries <=3, new status: %s" % (processing_status, + proc.retries, + ProcessingStatus.Running)) + processing_status = ProcessingStatus.Running + else: + self.status = WorkStatus.Failed + self.logger.warn(log_prefix + "Processing terminated (status: %s, retries: %s)" % (processing_status, proc.retries)) + else: + unfinished_mapped = self.get_unfinished_points(input_output_maps) + self.unfinished_points = unfinished_mapped + + if self.unfinished_points > 0: + processing_status = ProcessingStatus.Running + else: + self.logger.warn(log_prefix + "max_points: %s, finished_points: %s, unfinished_points: %s, num_points_per_iteration: %s" % (self.max_points, + self.finished_points, + self.unfinished_points, + self.num_points_per_iteration)) + if self.max_points and (self.max_points - self.finished_points < self.num_points_per_iteration): + self.points_to_generate = self.max_points - self.finished_points + if self.points_to_generate <= 0: + processing_status = ProcessingStatus.Finished + self.status = WorkStatus.Finished + else: + status, workload_id, error = self.submit_processing(processing) + processing_status = ProcessingStatus.Running + + return processing_status, updated_contents, new_input_output_maps, [], {} + + def abort_processing(self, processing, log_prefix=''): + self.logger.info(log_prefix + "abort processing") + + def resume_processing(self, processing, log_prefix=''): + self.logger.info(log_prefix + "resume processing") diff --git a/atlas/lib/idds/atlas/workflow/atlaspandawork.py b/atlas/lib/idds/atlas/workflow/atlaspandawork.py index bc856374..d1a5e697 100644 --- a/atlas/lib/idds/atlas/workflow/atlaspandawork.py +++ b/atlas/lib/idds/atlas/workflow/atlaspandawork.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2020 - 2021 +# - Wen Guan, , 2020 - 2022 try: @@ 
-24,8 +24,7 @@ from idds.common import exceptions from idds.common.constants import (TransformType, CollectionStatus, CollectionType, - ContentStatus, ContentType, - ProcessingStatus, WorkStatus) + ContentType, ProcessingStatus, WorkStatus) from idds.common.utils import extract_scope_atlas from idds.workflow.work import Work, Processing # from idds.workflow.workflow import Condition @@ -35,6 +34,8 @@ class ATLASPandaWork(Work): def __init__(self, task_parameters=None, work_tag='atlas', exec_type='panda', work_id=None, primary_input_collection=None, other_input_collections=None, + input_collections=None, + primary_output_collection=None, other_output_collections=None, output_collections=None, log_collections=None, logger=None, # dependency_map=None, task_name="", @@ -54,6 +55,9 @@ def __init__(self, task_parameters=None, work_id=work_id, primary_input_collection=primary_input_collection, other_input_collections=other_input_collections, + primary_output_collection=primary_output_collection, + other_output_collections=other_output_collections, + input_collections=input_collections, output_collections=output_collections, log_collections=log_collections, release_inputs_after_submitting=True, @@ -61,12 +65,19 @@ def __init__(self, task_parameters=None, self.panda_url = None self.panda_url_ssl = None self.panda_monitor = None + self.panda_auth = None + self.panda_auth_vo = None + self.panda_config_root = None + self.pandacache_url = None + self.panda_verify_host = None self.task_type = 'test' self.task_parameters = None self.parse_task_parameters(task_parameters) # self.logger.setLevel(logging.DEBUG) + self.logger = self.get_logger() + self.retry_number = 0 self.num_retries = num_retries @@ -98,30 +109,42 @@ def load_panda_urls(self): self.panda_url = None self.panda_url_ssl = None self.panda_monitor = None + self.pandacache_url = None + self.panda_verify_host = None + self.panda_auth = None + self.panda_auth_vo = None + self.panda_config_root = None if panda_config.has_section('panda'): - if panda_config.has_option('panda', 'panda_monitor_url'): + if 'PANDA_MONITOR_URL' not in os.environ and panda_config.has_option('panda', 'panda_monitor_url'): self.panda_monitor = panda_config.get('panda', 'panda_monitor_url') os.environ['PANDA_MONITOR_URL'] = self.panda_monitor # self.logger.debug("Panda monitor url: %s" % str(self.panda_monitor)) - if panda_config.has_option('panda', 'panda_url'): + if 'PANDA_URL' not in os.environ and panda_config.has_option('panda', 'panda_url'): self.panda_url = panda_config.get('panda', 'panda_url') os.environ['PANDA_URL'] = self.panda_url # self.logger.debug("Panda url: %s" % str(self.panda_url)) - if panda_config.has_option('panda', 'panda_url_ssl'): + if 'PANDACACHE_URL' not in os.environ and panda_config.has_option('panda', 'pandacache_url'): + self.pandacache_url = panda_config.get('panda', 'pandacache_url') + os.environ['PANDACACHE_URL'] = self.pandacache_url + # self.logger.debug("Pandacache url: %s" % str(self.pandacache_url)) + if 'PANDA_VERIFY_HOST' not in os.environ and panda_config.has_option('panda', 'panda_verify_host'): + self.panda_verify_host = panda_config.get('panda', 'panda_verify_host') + os.environ['PANDA_VERIFY_HOST'] = self.panda_verify_host + # self.logger.debug("Panda verify host: %s" % str(self.panda_verify_host)) + if 'PANDA_URL_SSL' not in os.environ and panda_config.has_option('panda', 'panda_url_ssl'): self.panda_url_ssl = panda_config.get('panda', 'panda_url_ssl') os.environ['PANDA_URL_SSL'] = self.panda_url_ssl # self.logger.debug("Panda 
url ssl: %s" % str(self.panda_url_ssl)) - - if not self.panda_monitor and 'PANDA_MONITOR_URL' in os.environ and os.environ['PANDA_MONITOR_URL']: - self.panda_monitor = os.environ['PANDA_MONITOR_URL'] - # self.logger.debug("Panda monitor url: %s" % str(self.panda_monitor)) - if not self.panda_url and 'PANDA_URL' in os.environ and os.environ['PANDA_URL']: - self.panda_url = os.environ['PANDA_URL'] - # self.logger.debug("Panda url: %s" % str(self.panda_url)) - if not self.panda_url_ssl and 'PANDA_URL_SSL' in os.environ and os.environ['PANDA_URL_SSL']: - self.panda_url_ssl = os.environ['PANDA_URL_SSL'] - # self.logger.debug("Panda url ssl: %s" % str(self.panda_url_ssl)) + if 'PANDA_AUTH' not in os.environ and panda_config.has_option('panda', 'panda_auth'): + self.panda_auth = panda_config.get('panda', 'panda_auth') + os.environ['PANDA_AUTH'] = self.panda_auth + if 'PANDA_AUTH_VO' not in os.environ and panda_config.has_option('panda', 'panda_auth_vo'): + self.panda_auth_vo = panda_config.get('panda', 'panda_auth_vo') + os.environ['PANDA_AUTH_VO'] = self.panda_auth_vo + if 'PANDA_CONFIG_ROOT' not in os.environ and panda_config.has_option('panda', 'panda_config_root'): + self.panda_config_root = panda_config.get('panda', 'panda_config_root') + os.environ['PANDA_CONFIG_ROOT'] = self.panda_config_root def set_agent_attributes(self, attrs, req_attributes=None): if self.class_name not in attrs or 'life_time' not in attrs[self.class_name] or int(attrs[self.class_name]['life_time']) <= 0: @@ -168,6 +191,17 @@ def parse_task_parameters(self, task_parameters): log_col = {'scope': scope, 'name': name} self.add_log_collections(log_col) + if not self.get_primary_output_collection(): + all_colls = self.get_collections() + if all_colls: + one_coll = all_colls[0] + output_coll_scope = one_coll.scope + else: + output_coll_scope = 'pseudo.scope' + name = 'pseudo_output.' 
+ datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S_%f") + str(random.randint(1, 1000)) + output_coll = {'scope': output_coll_scope, 'name': name, 'type': CollectionType.PseudoDataset} + self.set_primary_output_collection(output_coll) + if not self.get_primary_input_collection(): output_colls = self.get_output_collections() output_coll = output_colls[0] @@ -181,6 +215,72 @@ def parse_task_parameters(self, task_parameters): # raise exceptions.IDDSException('%s: %s' % (str(ex), traceback.format_exc())) self.add_errors(str(ex)) + def renew_parameter(self, parameter): + new_parameter = parameter + has_updates = True + len_idds = len('___idds___') + while has_updates: + if '___idds___' in parameter: + pos_start = parameter.find('___idds___') + attr = parameter[pos_start:] + attr = attr.replace("___idds___", "") + pos = attr.find("___") + if pos > -1: + attr = attr[:pos] + if attr: + idds_attr = "___idds___" + attr + "___" + if hasattr(self, attr): + has_updates = True + attr_value = getattr(self, attr) + new_parameter = new_parameter.replace(idds_attr, str(attr_value)) + parameter = parameter.replace(idds_attr, str(attr_value)) + else: + parameter = parameter[pos_start + len_idds:] + else: + has_updates = False + return new_parameter + + def renew_parameters_from_attributes(self): + if not self.task_parameters: + return + + try: + for key in self.task_parameters: + if self.task_parameters[key] and type(self.task_parameters[key]) in [str]: + self.task_parameters[key] = self.renew_parameter(self.task_parameters[key]) + + if 'taskName' in self.task_parameters: + self.task_name = self.task_parameters['taskName'] + self.task_name = self.renew_parameter(self.task_name) + self.task_parameters['taskName'] = self.task_name + self.set_work_name(self.task_name) + + if 'prodSourceLabel' in self.task_parameters: + self.task_type = self.task_parameters['prodSourceLabel'] + + if 'jobParameters' in self.task_parameters: + jobParameters = self.task_parameters['jobParameters'] + for jobP in jobParameters: + if type(jobP) in [dict]: + for key in jobP: + if jobP[key] and type(jobP[key]) in [str]: + jobP[key] = self.renew_parameter(jobP[key]) + + if 'log' in self.task_parameters: + log = self.task_parameters['log'] + for key in log: + if log[key] and type(log[key]) in [str]: + self.task_parameters['log'][key] = self.renew_parameter(log[key]) + + for coll_id in self.collections: + coll_name = self.collections[coll_id].name + self.collections[coll_id].name = self.renew_parameter(coll_name) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + # raise exceptions.IDDSException('%s: %s' % (str(ex), traceback.format_exc())) + self.add_errors(str(ex)) + def get_rucio_client(self): try: client = RucioClient() @@ -253,20 +353,21 @@ def poll_external_collection(self, coll): self.logger.error(traceback.format_exc()) raise exceptions.IDDSException('%s: %s' % (str(ex), traceback.format_exc())) - def get_input_collections(self): + def get_input_collections(self, poll_externel=False): """ *** Function called by Transformer agent. 
""" - colls = [self.primary_input_collection] + self.other_input_collections + colls = [self._primary_input_collection] + self._other_input_collections for coll_int_id in colls: coll = self.collections[coll_int_id] # if self.is_internal_collection(coll): # coll = self.poll_internal_collection(coll) # else: # coll = self.poll_external_collection(coll) - coll = self.poll_external_collection(coll) + if poll_externel: + coll = self.poll_external_collection(coll) self.collections[coll_int_id] = coll - return super(ATLASPandaWork, self).get_input_collections() + return super(ATLASPandaWork, self).get_input_collections(poll_externel=poll_externel) def get_input_contents(self): """ @@ -347,7 +448,7 @@ def get_new_input_output_maps(self, mapped_input_output_maps={}): new_inputs.append(ip) # to avoid cheking new inputs if there are no new inputs anymore - if (not new_inputs and self.collections[self.primary_input_collection].status in [CollectionStatus.Closed]): # noqa: W503 + if (not new_inputs and self.collections[self._primary_input_collection].status in [CollectionStatus.Closed]): # noqa: W503 self.set_has_new_inputs(False) else: pass @@ -397,10 +498,10 @@ def submit_panda_task(self, processing): proc = processing['processing_metadata']['processing'] task_param = proc.processing_metadata['task_param'] return_code = Client.insertTaskParams(task_param, verbose=True) - if return_code[0] == 0: + if return_code[0] == 0 and return_code[1][0] is True: try: task_id = int(return_code[1][1]) - return task_id + return task_id, None except Exception as ex: self.logger.warn("task id is not retruned: (%s) is not task id: %s" % (return_code[1][1], str(ex))) # jediTaskID=26468582 @@ -409,30 +510,38 @@ def submit_panda_task(self, processing): for part in parts: if 'jediTaskID=' in part: task_id = int(part.split("=")[1]) - return task_id + return task_id, None + else: + return None, return_code else: self.logger.warn("submit_panda_task, return_code: %s" % str(return_code)) + return None, return_code except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) # raise exceptions.AgentPluginError('%s: %s' % (str(ex), traceback.format_exc())) - return None + return None, str(ex) + return None, None def submit_processing(self, processing): """ *** Function called by Carrier agent. 
""" + errors = None proc = processing['processing_metadata']['processing'] if proc.workload_id: # if 'task_id' in processing['processing_metadata'] and processing['processing_metadata']['task_id']: pass + return True, proc.workload_id, None else: - task_id = self.submit_panda_task(processing) + task_id, errors = self.submit_panda_task(processing) # processing['processing_metadata']['task_id'] = task_id # processing['processing_metadata']['workload_id'] = task_id - proc.workload_id = task_id if task_id: proc.submitted_at = datetime.datetime.utcnow() + proc.workload_id = task_id + return True, task_id, errors + return False, None, errors def poll_panda_task_status(self, processing): if 'processing' in processing['processing_metadata']: @@ -491,7 +600,7 @@ def get_panda_task_id(self, processing): return task_id - def poll_panda_task(self, processing=None, input_output_maps=None): + def poll_panda_task(self, processing=None, input_output_maps=None, log_prefix=''): task_id = None try: from pandaclient import Client @@ -504,34 +613,28 @@ def poll_panda_task(self, processing=None, input_output_maps=None): if task_id: # ret_ids = Client.getPandaIDsWithTaskID(task_id, verbose=False) - task_info = Client.getJediTaskDetails({'jediTaskID': task_id}, True, True, verbose=False) - self.logger.info("poll_panda_task, task_info: %s" % str(task_info)) + task_info = Client.getJediTaskDetails({'jediTaskID': task_id}, True, True, verbose=True) + self.logger.info(log_prefix + "poll_panda_task, task_info: %s" % str(task_info)) if task_info[0] != 0: - self.logger.warn("poll_panda_task %s, error getting task status, task_info: %s" % (task_id, str(task_info))) + self.logger.warn(log_prefix + "poll_panda_task %s, error getting task status, task_info: %s" % (task_id, str(task_info))) return ProcessingStatus.Submitting, [], {} task_info = task_info[1] processing_status = self.get_processing_status_from_panda_status(task_info["status"]) - if processing_status in [ProcessingStatus.SubFinished]: - if self.retry_number < self.num_retries: - self.reactivate_processing(processing) - processing_status = ProcessingStatus.Submitted - self.retry_number += 1 - return processing_status, [], {} else: - return ProcessingStatus.Failed, [], {} + return ProcessingStatus.Running, [], {} except Exception as ex: msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) - self.logger.error(msg) - self.logger.error(ex) + self.logger.error(log_prefix + msg) + self.logger.error(log_prefix + ex) self.logger.error(traceback.format_exc()) # raise exceptions.IDDSException(msg) - return ProcessingStatus.Submitting, [], {} + return ProcessingStatus.Running, [], [] - def kill_processing(self, processing): + def kill_processing(self, processing, log_prefix=''): try: if processing: from pandaclient import Client @@ -540,11 +643,13 @@ def kill_processing(self, processing): # task_id = processing['processing_metadata']['task_id'] # Client.killTask(task_id) Client.finishTask(task_id, soft=False) + self.logger.info(log_prefix + "finishTask: %s" % task_id) except Exception as ex: - msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) - raise exceptions.IDDSException(msg) + msg = "Failed to kill the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) + # raise exceptions.IDDSException(msg) + self.logger.error(log_prefix + "Failed to finishTask: %s, %s" % (task_id, msg)) - def kill_processing_force(self, processing): + def 
kill_processing_force(self, processing, log_prefix=''): try: if processing: from pandaclient import Client @@ -553,11 +658,13 @@ def kill_processing_force(self, processing): # task_id = processing['processing_metadata']['task_id'] Client.killTask(task_id) # Client.finishTask(task_id, soft=True) + self.logger.info(log_prefix + "killTask: %s" % task_id) except Exception as ex: - msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) - raise exceptions.IDDSException(msg) + msg = "Failed to force kill the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) + # raise exceptions.IDDSException(msg) + self.logger.error(log_prefix + "Failed to finishTask: %s, %s" % (task_id, msg)) - def reactivate_processing(self, processing): + def reactivate_processing(self, processing, log_prefix=''): try: if processing: from pandaclient import Client @@ -567,119 +674,46 @@ def reactivate_processing(self, processing): # Client.retryTask(task_id) status, out = Client.retryTask(task_id, newParams={}) - self.logger.warn("Retry processing(%s) with task id(%s): %s, %s" % (processing['processing_id'], task_id, status, out)) + self.logger.warn(log_prefix + "Retry processing(%s) with task id(%s): %s, %s" % (processing['processing_id'], task_id, status, out)) # Client.reactivateTask(task_id) # Client.resumeTask(task_id) except Exception as ex: - msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) - raise exceptions.IDDSException(msg) + msg = log_prefix + "Failed to resume the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) + # raise exceptions.IDDSException(msg) + self.logger.error(msg) - def poll_processing_updates(self, processing, input_output_maps): + def abort_processing(self, processing, log_prefix=''): + self.kill_processing_force(processing, log_prefix=log_prefix) + + def resume_processing(self, processing, log_prefix=''): + self.reactivate_processing(processing, log_prefix=log_prefix) + + def poll_processing_updates(self, processing, input_output_maps, log_prefix=''): """ *** Function called by Carrier agent. 
""" updated_contents = [] - update_processing = {} - reset_expired_at = False - reactive_contents = [] # self.logger.debug("poll_processing_updates, input_output_maps: %s" % str(input_output_maps)) if processing: proc = processing['processing_metadata']['processing'] - if proc.tocancel: - self.logger.info("Cancelling processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.kill_processing_force(processing) - # self.kill_processing(processing) - proc.tocancel = False - proc.polling_retries = 0 - elif proc.tosuspend: - self.logger.info("Suspending processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.kill_processing_force(processing) - # self.kill_processing(processing) - proc.tosuspend = False - proc.polling_retries = 0 - elif proc.toresume: - self.logger.info("Resuming processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.reactivate_processing(processing) - reset_expired_at = True - proc.toresume = False - proc.polling_retries = 0 - proc.has_new_updates() - # reactive_contents = self.reactive_contents(input_output_maps) - # elif self.is_processing_expired(processing): - elif proc.toexpire: - self.logger.info("Expiring processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.kill_processing(processing) - proc.toexpire = False - proc.polling_retries = 0 - elif proc.tofinish or proc.toforcefinish: - self.logger.info("Finishing processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.kill_processing(processing) - proc.tofinish = False - proc.toforcefinish = False - proc.polling_retries = 0 - - processing_status, poll_updated_contents, new_input_output_maps = self.poll_panda_task(processing=processing, input_output_maps=input_output_maps) - self.logger.debug("poll_processing_updates, processing_status: %s" % str(processing_status)) - self.logger.debug("poll_processing_updates, update_contents: %s" % str(poll_updated_contents)) + + processing_status, poll_updated_contents, new_input_output_maps = self.poll_panda_task(processing=processing, + input_output_maps=input_output_maps, + log_prefix=log_prefix) + self.logger.debug(log_prefix + "poll_processing_updates, processing_status: %s" % str(processing_status)) + self.logger.debug(log_prefix + "poll_processing_updates, update_contents: %s" % str(poll_updated_contents)) if poll_updated_contents: proc.has_new_updates() for content in poll_updated_contents: updated_content = {'content_id': content['content_id'], + 'status': content['status'], 'substatus': content['substatus'], 'content_metadata': content['content_metadata']} updated_contents.append(updated_content) - content_substatus = {'finished': 0, 'unfinished': 0} - for map_id in input_output_maps: - outputs = input_output_maps[map_id]['outputs'] - for content in outputs: - if content.get('substatus', ContentStatus.New) != ContentStatus.Available: - content_substatus['unfinished'] += 1 - else: - content_substatus['finished'] += 1 - - if processing_status in [ProcessingStatus.SubFinished, ProcessingStatus.Finished, ProcessingStatus.Failed] and updated_contents: - self.logger.info("Processing %s is terminated, but there are still contents to be flushed. Waiting." % (proc.workload_id)) - # there are still polling contents, should not terminate the task. 
- processing_status = ProcessingStatus.Running - - if processing_status in [ProcessingStatus.SubFinished] and content_substatus['finished'] > 0 and content_substatus['unfinished'] == 0: - # found that a 'done' panda task has got a 'finished' status. Maybe in this case 'finished' is a transparent status. - if proc.polling_retries is None: - proc.polling_retries = 0 - - if processing_status in [ProcessingStatus.SubFinished, ProcessingStatus.Finished, ProcessingStatus.Failed]: - if proc.polling_retries is not None and proc.polling_retries < 3: - self.logger.info("processing %s polling_retries(%s) < 3, keep running" % (processing['processing_id'], proc.polling_retries)) - processing_status = ProcessingStatus.Running - proc.polling_retries += 1 - else: - proc.polling_retries = 0 - - if proc.in_operation_time(): - processing_status = ProcessingStatus.Running - - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': processing_status}} - if reset_expired_at: - processing['expired_at'] = None - update_processing['parameters']['expired_at'] = None - proc.polling_retries = 0 - # if (processing_status in [ProcessingStatus.SubFinished, ProcessingStatus.Finished, ProcessingStatus.Failed] - # or processing['status'] in [ProcessingStatus.Resuming]): # noqa W503 - # using polling_retries to poll it again when panda may update the status in a delay(when issuing retryTask, panda will not update it without any delay). - update_processing['parameters']['status'] = ProcessingStatus.Resuming - proc.status = update_processing['parameters']['status'] - - self.logger.debug("poll_processing_updates, task: %s, update_processing: %s" % - (proc.workload_id, str(update_processing))) - self.logger.debug("poll_processing_updates, task: %s, updated_contents: %s" % - (proc.workload_id, str(updated_contents))) - self.logger.debug("poll_processing_updates, task: %s, reactive_contents: %s" % - (proc.workload_id, str(reactive_contents))) - return update_processing, updated_contents + reactive_contents, new_input_output_maps + return processing_status, updated_contents, new_input_output_maps, [], {} def get_status_statistics(self, registered_input_output_maps): status_statistics = {} @@ -702,9 +736,9 @@ def syn_work_status(self, registered_input_output_maps, all_updates_flushed=True self.logger.debug("syn_work_status, self.active_processings: %s" % str(self.active_processings)) self.logger.debug("syn_work_status, self.has_new_inputs(): %s" % str(self.has_new_inputs)) self.logger.debug("syn_work_status, coll_metadata_is_open: %s" % - str(self.collections[self.primary_input_collection].coll_metadata['is_open'])) + str(self.collections[self._primary_input_collection].coll_metadata['is_open'])) self.logger.debug("syn_work_status, primary_input_collection_status: %s" % - str(self.collections[self.primary_input_collection].status)) + str(self.collections[self._primary_input_collection].status)) self.logger.debug("syn_work_status(%s): is_processings_terminated: %s" % (str(self.get_processing_ids()), str(self.is_processings_terminated()))) self.logger.debug("syn_work_status(%s): is_input_collections_closed: %s" % (str(self.get_processing_ids()), str(self.is_input_collections_closed()))) @@ -712,7 +746,8 @@ def syn_work_status(self, registered_input_output_maps, all_updates_flushed=True self.logger.debug("syn_work_status(%s): has_to_release_inputs: %s" % (str(self.get_processing_ids()), str(self.has_to_release_inputs()))) self.logger.debug("syn_work_status(%s): to_release_input_contents: %s" % 
(str(self.get_processing_ids()), str(to_release_input_contents))) - if self.is_processings_terminated() and self.is_input_collections_closed() and not self.has_new_inputs and not self.has_to_release_inputs() and not to_release_input_contents: + # if self.is_processings_terminated() and self.is_input_collections_closed() and not self.has_new_inputs and not self.has_to_release_inputs() and not to_release_input_contents: + if self.is_processings_terminated(): # if not self.is_all_outputs_flushed(registered_input_output_maps): if not all_updates_flushed: self.logger.warn("The work processings %s is terminated. but not all outputs are flushed. Wait to flush the outputs then finish the transform" % str(self.get_processing_ids())) diff --git a/atlas/lib/idds/atlas/workflow/atlasstageinwork.py b/atlas/lib/idds/atlas/workflow/atlasstageinwork.py index 00eb9e62..d700df60 100644 --- a/atlas/lib/idds/atlas/workflow/atlasstageinwork.py +++ b/atlas/lib/idds/atlas/workflow/atlasstageinwork.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2020 +# - Wen Guan, , 2020 - 2022 import copy import datetime @@ -21,13 +21,15 @@ from idds.common.constants import (TransformType, CollectionType, CollectionStatus, ContentStatus, ContentType, ProcessingStatus, WorkStatus) -from idds.workflow.work import Work, Processing +from idds.workflow.work import Processing +from idds.workflow.datawork import DataWork -class ATLASStageinWork(Work): +class ATLASStageinWork(DataWork): def __init__(self, executable=None, arguments=None, parameters=None, setup=None, work_tag='stagein', exec_type='local', sandbox=None, work_id=None, - primary_input_collection=None, other_input_collections=None, + primary_input_collection=None, other_input_collections=None, input_collections=None, + primary_output_collection=None, other_output_collections=None, output_collections=None, log_collections=None, agent_attributes=None, logger=None, @@ -57,6 +59,9 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, exec_type=exec_type, sandbox=sandbox, work_id=work_id, primary_input_collection=primary_input_collection, other_input_collections=other_input_collections, + primary_output_collection=primary_output_collection, + other_output_collections=other_output_collections, + input_collections=input_collections, output_collections=output_collections, log_collections=log_collections, agent_attributes=agent_attributes, @@ -123,12 +128,13 @@ def poll_external_collection(self, coll): # raise exceptions.IDDSException('%s: %s' % (str(ex), traceback.format_exc())) return coll - def get_input_collections(self): + def get_input_collections(self, poll_externel=False): # return [self.primary_input_collection] + self.other_input_collections - colls = [self.primary_input_collection] + self.other_input_collections + colls = [self._primary_input_collection] + self._other_input_collections for coll_int_id in colls: coll = self.collections[coll_int_id] - coll = self.poll_external_collection(coll) + if poll_externel: + coll = self.poll_external_collection(coll) self.collections[coll_int_id] = coll return super(ATLASStageinWork, self).get_input_collections() @@ -139,10 +145,10 @@ def get_input_contents(self): try: ret_files = [] rucio_client = self.get_rucio_client() - files = rucio_client.list_files(scope=self.collections[self.primary_input_collection].scope, - name=self.collections[self.primary_input_collection].name) + files = 
rucio_client.list_files(scope=self.collections[self._primary_input_collection].scope, + name=self.collections[self._primary_input_collection].name) for file in files: - ret_file = {'coll_id': self.collections[self.primary_input_collection].coll_id, + ret_file = {'coll_id': self.collections[self._primary_input_collection].coll_id, 'scope': file['scope'], 'name': file['name'], 'bytes': file['bytes'], @@ -189,7 +195,7 @@ def get_new_input_output_maps(self, mapped_input_output_maps={}): new_inputs.append(ip) # to avoid cheking new inputs if there are no new inputs anymore - if (not new_inputs and self.collections[self.primary_input_collection].status in [CollectionStatus.Closed]): # noqa: W503 + if (not new_inputs and self.collections[self._primary_input_collection].status in [CollectionStatus.Closed]): # noqa: W503 self.set_has_new_inputs(False) else: mapped_keys = mapped_input_output_maps.keys() @@ -199,10 +205,10 @@ def get_new_input_output_maps(self, mapped_input_output_maps={}): next_key = 1 for ip in new_inputs: self.num_mapped_inputs += 1 - out_ip = copy.deepcopy(ip) ip['status'] = ContentStatus.New ip['substatus'] = ContentStatus.New - out_ip['coll_id'] = self.collections[self.output_collections[0]].coll_id + out_ip = copy.deepcopy(ip) + out_ip['coll_id'] = self.collections[self._primary_output_collection].coll_id new_input_output_maps[next_key] = {'inputs': [ip], 'outputs': [out_ip], 'inputs_dependency': [], @@ -225,9 +231,9 @@ def create_processing(self, input_output_maps=[]): 'life_time': self.life_time, 'rule_id': self.rule_id} proc = Processing(processing_metadata=processing_metadata) - proc.external_id = self.rule_id - if self.rule_id: - proc.submitted_at = datetime.datetime.utcnow() + # proc.external_id = self.rule_id + # if self.rule_id: + # proc.submitted_at = datetime.datetime.utcnow() self.add_processing_to_processings(proc) self.active_processings.append(proc.internal_id) @@ -236,8 +242,8 @@ def create_processing(self, input_output_maps=[]): def create_rule(self, processing): try: rucio_client = self.get_rucio_client() - ds_did = {'scope': self.collections[self.primary_input_collection].scope, - 'name': self.collections[self.primary_input_collection].name} + ds_did = {'scope': self.collections[self._primary_input_collection].scope, + 'name': self.collections[self._primary_input_collection].name} rule_id = rucio_client.add_replication_rule(dids=[ds_did], copies=1, rse_expression=self.dest_rse, @@ -248,31 +254,39 @@ def create_rule(self, processing): ask_approval=False) if type(rule_id) in (list, tuple): rule_id = rule_id[0] - return rule_id + return rule_id, None except RucioDuplicateRule as ex: self.logger.warn(ex) - rules = rucio_client.list_did_rules(scope=self.collections[self.primary_input_collection].scope, - name=self.collections[self.primary_input_collection].name) + rules = rucio_client.list_did_rules(scope=self.collections[self._primary_input_collection].scope, + name=self.collections[self._primary_input_collection].name) for rule in rules: if rule['account'] == rucio_client.account and rule['rse_expression'] == self.dest_rse: - return rule['id'] + return rule['id'], None except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) # raise exceptions.AgentPluginError('%s: %s' % (str(ex), traceback.format_exc())) - return None + return None, str(ex) + return None, None def submit_processing(self, processing): proc = processing['processing_metadata']['processing'] - if proc.external_id: - # if 'rule_id' in processing['processing_meta']: - 
pass - else: - rule_id = self.create_rule(processing) - # processing['processing_metadata']['rule_id'] = rule_id - proc.external_id = rule_id - if rule_id: + + if not proc.external_id: + if self.rule_id: + proc.external_id = self.rule_id proc.submitted_at = datetime.datetime.utcnow() + return True, None, None + else: + rule_id, error = self.create_rule(processing) + if rule_id: + proc.external_id = rule_id + proc.submitted_at = datetime.datetime.utcnow() + return True, None, None + else: + return False, None, error + else: + return True, None, None def poll_rule(self, processing): try: @@ -313,79 +327,57 @@ def poll_processing(self, processing): return processing, 'notOk', {} - def poll_processing_updates(self, processing, input_output_maps): + def poll_processing_updates(self, processing, input_output_maps, log_prefix=''): try: processing, rule_state, rep_status = self.poll_processing(processing) + self.logger.info(log_prefix + "poll_processing rule_state: %s" % rule_state) updated_contents = [] - content_substatus = {'finished': 0, 'unfinished': 0} + updated_contents_full = [] for map_id in input_output_maps: inputs = input_output_maps[map_id]['inputs'] outputs = input_output_maps[map_id]['outputs'] for content in inputs + outputs: key = '%s:%s' % (content['scope'], content['name']) if key in rep_status: - if content['substatus'] != rep_status[key]: + if rule_state in ['OK'] and content['substatus'] != ContentStatus.Available: updated_content = {'content_id': content['content_id'], - 'substatus': rep_status[key]} + 'status': ContentStatus.Available, + 'substatus': ContentStatus.Available} updated_contents.append(updated_content) - content['substatus'] = rep_status[key] - if content['substatus'] == ContentStatus.Available: - content_substatus['finished'] += 1 - else: - content_substatus['unfinished'] += 1 - - update_processing = {} - if rule_state == 'OK' and content_substatus['finished'] > 0 and content_substatus['unfinished'] == 0: - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': ProcessingStatus.Finished}} - elif self.toexpire: - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': ProcessingStatus.Expired}} - elif self.tocancel: - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': ProcessingStatus.Cancelled}} - elif self.tosuspend: - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': ProcessingStatus.Suspended}} - elif self.toresume: - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': ProcessingStatus.Running}} - update_processing['parameters']['expired_at'] = None - processing['expired_at'] = None - proc = processing['processing_metadata']['processing'] - proc.has_new_updates() - elif self.tofinish: - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': ProcessingStatus.SubFinished}} - elif self.toforcefinish: - for map_id in input_output_maps: - inputs = input_output_maps[map_id]['inputs'] - outputs = input_output_maps[map_id]['outputs'] - for content in inputs + outputs: - if content['substatus'] not in [ContentStatus.Available, ContentStatus.FakeAvailable]: + content['status'] = ContentStatus.Available + content['substatus'] = ContentStatus.Available + updated_contents_full.append(content) + elif content['substatus'] != rep_status[key]: updated_content = {'content_id': content['content_id'], - 'substatus': 
ContentStatus.FakeAvailable} + 'status': ContentStatus.Available, + 'substatus': rep_status[key]} updated_contents.append(updated_content) - content['substatus'] = ContentStatus.FakeAvailable + content['status'] = rep_status[key] + content['substatus'] = rep_status[key] + updated_contents_full.append(content) - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': ProcessingStatus.Finished}} + processing_status = ProcessingStatus.Running + if rule_state in ['OK']: + processing_status = ProcessingStatus.Finished + # elif rule_state in ['STUCK', 'SUSPENDED']: + elif rule_state in ['SUSPENDED']: + processing_status = ProcessingStatus.SubFinished if updated_contents: proc = processing['processing_metadata']['processing'] proc.has_new_updates() - return update_processing, updated_contents, {} + return processing_status, updated_contents, {}, updated_contents_full, {} except exceptions.ProcessNotFound as ex: self.logger.warn("processing_id %s not not found: %s" % (processing['processing_id'], str(ex))) - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': ProcessingStatus.SubFinished}} - return update_processing, [], {} + processing_status = ProcessingStatus.Failed + return processing_status, [], {}, [], {} except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) - raise ex + + return ProcessingStatus.Running, [], {}, [], {} def get_status_statistics(self, registered_input_output_maps): status_statistics = {} diff --git a/atlas/lib/idds/atlas/workflowv2/atlashpowork.py b/atlas/lib/idds/atlas/workflowv2/atlashpowork.py index 1180f903..6a136530 100644 --- a/atlas/lib/idds/atlas/workflowv2/atlashpowork.py +++ b/atlas/lib/idds/atlas/workflowv2/atlashpowork.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2020 +# - Wen Guan, , 2020 - 2022 import copy import datetime @@ -124,6 +124,8 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, if agent_attributes: self.set_agent_attributes(agent_attributes) + self.logger = self.get_logger() + def set_agent_attributes(self, attrs, req_attributes=None): self.agent_attributes = attrs @@ -191,12 +193,13 @@ def poll_external_collection(self, coll): self.logger.error(traceback.format_exc()) raise exceptions.IDDSException('%s: %s' % (str(ex), traceback.format_exc())) - def get_input_collections(self): + def get_input_collections(self, poll_externel=False): # return [self.primary_input_collection] + self.other_input_collections colls = [self._primary_input_collection] + self._other_input_collections for coll_int_id in colls: coll = self.collections[coll_int_id] - coll = self.poll_external_collection(coll) + if poll_externel: + coll = self.poll_external_collection(coll) self.collections[coll_int_id] = coll return super(ATLASHPOWork, self).get_input_collections() @@ -273,46 +276,11 @@ def get_new_input_output_maps(self, mapped_input_output_maps={}): New inputs which are not yet mapped to outputs. :param mapped_input_output_maps: Inputs that are already mapped. 
- """ - unfinished_mapped = self.get_unfinished_points(mapped_input_output_maps) - self.unfinished_points = unfinished_mapped - mapped_inputs = self.get_mapped_inputs(mapped_input_output_maps) - mapped_inputs_scope_name = [ip['scope'] + ":" + ip['name'] for ip in mapped_inputs] - mapped_keys = mapped_input_output_maps.keys() - if mapped_keys: - next_key = max(mapped_keys) + 1 - else: - next_key = 1 - - inputs = self.get_input_contents(point_index=next_key) - - new_inputs = [] - new_input_output_maps = {} - for ip in inputs: - ip_scope_name = ip['scope'] + ":" + ip['name'] - if ip_scope_name not in mapped_inputs_scope_name: - new_inputs.append(ip) - - # to avoid cheking new inputs if there are no new inputs anymore - if (not new_inputs and self.collections[self._primary_input_collection] - and self.collections[self._primary_input_collection].status in [CollectionStatus.Closed]): # noqa: W503 - self.set_has_new_inputs(False) - else: - for ip in new_inputs: - out_ip = copy.deepcopy(ip) - ip['status'] = ContentStatus.Available - ip['substatus'] = ContentStatus.Available - out_ip['coll_id'] = self.collections[self._primary_output_collection].coll_id - new_input_output_maps[next_key] = {'inputs': [ip], - 'outputs': [out_ip], - 'inputs_dependency': [], - 'logs': []} - next_key += 1 - - self.unfinished_points = self.unfinished_points + len(new_inputs) + :returns new_input_output_maps as dict. + """ - return new_input_output_maps + return {} def generate_points(self): active_processing = self.get_processing(None, without_creating=True) @@ -366,8 +334,8 @@ def get_processing(self, input_output_maps, without_creating=False): if self.active_processings: return self.processings[self.active_processings[0]] else: - # if not without_creating: - # return self.create_processing(input_output_maps) + if not without_creating: + return self.create_processing(input_output_maps) pass return None @@ -390,21 +358,6 @@ def get_status_statistics(self, registered_input_output_maps): self.status_statistics = status_statistics return status_statistics - """ - def syn_collection_status(self): - input_collections = self.get_input_collections() - output_collections = self.get_output_collections() - # log_collections = self.get_log_collections() - - for input_collection in input_collections: - input_collection['total_files'] = self.finished_points + self.unfinished_points - input_collection['processed_files'] = self.finished_points + self.unfinished_points - - for output_collection in output_collections: - output_collection['total_files'] = self.finished_points + self.unfinished_points - output_collection['processed_files'] = self.finished_points - """ - def syn_work_status(self, registered_input_output_maps, all_updates_flushed=True, output_statistics={}, to_release_input_contents=[]): super(ATLASHPOWork, self).syn_work_status(registered_input_output_maps) self.get_status_statistics(registered_input_output_maps) @@ -640,14 +593,18 @@ def submit_processing(self, processing): if proc.external_id: # if 'job_id' in processing['processing_metadata']: pass + return True, None, None else: job_id, errors = self.submit_condor_processing(processing) # processing['processing_metadata']['job_id'] = job_id # processing['processing_metadata']['errors'] = errors - proc.external_id = job_id if job_id: + proc.external_id = job_id proc.submitted_at = datetime.datetime.utcnow() - proc.errors = errors + return True, None, None + else: + proc.errors = errors + return False, None, errors def parse_processing_outputs(self, processing): 
request_id = processing['request_id'] @@ -678,7 +635,6 @@ def poll_processing(self, processing): proc = processing['processing_metadata']['processing'] job_status, job_err_msg = self.poll_condor_job_status(processing, proc.external_id) processing_outputs = None - reset_expired_at = False if job_status in [ProcessingStatus.Finished]: job_outputs, parser_errors = self.parse_processing_outputs(processing) if job_outputs: @@ -690,36 +646,9 @@ def poll_processing(self, processing): processing_err = parser_errors elif job_status in [ProcessingStatus.Failed]: processing_status = job_status - processing_err = job_err_msg - elif self.toexpire: - processing_status = ProcessingStatus.Expired - processing_err = "The processing is expired" - elif job_status in [ProcessingStatus.Cancelled]: - processing_status = job_status - processing_err = job_err_msg - elif self.tocancel: - self.cancelled_processings.append(proc.internal_id) - processing_status = ProcessingStatus.Cancelled - processing_outputs = None - processing_err = 'Cancelled' - elif self.tosuspend: - self.suspended_processings.append(proc.internal_id) - processing_status = ProcessingStatus.Suspended - processing_outputs = None - processing_err = 'Suspend' - elif self.toresume: - # self.old_processings.append(processing['processing_metadata']['internal_id']) - # self.active_processings.clear() - # self.active_processings.remove(processing['processing_metadata']['internal_id']) - processing['processing_metadata']['resuming_at'] = datetime.datetime.utcnow() - processing_status = ProcessingStatus.Running - reset_expired_at = True - processing_outputs = None - processing_err = None - else: processing_status = job_status processing_err = job_err_msg - return processing_status, processing_outputs, processing_err, reset_expired_at + return processing_status, processing_outputs, processing_err except Exception as ex: self.logger.error("processing_id %s exception: %s, %s" % (processing['processing_id'], str(ex), traceback.format_exc())) proc.retries += 1 @@ -727,25 +656,98 @@ def poll_processing(self, processing): processing_status = ProcessingStatus.Failed else: processing_status = ProcessingStatus.Running - return processing_status, None, None, False + return processing_status, None, None - def poll_processing_updates(self, processing, input_output_maps): - processing_status, processing_outputs, processing_err, reset_expired_at = self.poll_processing(processing) + def generate_new_input_output_maps(self, input_output_maps, points): + unfinished_mapped = self.get_unfinished_points(input_output_maps) + self.unfinished_points = unfinished_mapped - # processing_metadata = processing['processing_metadata'] - # if not processing_metadata: - # processing_metadata = {} - # processing_metadata['errors'] = processing_err - proc = processing['processing_metadata']['processing'] - proc.errors = processing_err + mapped_keys = input_output_maps.keys() + if mapped_keys: + next_key = max(mapped_keys) + 1 + else: + next_key = 1 - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': processing_status, - 'processing_metadata': processing['processing_metadata'], - 'output_metadata': processing_outputs}} + coll = self.collections[self._primary_input_collection] + new_input_output_maps = {} + loss = None + for point in points: + content = {'coll_id': coll.coll_id, + 'scope': coll.scope, + 'name': str(next_key), + 'bytes': 0, + 'adler32': None, + 'min_id': 0, + 'max_id': 0, + 'path': json.dumps((point, loss)), + 'content_type': 
ContentType.File, + 'content_metadata': {'events': 0}} + out_content = copy.deepcopy(content) + content['status'] = ContentStatus.Available + content['substatus'] = ContentStatus.Available + out_content['coll_id'] = self.collections[self._primary_output_collection].coll_id + new_input_output_maps[next_key] = {'inputs': [content], + 'outputs': [out_content], + 'inputs_dependency': [], + 'logs': []} + next_key += 1 + return new_input_output_maps - if reset_expired_at: - update_processing['parameters']['expired_at'] = None - processing['expired_at'] = None + def poll_processing_updates(self, processing, input_output_maps, log_prefix=''): + proc = processing['processing_metadata']['processing'] + new_input_output_maps = {} updated_contents = [] - return update_processing, updated_contents, {} + if proc.external_id: + processing_status, processing_outputs, processing_err = self.poll_processing(processing) + + proc.errors = processing_err + + if processing_status in [ProcessingStatus.Finished]: + proc.external_id = None + if processing_outputs: + self.logger.info(log_prefix + "Processing finished with outputs") + points = processing_outputs + new_input_output_maps = self.generate_new_input_output_maps(input_output_maps, points) + proc.old_external_id.append(proc.external_id) + processing_status = ProcessingStatus.Running + else: + self.status = WorkStatus.Finished + self.logger.info(log_prefix + "Processing finished and output is empty (output: %s)" % str(processing_outputs)) + elif processing_status in [ProcessingStatus.Failed, ProcessingStatus.Cancelled]: + proc.external_id = None + proc.retries += 1 + if proc.retries <= 3: + self.logger.warn(log_prefix + "Processing terminated (status: %s, retries: %s), retries <=3, new status: %s" % (processing_status, + proc.retries, + ProcessingStatus.Running)) + processing_status = ProcessingStatus.Running + else: + self.status = WorkStatus.Failed + self.logger.warn(log_prefix + "Processing terminated (status: %s, retries: %s)" % (processing_status, proc.retries)) + else: + unfinished_mapped = self.get_unfinished_points(input_output_maps) + self.unfinished_points = unfinished_mapped + + if self.unfinished_points > 0: + processing_status = ProcessingStatus.Running + else: + self.logger.warn(log_prefix + "max_points: %s, finished_points: %s, unfinished_points: %s, num_points_per_iteration: %s" % (self.max_points, + self.finished_points, + self.unfinished_points, + self.num_points_per_iteration)) + if self.max_points and (self.max_points - self.finished_points < self.num_points_per_iteration): + self.points_to_generate = self.max_points - self.finished_points + if self.points_to_generate <= 0: + processing_status = ProcessingStatus.Finished + self.status = WorkStatus.Finished + else: + status, workload_id, error = self.submit_processing(processing) + processing_status = ProcessingStatus.Running + + return processing_status, updated_contents, new_input_output_maps, [], {} + + def abort_processing(self, processing, log_prefix=''): + self.logger.info(log_prefix + "abort processing") + + def resume_processing(self, processing, log_prefix=''): + self.logger.info(log_prefix + "resume processing") diff --git a/atlas/lib/idds/atlas/workflowv2/atlaslocalpandawork.py b/atlas/lib/idds/atlas/workflowv2/atlaslocalpandawork.py index 0ef14b6c..12d215ae 100644 --- a/atlas/lib/idds/atlas/workflowv2/atlaslocalpandawork.py +++ b/atlas/lib/idds/atlas/workflowv2/atlaslocalpandawork.py @@ -18,8 +18,7 @@ from rucio.common.exception import (CannotAuthenticate as 
RucioCannotAuthenticate) from idds.common import exceptions -from idds.common.constants import (ContentStatus, - ProcessingStatus, WorkStatus) +from idds.common.constants import (ProcessingStatus, WorkStatus) from idds.common.utils import extract_scope_atlas # from idds.workflowv2.work import Work, Processing # from idds.workflowv2.workflow import Condition @@ -293,7 +292,7 @@ def get_rucio_download_client(self): raise exceptions.IDDSException('%s: %s' % (str(error), traceback.format_exc())) return client - def poll_panda_task_output(self, processing=None, input_output_maps=None): + def poll_panda_task_output(self, processing=None, input_output_maps=None, log_prefix=''): task_id = None try: from pandaclient import Client @@ -308,25 +307,20 @@ def poll_panda_task_output(self, processing=None, input_output_maps=None): if task_id: # ret_ids = Client.getPandaIDsWithTaskID(task_id, verbose=False) task_info = Client.getJediTaskDetails({'jediTaskID': task_id}, True, True, verbose=False) - self.logger.info("poll_panda_task, task_info: %s" % str(task_info)) + self.logger.info(log_prefix + "poll_panda_task, task_info: %s" % str(task_info)) if task_info[0] != 0: - self.logger.warn("poll_panda_task %s, error getting task status, task_info: %s" % (task_id, str(task_info))) - return ProcessingStatus.Submitting, [], {} + self.logger.warn(log_prefix + "poll_panda_task %s, error getting task status, task_info: %s" % (task_id, str(task_info))) + return ProcessingStatus.Submitting, [], {}, {} task_info = task_info[1] processing_status = self.get_processing_status_from_panda_status(task_info["status"]) - if processing_status in [ProcessingStatus.SubFinished]: - if self.retry_number < self.num_retries: - self.reactivate_processing(processing) - processing_status = ProcessingStatus.Submitted - self.retry_number += 1 if processing_status in [ProcessingStatus.SubFinished, ProcessingStatus.Finished]: output_status, output_metadata = self.process_outputs(processing) if not output_status: err = "Failed to process processing(processing_id: %s, task_id: %s) outputs" % (processing['processing_id'], task_id) - self.logger.error(err) + self.logger.error(log_prefix + err) self.add_errors(err) processing_status = ProcessingStatus.Failed @@ -335,121 +329,42 @@ def poll_panda_task_output(self, processing=None, input_output_maps=None): return ProcessingStatus.Failed, [], {}, output_metadata except Exception as ex: msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) - self.logger.error(msg) - self.logger.error(ex) + self.logger.error(log_prefix + msg) + self.logger.error(log_prefix + ex) self.logger.error(traceback.format_exc()) # raise exceptions.IDDSException(msg) return ProcessingStatus.Submitting, [], {}, {} - def poll_processing_updates(self, processing, input_output_maps): + def poll_processing_updates(self, processing, input_output_maps, log_prefix=''): """ *** Function called by Carrier agent. 
""" updated_contents = [] - update_processing = {} - reset_expired_at = False - reactive_contents = [] + update_contents_full = [] + parameters = {} # self.logger.debug("poll_processing_updates, input_output_maps: %s" % str(input_output_maps)) if processing: proc = processing['processing_metadata']['processing'] - if proc.tocancel: - self.logger.info("Cancelling processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.kill_processing_force(processing) - # self.kill_processing(processing) - proc.tocancel = False - proc.polling_retries = 0 - elif proc.tosuspend: - self.logger.info("Suspending processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.kill_processing_force(processing) - # self.kill_processing(processing) - proc.tosuspend = False - proc.polling_retries = 0 - elif proc.toresume: - self.logger.info("Resuming processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.reactivate_processing(processing) - reset_expired_at = True - proc.toresume = False - proc.polling_retries = 0 - proc.has_new_updates() - # reactive_contents = self.reactive_contents(input_output_maps) - # elif self.is_processing_expired(processing): - elif proc.toexpire: - self.logger.info("Expiring processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.kill_processing(processing) - proc.toexpire = False - proc.polling_retries = 0 - elif proc.tofinish or proc.toforcefinish: - self.logger.info("Finishing processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.kill_processing(processing) - proc.tofinish = False - proc.toforcefinish = False - proc.polling_retries = 0 - - processing_status, poll_updated_contents, new_input_output_maps, output_metadata = self.poll_panda_task_output(processing=processing, input_output_maps=input_output_maps) - self.logger.debug("poll_processing_updates, processing_status: %s" % str(processing_status)) - self.logger.debug("poll_processing_updates, update_contents: %s" % str(poll_updated_contents)) - self.logger.debug("poll_processing_updates, output_metadata: %s" % str(output_metadata)) + + processing_status, poll_updated_contents, new_input_output_maps, output_metadata = self.poll_panda_task_output(processing=processing, + input_output_maps=input_output_maps, + log_prefix=log_prefix) + self.logger.debug(log_prefix + "poll_processing_updates, output_metadata: %s" % str(output_metadata)) if poll_updated_contents: proc.has_new_updates() for content in poll_updated_contents: updated_content = {'content_id': content['content_id'], + 'status': content['status'], 'substatus': content['substatus'], 'content_metadata': content['content_metadata']} updated_contents.append(updated_content) - content_substatus = {'finished': 0, 'unfinished': 0} - for map_id in input_output_maps: - outputs = input_output_maps[map_id]['outputs'] - for content in outputs: - if content.get('substatus', ContentStatus.New) != ContentStatus.Available: - content_substatus['unfinished'] += 1 - else: - content_substatus['finished'] += 1 - - if processing_status in [ProcessingStatus.SubFinished, ProcessingStatus.Finished, ProcessingStatus.Failed] and updated_contents: - self.logger.info("Processing %s is terminated, but there are still contents to be flushed. Waiting." % (proc.workload_id)) - # there are still polling contents, should not terminate the task. 
- processing_status = ProcessingStatus.Running - - if processing_status in [ProcessingStatus.SubFinished] and content_substatus['finished'] > 0 and content_substatus['unfinished'] == 0: - # found that a 'done' panda task has got a 'finished' status. Maybe in this case 'finished' is a transparent status. - if proc.polling_retries is None: - proc.polling_retries = 0 - - if processing_status in [ProcessingStatus.SubFinished, ProcessingStatus.Finished, ProcessingStatus.Failed]: - if proc.polling_retries is not None and proc.polling_retries < 3: - self.logger.info("processing %s polling_retries(%s) < 3, keep running" % (processing['processing_id'], proc.polling_retries)) - processing_status = ProcessingStatus.Running - proc.polling_retries += 1 - else: - proc.polling_retries = 0 - - if proc.in_operation_time(): - processing_status = ProcessingStatus.Running - - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': processing_status, - 'output_metadata': output_metadata}} - - if reset_expired_at: - processing['expired_at'] = None - update_processing['parameters']['expired_at'] = None - proc.polling_retries = 0 - # if (processing_status in [ProcessingStatus.SubFinished, ProcessingStatus.Finished, ProcessingStatus.Failed] - # or processing['status'] in [ProcessingStatus.Resuming]): # noqa W503 - # using polling_retries to poll it again when panda may update the status in a delay(when issuing retryTask, panda will not update it without any delay). - update_processing['parameters']['status'] = ProcessingStatus.Resuming - proc.status = update_processing['parameters']['status'] - - self.logger.debug("poll_processing_updates, task: %s, update_processing: %s" % - (proc.workload_id, str(update_processing))) - self.logger.debug("poll_processing_updates, task: %s, updated_contents: %s" % - (proc.workload_id, str(updated_contents))) - self.logger.debug("poll_processing_updates, task: %s, reactive_contents: %s" % - (proc.workload_id, str(reactive_contents))) - return update_processing, updated_contents + reactive_contents, new_input_output_maps + if output_metadata: + parameters = {'output_metadata': output_metadata} + + return processing_status, updated_contents, new_input_output_maps, update_contents_full, parameters def syn_work_status(self, registered_input_output_maps, all_updates_flushed=True, output_statistics={}, to_release_input_contents=[]): super(ATLASLocalPandaWork, self).syn_work_status(registered_input_output_maps, all_updates_flushed, output_statistics, to_release_input_contents) @@ -469,7 +384,8 @@ def syn_work_status(self, registered_input_output_maps, all_updates_flushed=True self.logger.debug("syn_work_status(%s): has_to_release_inputs: %s" % (str(self.get_processing_ids()), str(self.has_to_release_inputs()))) self.logger.debug("syn_work_status(%s): to_release_input_contents: %s" % (str(self.get_processing_ids()), str(to_release_input_contents))) - if self.is_processings_terminated() and self.is_input_collections_closed() and not self.has_new_inputs and not self.has_to_release_inputs() and not to_release_input_contents: + # if self.is_processings_terminated() and self.is_input_collections_closed() and not self.has_new_inputs and not self.has_to_release_inputs() and not to_release_input_contents: + if self.is_processings_terminated(): # if not self.is_all_outputs_flushed(registered_input_output_maps): if not all_updates_flushed: self.logger.warn("The work processings %s is terminated. but not all outputs are flushed. 
Wait to flush the outputs then finish the transform" % str(self.get_processing_ids())) diff --git a/atlas/lib/idds/atlas/workflowv2/atlaspandawork.py b/atlas/lib/idds/atlas/workflowv2/atlaspandawork.py index fc27c68c..c01a2bd3 100644 --- a/atlas/lib/idds/atlas/workflowv2/atlaspandawork.py +++ b/atlas/lib/idds/atlas/workflowv2/atlaspandawork.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2020 - 2021 +# - Wen Guan, , 2020 - 2022 try: @@ -24,8 +24,7 @@ from idds.common import exceptions from idds.common.constants import (TransformType, CollectionStatus, CollectionType, - ContentStatus, ContentType, - ProcessingStatus, WorkStatus) + ContentType, ProcessingStatus, WorkStatus) from idds.common.utils import extract_scope_atlas from idds.workflowv2.work import Work, Processing # from idds.workflowv2.workflow import Condition @@ -77,6 +76,8 @@ def __init__(self, task_parameters=None, self.parse_task_parameters(task_parameters) # self.logger.setLevel(logging.DEBUG) + self.logger = self.get_logger() + self.retry_number = 0 self.num_retries = num_retries @@ -216,18 +217,27 @@ def parse_task_parameters(self, task_parameters): def renew_parameter(self, parameter): new_parameter = parameter - if '___idds___' in parameter: - pos_start = parameter.find('___idds___') - attr = parameter[pos_start:] - attr = attr.replace("___idds___", "") - pos = attr.find("___") - if pos > -1: - attr = attr[:pos] - if attr: - idds_attr = "___idds___" + attr + "___" - if hasattr(self, attr): - attr_value = getattr(self, attr) - new_parameter = parameter.replace(idds_attr, str(attr_value)) + has_updates = True + len_idds = len('___idds___') + while has_updates: + if '___idds___' in parameter: + pos_start = parameter.find('___idds___') + attr = parameter[pos_start:] + attr = attr.replace("___idds___", "") + pos = attr.find("___") + if pos > -1: + attr = attr[:pos] + if attr: + idds_attr = "___idds___" + attr + "___" + if hasattr(self, attr): + has_updates = True + attr_value = getattr(self, attr) + new_parameter = new_parameter.replace(idds_attr, str(attr_value)) + parameter = parameter.replace(idds_attr, str(attr_value)) + else: + parameter = parameter[pos_start + len_idds:] + else: + has_updates = False return new_parameter def renew_parameters_from_attributes(self): @@ -343,7 +353,7 @@ def poll_external_collection(self, coll): self.logger.error(traceback.format_exc()) raise exceptions.IDDSException('%s: %s' % (str(ex), traceback.format_exc())) - def get_input_collections(self): + def get_input_collections(self, poll_externel=False): """ *** Function called by Transformer agent. 
""" @@ -354,9 +364,10 @@ def get_input_collections(self): # coll = self.poll_internal_collection(coll) # else: # coll = self.poll_external_collection(coll) - coll = self.poll_external_collection(coll) + if poll_externel: + coll = self.poll_external_collection(coll) self.collections[coll_int_id] = coll - return super(ATLASPandaWork, self).get_input_collections() + return super(ATLASPandaWork, self).get_input_collections(poll_externel=poll_externel) def get_input_contents(self): """ @@ -487,10 +498,10 @@ def submit_panda_task(self, processing): proc = processing['processing_metadata']['processing'] task_param = proc.processing_metadata['task_param'] return_code = Client.insertTaskParams(task_param, verbose=True) - if return_code[0] == 0: + if return_code[0] == 0 and return_code[1][0] is True: try: task_id = int(return_code[1][1]) - return task_id + return task_id, None except Exception as ex: self.logger.warn("task id is not retruned: (%s) is not task id: %s" % (return_code[1][1], str(ex))) # jediTaskID=26468582 @@ -499,30 +510,38 @@ def submit_panda_task(self, processing): for part in parts: if 'jediTaskID=' in part: task_id = int(part.split("=")[1]) - return task_id + return task_id, None + else: + return None, return_code else: self.logger.warn("submit_panda_task, return_code: %s" % str(return_code)) + return None, return_code except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) # raise exceptions.AgentPluginError('%s: %s' % (str(ex), traceback.format_exc())) - return None + return None, str(ex) + return None, None def submit_processing(self, processing): """ *** Function called by Carrier agent. """ + errors = None proc = processing['processing_metadata']['processing'] if proc.workload_id: # if 'task_id' in processing['processing_metadata'] and processing['processing_metadata']['task_id']: pass + return True, proc.workload_id, None else: - task_id = self.submit_panda_task(processing) + task_id, errors = self.submit_panda_task(processing) # processing['processing_metadata']['task_id'] = task_id # processing['processing_metadata']['workload_id'] = task_id - proc.workload_id = task_id if task_id: proc.submitted_at = datetime.datetime.utcnow() + proc.workload_id = task_id + return True, task_id, errors + return False, None, errors def poll_panda_task_status(self, processing): if 'processing' in processing['processing_metadata']: @@ -581,7 +600,7 @@ def get_panda_task_id(self, processing): return task_id - def poll_panda_task(self, processing=None, input_output_maps=None): + def poll_panda_task(self, processing=None, input_output_maps=None, log_prefix=''): task_id = None try: from pandaclient import Client @@ -594,34 +613,28 @@ def poll_panda_task(self, processing=None, input_output_maps=None): if task_id: # ret_ids = Client.getPandaIDsWithTaskID(task_id, verbose=False) - task_info = Client.getJediTaskDetails({'jediTaskID': task_id}, True, True, verbose=False) - self.logger.info("poll_panda_task, task_info: %s" % str(task_info)) + task_info = Client.getJediTaskDetails({'jediTaskID': task_id}, True, True, verbose=True) + self.logger.info(log_prefix + "poll_panda_task, task_info: %s" % str(task_info)) if task_info[0] != 0: - self.logger.warn("poll_panda_task %s, error getting task status, task_info: %s" % (task_id, str(task_info))) + self.logger.warn(log_prefix + "poll_panda_task %s, error getting task status, task_info: %s" % (task_id, str(task_info))) return ProcessingStatus.Submitting, [], {} task_info = task_info[1] processing_status = 
self.get_processing_status_from_panda_status(task_info["status"]) - if processing_status in [ProcessingStatus.SubFinished]: - if self.retry_number < self.num_retries: - self.reactivate_processing(processing) - processing_status = ProcessingStatus.Submitted - self.retry_number += 1 - return processing_status, [], {} else: - return ProcessingStatus.Failed, [], {} + return ProcessingStatus.Running, [], {} except Exception as ex: msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) - self.logger.error(msg) - self.logger.error(ex) + self.logger.error(log_prefix + msg) + self.logger.error(log_prefix + ex) self.logger.error(traceback.format_exc()) # raise exceptions.IDDSException(msg) - return ProcessingStatus.Submitting, [], {} + return ProcessingStatus.Running, [], [] - def kill_processing(self, processing): + def kill_processing(self, processing, log_prefix=''): try: if processing: from pandaclient import Client @@ -630,11 +643,13 @@ def kill_processing(self, processing): # task_id = processing['processing_metadata']['task_id'] # Client.killTask(task_id) Client.finishTask(task_id, soft=False) + self.logger.info(log_prefix + "finishTask: %s" % task_id) except Exception as ex: - msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) - raise exceptions.IDDSException(msg) + msg = "Failed to kill the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) + # raise exceptions.IDDSException(msg) + self.logger.error(log_prefix + "Failed to finishTask: %s, %s" % (task_id, msg)) - def kill_processing_force(self, processing): + def kill_processing_force(self, processing, log_prefix=''): try: if processing: from pandaclient import Client @@ -643,11 +658,13 @@ def kill_processing_force(self, processing): # task_id = processing['processing_metadata']['task_id'] Client.killTask(task_id) # Client.finishTask(task_id, soft=True) + self.logger.info(log_prefix + "killTask: %s" % task_id) except Exception as ex: - msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) - raise exceptions.IDDSException(msg) + msg = "Failed to force kill the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) + # raise exceptions.IDDSException(msg) + self.logger.error(log_prefix + "Failed to finishTask: %s, %s" % (task_id, msg)) - def reactivate_processing(self, processing): + def reactivate_processing(self, processing, log_prefix=''): try: if processing: from pandaclient import Client @@ -657,119 +674,46 @@ def reactivate_processing(self, processing): # Client.retryTask(task_id) status, out = Client.retryTask(task_id, newParams={}) - self.logger.warn("Retry processing(%s) with task id(%s): %s, %s" % (processing['processing_id'], task_id, status, out)) + self.logger.warn(log_prefix + "Retry processing(%s) with task id(%s): %s, %s" % (processing['processing_id'], task_id, status, out)) # Client.reactivateTask(task_id) # Client.resumeTask(task_id) except Exception as ex: - msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) - raise exceptions.IDDSException(msg) + msg = log_prefix + "Failed to resume the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) + # raise exceptions.IDDSException(msg) + self.logger.error(msg) + + def abort_processing(self, processing, log_prefix=''): + self.kill_processing_force(processing, log_prefix=log_prefix) - def poll_processing_updates(self, processing, 
input_output_maps): + def resume_processing(self, processing, log_prefix=''): + self.reactivate_processing(processing, log_prefix=log_prefix) + + def poll_processing_updates(self, processing, input_output_maps, log_prefix=''): """ *** Function called by Carrier agent. """ updated_contents = [] - update_processing = {} - reset_expired_at = False - reactive_contents = [] # self.logger.debug("poll_processing_updates, input_output_maps: %s" % str(input_output_maps)) if processing: proc = processing['processing_metadata']['processing'] - if proc.tocancel: - self.logger.info("Cancelling processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.kill_processing_force(processing) - # self.kill_processing(processing) - proc.tocancel = False - proc.polling_retries = 0 - elif proc.tosuspend: - self.logger.info("Suspending processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.kill_processing_force(processing) - # self.kill_processing(processing) - proc.tosuspend = False - proc.polling_retries = 0 - elif proc.toresume: - self.logger.info("Resuming processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.reactivate_processing(processing) - reset_expired_at = True - proc.toresume = False - proc.polling_retries = 0 - proc.has_new_updates() - # reactive_contents = self.reactive_contents(input_output_maps) - # elif self.is_processing_expired(processing): - elif proc.toexpire: - self.logger.info("Expiring processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.kill_processing(processing) - proc.toexpire = False - proc.polling_retries = 0 - elif proc.tofinish or proc.toforcefinish: - self.logger.info("Finishing processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.kill_processing(processing) - proc.tofinish = False - proc.toforcefinish = False - proc.polling_retries = 0 - - processing_status, poll_updated_contents, new_input_output_maps = self.poll_panda_task(processing=processing, input_output_maps=input_output_maps) - self.logger.debug("poll_processing_updates, processing_status: %s" % str(processing_status)) - self.logger.debug("poll_processing_updates, update_contents: %s" % str(poll_updated_contents)) + + processing_status, poll_updated_contents, new_input_output_maps = self.poll_panda_task(processing=processing, + input_output_maps=input_output_maps, + log_prefix=log_prefix) + self.logger.debug(log_prefix + "poll_processing_updates, processing_status: %s" % str(processing_status)) + self.logger.debug(log_prefix + "poll_processing_updates, update_contents: %s" % str(poll_updated_contents)) if poll_updated_contents: proc.has_new_updates() for content in poll_updated_contents: updated_content = {'content_id': content['content_id'], + 'status': content['status'], 'substatus': content['substatus'], 'content_metadata': content['content_metadata']} updated_contents.append(updated_content) - content_substatus = {'finished': 0, 'unfinished': 0} - for map_id in input_output_maps: - outputs = input_output_maps[map_id]['outputs'] - for content in outputs: - if content.get('substatus', ContentStatus.New) != ContentStatus.Available: - content_substatus['unfinished'] += 1 - else: - content_substatus['finished'] += 1 - - if processing_status in [ProcessingStatus.SubFinished, ProcessingStatus.Finished, ProcessingStatus.Failed] and updated_contents: - 
self.logger.info("Processing %s is terminated, but there are still contents to be flushed. Waiting." % (proc.workload_id)) - # there are still polling contents, should not terminate the task. - processing_status = ProcessingStatus.Running - - if processing_status in [ProcessingStatus.SubFinished] and content_substatus['finished'] > 0 and content_substatus['unfinished'] == 0: - # found that a 'done' panda task has got a 'finished' status. Maybe in this case 'finished' is a transparent status. - if proc.polling_retries is None: - proc.polling_retries = 0 - - if processing_status in [ProcessingStatus.SubFinished, ProcessingStatus.Finished, ProcessingStatus.Failed]: - if proc.polling_retries is not None and proc.polling_retries < 3: - self.logger.info("processing %s polling_retries(%s) < 3, keep running" % (processing['processing_id'], proc.polling_retries)) - processing_status = ProcessingStatus.Running - proc.polling_retries += 1 - else: - proc.polling_retries = 0 - - if proc.in_operation_time(): - processing_status = ProcessingStatus.Running - - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': processing_status}} - if reset_expired_at: - processing['expired_at'] = None - update_processing['parameters']['expired_at'] = None - proc.polling_retries = 0 - # if (processing_status in [ProcessingStatus.SubFinished, ProcessingStatus.Finished, ProcessingStatus.Failed] - # or processing['status'] in [ProcessingStatus.Resuming]): # noqa W503 - # using polling_retries to poll it again when panda may update the status in a delay(when issuing retryTask, panda will not update it without any delay). - update_processing['parameters']['status'] = ProcessingStatus.Resuming - proc.status = update_processing['parameters']['status'] - - self.logger.debug("poll_processing_updates, task: %s, update_processing: %s" % - (proc.workload_id, str(update_processing))) - self.logger.debug("poll_processing_updates, task: %s, updated_contents: %s" % - (proc.workload_id, str(updated_contents))) - self.logger.debug("poll_processing_updates, task: %s, reactive_contents: %s" % - (proc.workload_id, str(reactive_contents))) - return update_processing, updated_contents + reactive_contents, new_input_output_maps + return processing_status, updated_contents, new_input_output_maps, [], {} def get_status_statistics(self, registered_input_output_maps): status_statistics = {} diff --git a/atlas/lib/idds/atlas/workflowv2/atlasstageinwork.py b/atlas/lib/idds/atlas/workflowv2/atlasstageinwork.py index 89ef1b8b..d56917fd 100644 --- a/atlas/lib/idds/atlas/workflowv2/atlasstageinwork.py +++ b/atlas/lib/idds/atlas/workflowv2/atlasstageinwork.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2020 +# - Wen Guan, , 2020 - 2022 import copy import datetime @@ -21,10 +21,11 @@ from idds.common.constants import (TransformType, CollectionType, CollectionStatus, ContentStatus, ContentType, ProcessingStatus, WorkStatus) -from idds.workflowv2.work import Work, Processing +from idds.workflowv2.work import Processing +from idds.workflowv2.datawork import DataWork -class ATLASStageinWork(Work): +class ATLASStageinWork(DataWork): def __init__(self, executable=None, arguments=None, parameters=None, setup=None, work_tag='stagein', exec_type='local', sandbox=None, work_id=None, primary_input_collection=None, other_input_collections=None, input_collections=None, @@ -127,12 +128,13 @@ def poll_external_collection(self, coll): # raise exceptions.IDDSException('%s: %s' % 
(str(ex), traceback.format_exc())) return coll - def get_input_collections(self): + def get_input_collections(self, poll_externel=False): # return [self.primary_input_collection] + self.other_input_collections colls = [self._primary_input_collection] + self._other_input_collections for coll_int_id in colls: coll = self.collections[coll_int_id] - coll = self.poll_external_collection(coll) + if poll_externel: + coll = self.poll_external_collection(coll) self.collections[coll_int_id] = coll return super(ATLASStageinWork, self).get_input_collections() @@ -203,9 +205,9 @@ def get_new_input_output_maps(self, mapped_input_output_maps={}): next_key = 1 for ip in new_inputs: self.num_mapped_inputs += 1 - out_ip = copy.deepcopy(ip) ip['status'] = ContentStatus.New ip['substatus'] = ContentStatus.New + out_ip = copy.deepcopy(ip) out_ip['coll_id'] = self.collections[self._primary_output_collection].coll_id new_input_output_maps[next_key] = {'inputs': [ip], 'outputs': [out_ip], @@ -229,9 +231,9 @@ def create_processing(self, input_output_maps=[]): 'life_time': self.life_time, 'rule_id': self.rule_id} proc = Processing(processing_metadata=processing_metadata) - proc.external_id = self.rule_id - if self.rule_id: - proc.submitted_at = datetime.datetime.utcnow() + # proc.external_id = self.rule_id + # if self.rule_id: + # proc.submitted_at = datetime.datetime.utcnow() self.add_processing_to_processings(proc) self.active_processings.append(proc.internal_id) @@ -252,31 +254,39 @@ def create_rule(self, processing): ask_approval=False) if type(rule_id) in (list, tuple): rule_id = rule_id[0] - return rule_id + return rule_id, None except RucioDuplicateRule as ex: self.logger.warn(ex) rules = rucio_client.list_did_rules(scope=self.collections[self._primary_input_collection].scope, name=self.collections[self._primary_input_collection].name) for rule in rules: if rule['account'] == rucio_client.account and rule['rse_expression'] == self.dest_rse: - return rule['id'] + return rule['id'], None except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) # raise exceptions.AgentPluginError('%s: %s' % (str(ex), traceback.format_exc())) - return None + return None, str(ex) + return None, None def submit_processing(self, processing): proc = processing['processing_metadata']['processing'] - if proc.external_id: - # if 'rule_id' in processing['processing_meta']: - pass - else: - rule_id = self.create_rule(processing) - # processing['processing_metadata']['rule_id'] = rule_id - proc.external_id = rule_id - if rule_id: + + if not proc.external_id: + if self.rule_id: + proc.external_id = self.rule_id proc.submitted_at = datetime.datetime.utcnow() + return True, None, None + else: + rule_id, error = self.create_rule(processing) + if rule_id: + proc.external_id = rule_id + proc.submitted_at = datetime.datetime.utcnow() + return True, None, None + else: + return False, None, error + else: + return True, None, None def poll_rule(self, processing): try: @@ -317,79 +327,57 @@ def poll_processing(self, processing): return processing, 'notOk', {} - def poll_processing_updates(self, processing, input_output_maps): + def poll_processing_updates(self, processing, input_output_maps, log_prefix=''): try: processing, rule_state, rep_status = self.poll_processing(processing) + self.logger.info(log_prefix + "poll_processing rule_state: %s" % rule_state) updated_contents = [] - content_substatus = {'finished': 0, 'unfinished': 0} + updated_contents_full = [] for map_id in input_output_maps: inputs = 
input_output_maps[map_id]['inputs'] outputs = input_output_maps[map_id]['outputs'] for content in inputs + outputs: key = '%s:%s' % (content['scope'], content['name']) if key in rep_status: - if content['substatus'] != rep_status[key]: + if rule_state in ['OK'] and content['substatus'] != ContentStatus.Available: updated_content = {'content_id': content['content_id'], - 'substatus': rep_status[key]} + 'status': ContentStatus.Available, + 'substatus': ContentStatus.Available} updated_contents.append(updated_content) - content['substatus'] = rep_status[key] - if content['substatus'] == ContentStatus.Available: - content_substatus['finished'] += 1 - else: - content_substatus['unfinished'] += 1 - - update_processing = {} - if rule_state == 'OK' and content_substatus['finished'] > 0 and content_substatus['unfinished'] == 0: - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': ProcessingStatus.Finished}} - elif self.toexpire: - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': ProcessingStatus.Expired}} - elif self.tocancel: - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': ProcessingStatus.Cancelled}} - elif self.tosuspend: - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': ProcessingStatus.Suspended}} - elif self.toresume: - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': ProcessingStatus.Running}} - update_processing['parameters']['expired_at'] = None - processing['expired_at'] = None - proc = processing['processing_metadata']['processing'] - proc.has_new_updates() - elif self.tofinish: - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': ProcessingStatus.SubFinished}} - elif self.toforcefinish: - for map_id in input_output_maps: - inputs = input_output_maps[map_id]['inputs'] - outputs = input_output_maps[map_id]['outputs'] - for content in inputs + outputs: - if content['substatus'] not in [ContentStatus.Available, ContentStatus.FakeAvailable]: + content['status'] = ContentStatus.Available + content['substatus'] = ContentStatus.Available + updated_contents_full.append(content) + elif content['substatus'] != rep_status[key]: updated_content = {'content_id': content['content_id'], - 'substatus': ContentStatus.FakeAvailable} + 'status': ContentStatus.Available, + 'substatus': rep_status[key]} updated_contents.append(updated_content) - content['substatus'] = ContentStatus.FakeAvailable + content['status'] = rep_status[key] + content['substatus'] = rep_status[key] + updated_contents_full.append(content) - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': ProcessingStatus.Finished}} + processing_status = ProcessingStatus.Running + if rule_state in ['OK']: + processing_status = ProcessingStatus.Finished + # elif rule_state in ['STUCK', 'SUSPENDED']: + elif rule_state in ['SUSPENDED']: + processing_status = ProcessingStatus.SubFinished if updated_contents: proc = processing['processing_metadata']['processing'] proc.has_new_updates() - return update_processing, updated_contents, {} + return processing_status, updated_contents, {}, updated_contents_full, {} except exceptions.ProcessNotFound as ex: self.logger.warn("processing_id %s not not found: %s" % (processing['processing_id'], str(ex))) - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': 
ProcessingStatus.SubFinished}} - return update_processing, [], {} + processing_status = ProcessingStatus.Failed + return processing_status, [], {}, [], {} except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) - raise ex + + return ProcessingStatus.Running, [], {}, [], {} def get_status_statistics(self, registered_input_output_maps): status_statistics = {} diff --git a/client/bin/idds b/client/bin/idds index 78b0d41f..2a1f6660 100755 --- a/client/bin/idds +++ b/client/bin/idds @@ -72,31 +72,20 @@ def get_requests_status(args): def abort_requests(args): wm = ClientManager(host=args.host, setup_client=True) - wm.abort(request_id=args.request_id, workload_id=args.workload_id) + ret = wm.abort(request_id=args.request_id, workload_id=args.workload_id) + print(ret) def abort_tasks(args): wm = ClientManager(host=args.host, setup_client=True) - wm.abort_tasks(request_id=args.request_id, workload_id=args.workload_id, task_id=args.task_id) - -def suspend_requests(args): - wm = ClientManager(host=args.host, setup_client=True) - wm.suspend(request_id=args.request_id, workload_id=args.workload_id) - - -def resume_requests(args): - wm = ClientManager(host=args.host, setup_client=True) - wm.resume(request_id=args.request_id, workload_id=args.workload_id) + ret = wm.abort_task(request_id=args.request_id, workload_id=args.workload_id, task_id=args.task_id) + print(ret) def retry_requests(args): wm = ClientManager(host=args.host, setup_client=True) - wm.retry(request_id=args.request_id, workload_id=args.workload_id) - - -def finish_requests(args): - wm = ClientManager(host=args.host, setup_client=True) - wm.finish(request_id=args.request_id, workload_id=args.workload_id, set_all_finished=args.set_all_finished) + ret = wm.retry(request_id=args.request_id, workload_id=args.workload_id) + print(ret) def download_logs(args): @@ -210,31 +199,12 @@ def get_parser(): abort_tasks_parser.add_argument('--workload_id', dest='workload_id', action='store', type=int, help='The workload id') abort_tasks_parser.add_argument('--task_id', dest='task_id', action='store', type=int, help='The task id') - # suspend requests - suspend_parser = subparsers.add_parser('suspend_requests', help='Suspend requests') - suspend_parser.set_defaults(function=suspend_requests) - suspend_parser.add_argument('--request_id', dest='request_id', action='store', type=int, help='The request id') - suspend_parser.add_argument('--workload_id', dest='workload_id', action='store', type=int, help='The workload id') - - # resume requests - resume_parser = subparsers.add_parser('resume_requests', help='Resume requests') - resume_parser.set_defaults(function=resume_requests) - resume_parser.add_argument('--request_id', dest='request_id', action='store', type=int, help='The request id') - resume_parser.add_argument('--workload_id', dest='workload_id', action='store', type=int, help='The workload id') - # retry requests retry_parser = subparsers.add_parser('retry_requests', help='Retry requests') retry_parser.set_defaults(function=retry_requests) retry_parser.add_argument('--request_id', dest='request_id', action='store', type=int, help='The request id') retry_parser.add_argument('--workload_id', dest='workload_id', action='store', type=int, help='The workload id') - # finish requests - finish_parser = subparsers.add_parser('finish_requests', help='Finish requests') - finish_parser.set_defaults(function=finish_requests) - finish_parser.add_argument('--request_id', dest='request_id', action='store', type=int, help='The request 
id') - finish_parser.add_argument('--workload_id', dest='workload_id', action='store', type=int, help='The workload id') - finish_parser.add_argument('--set_all_finished', default=False, action='store_true', help="Mark unfinished transformations as finished") - # download logs log_parser = subparsers.add_parser('download_logs', help='Download logs') log_parser.set_defaults(function=download_logs) diff --git a/client/lib/idds/client/clientmanager.py b/client/lib/idds/client/clientmanager.py index 60f102dd..3847a188 100644 --- a/client/lib/idds/client/clientmanager.py +++ b/client/lib/idds/client/clientmanager.py @@ -39,13 +39,14 @@ from idds.common import exceptions from idds.common.config import (get_local_cfg_file, get_local_config_root, get_local_config_value, get_main_config_file) -from idds.common.constants import RequestType, RequestStatus, ProcessingStatus +from idds.common.constants import RequestType, RequestStatus # from idds.common.utils import get_rest_host, exception_handler from idds.common.utils import exception_handler # from idds.workflowv2.work import Work, Parameter, WorkStatus # from idds.workflowv2.workflow import Condition, Workflow from idds.workflowv2.work import Collection +from idds.workflow.work import Collection as CollectionV1 setup_logging(__name__) @@ -397,7 +398,7 @@ def submit(self, workflow, username=None, userdn=None, use_dataset_name=True): if use_dataset_name: primary_init_work = workflow.get_primary_initial_collection() if primary_init_work: - if type(primary_init_work) in [Collection]: + if type(primary_init_work) in [Collection, CollectionV1]: props['scope'] = primary_init_work.scope props['name'] = primary_init_work.name else: @@ -422,24 +423,14 @@ def abort(self, request_id=None, workload_id=None): logging.error("Both request_id and workload_id are None. One of them should not be None") return (-1, "Both request_id and workload_id are None. One of them should not be None") - reqs = self.client.get_requests(request_id=request_id, workload_id=workload_id) - if reqs: - rets = [] - for req in reqs: - logging.info("Aborting request: %s" % req['request_id']) - # self.client.update_request(request_id=req['request_id'], parameters={'substatus': RequestStatus.ToCancel}) - self.client.send_message(request_id=req['request_id'], msg={'command': 'update_request', 'parameters': {'status': RequestStatus.ToCancel}}) - logging.info("Abort request registered successfully: %s" % req['request_id']) - ret = (0, "Abort request registered successfully: %s" % req['request_id']) - rets.append(ret) - return rets - else: - return (-1, 'No matching requests') + ret = self.client.abort_request(request_id=request_id, workload_id=workload_id) + # return (-1, 'No matching requests') + return ret @exception_handler - def abort_tasks(self, request_id=None, workload_id=None, task_id=None): + def abort_task(self, request_id=None, workload_id=None, task_id=None): """ - Abort tasks. + Abort task. :param workload_id: the workload id. :param request_id: the request. @@ -454,76 +445,8 @@ def abort_tasks(self, request_id=None, workload_id=None, task_id=None): logging.error("The task_id is required for killing tasks. 
If you want to kill the whole workflow, please try another API.") return (-1, "The task_id is required for killing tasks") - reqs = self.client.get_requests(request_id=request_id, workload_id=workload_id, with_processing=True) - if reqs: - rets = [] - for req in reqs: - if str(req['processing_workload_id']) == str(task_id): - logging.info("Aborting task: (request_id: %s, task_id: %s)" % (req['request_id'], task_id)) - self.client.send_message(request_id=req['request_id'], msg={'command': 'update_processing', - 'parameters': [{'status': ProcessingStatus.ToCancel, 'workload_id': task_id}]}) - logging.info("Abort task registered successfully: (request_id %s, task_id: %s)" % (req['request_id'], task_id)) - ret = (0, "Abort task registered successfully: (request_id %s, task_id: %s)" % (req['request_id'], task_id)) - rets.append(ret) - return rets - else: - return (-1, 'No matching requests') - - @exception_handler - def suspend(self, request_id=None, workload_id=None): - """ - Suspend requests. - - :param workload_id: the workload id. - :param request_id: the request. - """ - self.setup_client() - - if request_id is None and workload_id is None: - logging.error("Both request_id and workload_id are None. One of them should not be None") - return (-1, "Both request_id and workload_id are None. One of them should not be None") - - reqs = self.client.get_requests(request_id=request_id, workload_id=workload_id) - if reqs: - rets = [] - for req in reqs: - logging.info("Suspending request: %s" % req['request_id']) - # self.client.update_request(request_id=req['request_id'], parameters={'substatus': RequestStatus.ToSuspend}) - self.client.send_message(request_id=req['request_id'], msg={'command': 'update_request', 'parameters': {'status': RequestStatus.ToSuspend}}) - logging.info("Suspend request registered successfully: %s" % req['request_id']) - ret = (0, "Suspend request registered successfully: %s" % req['request_id']) - rets.append(ret) - return rets - else: - return (-1, 'No matching requests') - - @exception_handler - def resume(self, request_id=None, workload_id=None): - """ - Resume requests. - - :param workload_id: the workload id. - :param request_id: the request. - """ - self.setup_client() - - if request_id is None and workload_id is None: - logging.error("Both request_id and workload_id are None. One of them should not be None") - return (-1, "Both request_id and workload_id are None. One of them should not be None") - - reqs = self.client.get_requests(request_id=request_id, workload_id=workload_id) - if reqs: - rets = [] - for req in reqs: - logging.info("Resuming request: %s" % req['request_id']) - # self.client.update_request(request_id=req['request_id'], parameters={'substatus': RequestStatus.ToResume}) - self.client.send_message(request_id=req['request_id'], msg={'command': 'update_request', 'parameters': {'status': RequestStatus.ToResume}}) - logging.info("Resume request registered successfully: %s" % req['request_id']) - ret = (0, "Resume request registered successfully: %s" % req['request_id']) - rets.append(ret) - return rets - else: - return (-1, 'No matching requests') + ret = self.client.abort_request_task(request_id=request_id, workload_id=workload_id, task_id=task_id) + return ret @exception_handler def retry(self, request_id=None, workload_id=None): @@ -539,51 +462,8 @@ def retry(self, request_id=None, workload_id=None): logging.error("Both request_id and workload_id are None. One of them should not be None") return (-1, "Both request_id and workload_id are None. 
One of them should not be None") - reqs = self.client.get_requests(request_id=request_id, workload_id=workload_id) - if reqs: - rets = [] - for req in reqs: - logging.info("Retrying request: %s" % req['request_id']) - # self.client.update_request(request_id=req['request_id'], parameters={'substatus': RequestStatus.ToResume}) - self.client.send_message(request_id=req['request_id'], msg={'command': 'update_request', 'parameters': {'status': RequestStatus.ToResume}}) - logging.info("Retry request registered successfully: %s" % req['request_id']) - ret = (0, "Retry request registered successfully: %s" % req['request_id']) - rets.append(ret) - return rets - else: - return (-1, 'No matching requests') - - @exception_handler - def finish(self, request_id=None, workload_id=None, set_all_finished=False): - """ - Retry requests. - - :param workload_id: the workload id. - :param request_id: the request. - """ - self.setup_client() - - if request_id is None and workload_id is None: - logging.error("Both request_id and workload_id are None. One of them should not be None") - return (-1, "Both request_id and workload_id are None. One of them should not be None") - - reqs = self.client.get_requests(request_id=request_id, workload_id=workload_id) - if reqs: - rets = [] - for req in reqs: - logging.info("Finishing request: %s" % req['request_id']) - if set_all_finished: - # self.client.update_request(request_id=req['request_id'], parameters={'substatus': RequestStatus.ToForceFinish}) - self.client.send_message(request_id=req['request_id'], msg={'command': 'update_request', 'parameters': {'status': RequestStatus.ToForceFinish}}) - else: - # self.client.update_request(request_id=req['request_id'], parameters={'substatus': RequestStatus.ToFinish}) - self.client.send_message(request_id=req['request_id'], msg={'command': 'update_request', 'parameters': {'status': RequestStatus.ToFinish}}) - logging.info("ToFinish request registered successfully: %s" % req['request_id']) - ret = (0, "ToFinish request registered successfully: %s" % req['request_id']) - rets.append(ret) - return rets - else: - return (-1, 'No matching requests') + ret = self.client.retry_request(request_id=request_id, workload_id=workload_id) + return ret @exception_handler def get_requests(self, request_id=None, workload_id=None, with_detail=False, with_metadata=False): diff --git a/client/lib/idds/client/requestclient.py b/client/lib/idds/client/requestclient.py index b4239896..e6dfec21 100644 --- a/client/lib/idds/client/requestclient.py +++ b/client/lib/idds/client/requestclient.py @@ -105,3 +105,68 @@ def get_requests(self, request_id=None, workload_id=None, with_detail=False, wit # request['status'] = RequestStatus(request['status']) return requests + + def abort_request(self, request_id, workload_id=None): + """ + Abort Request. + + :param request_id: the request. + :param kwargs: other attributes of the request. + + :raise exceptions if it's not updated successfully. + """ + path = self.REQUEST_BASEURL + path += "/abort" + + if request_id is None: + request_id = 'null' + if workload_id is None: + workload_id = 'null' + + url = self.build_url(self.host, path=os.path.join(path, str(request_id), str(workload_id))) + r = self.get_request_response(url, type='PUT', data=None) + return r + + def abort_request_task(self, request_id, workload_id=None, task_id=None): + """ + Abort Request task. + + :param request_id: the request. + :param kwargs: other attributes of the request. + + :raise exceptions if it's not updated successfully. 
+ """ + path = self.REQUEST_BASEURL + path += "/abort" + + if request_id is None: + request_id = 'null' + if workload_id is None: + workload_id = 'null' + if task_id is None: + task_id = 'null' + + url = self.build_url(self.host, path=os.path.join(path, str(request_id), str(workload_id), str(task_id))) + r = self.get_request_response(url, type='PUT', data=None) + return r + + def retry_request(self, request_id, workload_id=None): + """ + Retry Request. + + :param request_id: the request. + :param kwargs: other attributes of the request. + + :raise exceptions if it's not updated successfully. + """ + path = self.REQUEST_BASEURL + path += "/retry" + + if request_id is None: + request_id = 'null' + if workload_id is None: + workload_id = 'null' + + url = self.build_url(self.host, path=os.path.join(path, str(request_id), str(workload_id))) + r = self.get_request_response(url, type='PUT', data=None) + return r diff --git a/common/lib/idds/common/constants.py b/common/lib/idds/common/constants.py index eb35a311..a3c1b846 100644 --- a/common/lib/idds/common/constants.py +++ b/common/lib/idds/common/constants.py @@ -30,6 +30,8 @@ class Sections: Carrier = 'carrier' Conductor = 'conductor' Consumer = 'consumer' + EventBus = 'eventbus' + Cache = 'cache' class HTTP_STATUS_CODE: @@ -112,7 +114,10 @@ class WorkStatus(IDDSEnum): ToExpire = 15 Expiring = 16 Expired = 17 - Running = 18 + ToFinish = 18 + ToForceFinish = 19 + Running = 20 + Terminating = 21 class RequestStatus(IDDSEnum): @@ -136,6 +141,7 @@ class RequestStatus(IDDSEnum): Expired = 17 ToFinish = 18 ToForceFinish = 19 + Terminating = 20 class RequestLocking(IDDSEnum): @@ -190,6 +196,7 @@ class TransformType(IDDSEnum): Derivation = 5 Processing = 6 Actuating = 7 + Data = 8 Other = 99 @@ -214,6 +221,7 @@ class TransformStatus(IDDSEnum): Expired = 17 ToFinish = 18 ToForceFinish = 19 + Terminating = 20 class TransformLocking(IDDSEnum): @@ -316,6 +324,9 @@ class ProcessingStatus(IDDSEnum): ToFinish = 24 ToForceFinish = 25 Broken = 26 + Terminating = 27 + ToTrigger = 28 + Triggering = 29 class ProcessingLocking(IDDSEnum): @@ -363,6 +374,35 @@ class MessageTypeStr(IDDSEnum): UnknownWork = 'work_unknown' +TransformType2MessageTypeMap = { + '0': {'transform_type': TransformType.Workflow, + 'work': (MessageType.UnknownWork, MessageTypeStr.UnknownWork), + 'collection': (MessageType.UnknownCollection, MessageTypeStr.UnknownCollection), + 'file': (MessageType.UnknownFile, MessageTypeStr.UnknownFile) + }, + '2': {'transform_type': TransformType.StageIn, + 'work': (MessageType.StageInWork, MessageTypeStr.StageInWork), + 'collection': (MessageType.StageInCollection, MessageTypeStr.StageInCollection), + 'file': (MessageType.StageInFile, MessageTypeStr.StageInFile) + }, + '3': {'transform_type': TransformType.ActiveLearning, + 'work': (MessageType.ActiveLearningWork, MessageTypeStr.ActiveLearningWork), + 'collection': (MessageType.ActiveLearningCollection, MessageTypeStr.ActiveLearningCollection), + 'file': (MessageType.ActiveLearningFile, MessageTypeStr.ActiveLearningFile) + }, + '4': {'transform_type': TransformType.HyperParameterOpt, + 'work': (MessageType.HyperParameterOptWork, MessageTypeStr.HyperParameterOptWork), + 'collection': (MessageType.HyperParameterOptCollection, MessageTypeStr.HyperParameterOptCollection), + 'file': (MessageType.HyperParameterOptFile, MessageTypeStr.HyperParameterOptFile) + }, + '6': {'transform_type': TransformType.Processing, + 'work': (MessageType.ProcessingWork, MessageTypeStr.ProcessingWork), + 'collection': 
(MessageType.ProcessingCollection, MessageTypeStr.ProcessingCollection), + 'file': (MessageType.ProcessingFile, MessageTypeStr.ProcessingFile) + } +} + + class MessageStatus(IDDSEnum): New = 0 Fetched = 1 @@ -391,3 +431,53 @@ class MessageDestination(IDDSEnum): Carrier = 3 Conductor = 4 Outside = 5 + + +class CommandType(IDDSEnum): + AbortRequest = 0 + ResumeRequest = 1 + ExpireRequest = 2 + + +class CommandStatus(IDDSEnum): + New = 0 + Processing = 1 + Processed = 2 + Failed = 3 + UnknownCommand = 4 + + +class CommandLocking(IDDSEnum): + Idle = 0 + Locking = 1 + + +class CommandLocation(IDDSEnum): + Clerk = 0 + Transformer = 1 + Transporter = 2 + Carrier = 3 + Conductor = 4 + Rest = 5 + Other = 6 + + +def get_work_status_from_transform_processing_status(status): + if status in [ProcessingStatus.New, TransformStatus.New]: + return WorkStatus.New + elif status in [ProcessingStatus.Submitting, ProcessingStatus.Submitted, TransformStatus.Transforming]: + return WorkStatus.Transforming + elif status in [ProcessingStatus.Running]: + return WorkStatus.Transforming + elif status in [ProcessingStatus.Finished, TransformStatus.Finished]: + return WorkStatus.Finished + elif status in [ProcessingStatus.Failed, ProcessingStatus.Broken, TransformStatus.Failed]: + return WorkStatus.Failed + elif status in [ProcessingStatus.SubFinished, TransformStatus.SubFinished]: + return WorkStatus.SubFinished + elif status in [ProcessingStatus.Cancelled, ProcessingStatus.Suspended, TransformStatus.Cancelled, TransformStatus.Suspended]: + return WorkStatus.Cancelled + elif status in [ProcessingStatus.Terminating, TransformStatus.Terminating]: + return WorkStatus.Terminating + else: + return WorkStatus.Transforming diff --git a/common/lib/idds/common/exceptions.py b/common/lib/idds/common/exceptions.py index b1617c17..09285d06 100644 --- a/common/lib/idds/common/exceptions.py +++ b/common/lib/idds/common/exceptions.py @@ -204,6 +204,16 @@ def __init__(self, *args, **kwargs): self.error_code = 404 +class ProcessSubmitFailed(IDDSException): + """ + ProcessSubmitFailed + """ + def __init__(self, *args, **kwargs): + super(ProcessSubmitFailed, self).__init__(*args, **kwargs) + self._message = "Failed to submit process." 
+ self.error_code = 405 + + class AgentException(IDDSException): """ BrokerException diff --git a/common/lib/idds/common/plugin/plugin_base.py b/common/lib/idds/common/plugin/plugin_base.py index 65122943..b71619c2 100644 --- a/common/lib/idds/common/plugin/plugin_base.py +++ b/common/lib/idds/common/plugin/plugin_base.py @@ -30,11 +30,17 @@ def __init__(self, **kwargs): def get_class_name(self): return self.__class__.__name__ - def setup_logger(self): + def setup_logger(self, logger=None): """ Setup logger """ - self.logger = logging.getLogger(self.get_class_name()) + if logger: + self.logger = logger + else: + self.logger = logging.getLogger(self.get_class_name()) + + def set_logger(self, logger): + self.logger = logger def __call__(self, **kwargs): return exceptions.NotImplementedException(self.get_class_name()) diff --git a/common/lib/idds/common/utils.py b/common/lib/idds/common/utils.py index 428fde66..6cbded24 100644 --- a/common/lib/idds/common/utils.py +++ b/common/lib/idds/common/utils.py @@ -53,13 +53,13 @@ def setup_logging(name, stream=None, loglevel=None): if config_has_section('common') and config_has_option('common', 'logdir'): logging.basicConfig(filename=os.path.join(config_get('common', 'logdir'), name), level=loglevel, - format='%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s') + format='%(asctime)s\t%(threadName)s\t%(name)s\t%(levelname)s\t%(message)s') else: logging.basicConfig(stream=sys.stdout, level=loglevel, - format='%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s') + format='%(asctime)s\t%(threadName)s\t%(name)s\t%(levelname)s\t%(message)s') else: logging.basicConfig(stream=stream, level=loglevel, - format='%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s') + format='%(asctime)s\t%(threadName)s\t%(name)s\t%(levelname)s\t%(message)s') def get_rest_url_prefix(): @@ -350,6 +350,8 @@ def default(self, obj): return obj.to_dict() elif isinstance(obj, datetime.datetime): return date_to_str(obj) + elif isinstance(obj, datetime.timedelta): + return str(obj) # elif isinstance(obj, (datetime.time, datetime.date)): # return obj.isoformat() # elif isinstance(obj, datetime.timedelta): diff --git a/docs/source/users/workflow_examples.rst b/docs/source/users/workflow_examples.rst index 1aea8415..3f7f88d3 100644 --- a/docs/source/users/workflow_examples.rst +++ b/docs/source/users/workflow_examples.rst @@ -25,6 +25,28 @@ Here is a simple example of loop workflow. cond = Condition(cond=work2.is_finished) workflow.add_loop_condition(cond) + # custom_condition + # iDDS will check to_continue status. + # With ATLASLocalPanDAWork, the output will be parsed. If 'to_continue' is in the output, it will be used.
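+ # For example (illustrative): if the parsed output contains {'to_continue': True}, the custom condition below is satisfied and the loop workflow runs another iteration; otherwise the loop stops.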
+ work2.add_custom_condition(key='to_continue', value=True) + cond = Condition(work2.get_custom_condition_status) + workflow.add_loop_condition(cond) + + # multiple custom_condition + # to_continue and to_continue1 + work2.add_custom_condition(key='to_continue', value=True, op='and') + work2.add_custom_condition(key='to_continue1', value=True, op='and') + cond = Condition(work2.get_custom_condition_status) + workflow.add_loop_condition(cond) + + # multiple custom_condition + # (to_continue and to_continue1) or to_exit or to_exit1 + work2.add_custom_condition(key='to_continue', value=True, op='and') + work2.add_custom_condition(key='to_continue1', value=True, op='and') + work2.add_custom_condition(key='to_exit', value=False, op='or') + work2.add_custom_condition(key='to_exit1', value=False, op='or') + cond = Condition(work2.get_custom_condition_status) + workflow.add_loop_condition(cond) Sub workflow ~~~~~~~~~~~~~~~~~~~~~~~ @@ -119,5 +141,12 @@ However, to avoid the global parameters overwrite the work's private attributes, workflow1.add_work(work1, initial=False) workflow1.add_work(work2, initial=False) - # to avoid + # global parameters workflow1.set_global_parameters({'user_attr1': 1, 'user_attr2': 2}) + + # sliced global parameters + workflow1.set_global_parameters({'user_attr': [1, 2, 3]}) + workflow1.set_sliced_global_parameters(source='user_attr', index=0) + workflow1.set_sliced_global_parameters(source='user_attr', index=1, name='user_myattr') + + diff --git a/doma/lib/idds/doma/workflow/domapandawork.py b/doma/lib/idds/doma/workflow/domapandawork.py index 8205ca94..e53b2425 100644 --- a/doma/lib/idds/doma/workflow/domapandawork.py +++ b/doma/lib/idds/doma/workflow/domapandawork.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2020 - 2021 +# - Wen Guan, , 2020 - 2022 # - Sergey Padolski, , 2020 @@ -37,22 +37,31 @@ class DomaPanDAWork(Work): def __init__(self, executable=None, arguments=None, parameters=None, setup=None, work_tag='lsst', exec_type='panda', sandbox=None, work_id=None, primary_input_collection=None, other_input_collections=None, + input_collections=None, + primary_output_collection=None, other_output_collections=None, output_collections=None, log_collections=None, logger=None, dependency_map=None, task_name="", - task_queue=None, processing_type=None, + task_queue=None, queue=None, processing_type=None, prodSourceLabel='test', task_type='test', maxwalltime=90000, maxattempt=5, core_count=1, encode_command_line=False, num_retries=5, + task_priority=900, task_log=None, task_cloud=None, - task_rss=1000): + task_site=None, + task_rss=1000, + vo='wlcg', + working_group='lsst'): super(DomaPanDAWork, self).__init__(executable=executable, arguments=arguments, parameters=parameters, setup=setup, work_type=TransformType.Processing, work_tag=work_tag, exec_type=exec_type, sandbox=sandbox, work_id=work_id, primary_input_collection=primary_input_collection, other_input_collections=other_input_collections, + primary_output_collection=primary_output_collection, + other_output_collections=other_output_collections, + input_collections=input_collections, output_collections=output_collections, log_collections=log_collections, release_inputs_after_submitting=True, @@ -64,27 +73,36 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, self.panda_auth = None self.panda_auth_vo = None self.panda_config_root = None + self.pandacache_url = None + self.panda_verify_host = None self.dependency_map = dependency_map
self.dependency_map_deleted = [] # self.logger.setLevel(logging.DEBUG) self.task_name = task_name + self.real_task_name = None self.set_work_name(task_name) - self.queue = task_queue + self.task_queue = task_queue + self.queue = queue self.dep_tasks_id_names_map = {} self.executable = executable self.processingType = processing_type self.prodSourceLabel = prodSourceLabel self.task_type = task_type self.maxWalltime = maxwalltime - self.maxAttempt = maxattempt - self.core_count = core_count + self.maxAttempt = maxattempt if maxattempt else 5 + self.core_count = core_count if core_count else 1 self.task_log = task_log self.encode_command_line = encode_command_line self.task_cloud = task_cloud + self.task_site = task_site self.task_rss = task_rss + self.task_priority = task_priority + + self.vo = vo + self.working_group = working_group self.retry_number = 0 self.num_retries = num_retries @@ -93,6 +111,8 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, self.load_panda_urls() + self.dependency_tasks = None + def my_condition(self): if self.is_finished(): return True @@ -119,46 +139,43 @@ def load_panda_urls(self): self.panda_url = None self.panda_url_ssl = None self.panda_monitor = None + self.panda_auth = None + self.panda_auth_vo = None + self.panda_config_root = None + self.pandacache_url = None + self.panda_verify_host = None if panda_config.has_section('panda'): - if panda_config.has_option('panda', 'panda_monitor_url'): + if 'PANDA_MONITOR_URL' not in os.environ and panda_config.has_option('panda', 'panda_monitor_url'): self.panda_monitor = panda_config.get('panda', 'panda_monitor_url') os.environ['PANDA_MONITOR_URL'] = self.panda_monitor # self.logger.debug("Panda monitor url: %s" % str(self.panda_monitor)) - if panda_config.has_option('panda', 'panda_url'): + if 'PANDA_URL' not in os.environ and panda_config.has_option('panda', 'panda_url'): self.panda_url = panda_config.get('panda', 'panda_url') os.environ['PANDA_URL'] = self.panda_url # self.logger.debug("Panda url: %s" % str(self.panda_url)) - if panda_config.has_option('panda', 'panda_url_ssl'): + if 'PANDACACHE_URL' not in os.environ and panda_config.has_option('panda', 'pandacache_url'): + self.pandacache_url = panda_config.get('panda', 'pandacache_url') + os.environ['PANDACACHE_URL'] = self.pandacache_url + # self.logger.debug("Pandacache url: %s" % str(self.pandacache_url)) + if 'PANDA_VERIFY_HOST' not in os.environ and panda_config.has_option('panda', 'panda_verify_host'): + self.panda_verify_host = panda_config.get('panda', 'panda_verify_host') + os.environ['PANDA_VERIFY_HOST'] = self.panda_verify_host + # self.logger.debug("Panda verify host: %s" % str(self.panda_verify_host)) + if 'PANDA_URL_SSL' not in os.environ and panda_config.has_option('panda', 'panda_url_ssl'): self.panda_url_ssl = panda_config.get('panda', 'panda_url_ssl') os.environ['PANDA_URL_SSL'] = self.panda_url_ssl # self.logger.debug("Panda url ssl: %s" % str(self.panda_url_ssl)) - if panda_config.has_option('panda', 'panda_auth'): + if 'PANDA_AUTH' not in os.environ and panda_config.has_option('panda', 'panda_auth'): self.panda_auth = panda_config.get('panda', 'panda_auth') os.environ['PANDA_AUTH'] = self.panda_auth - if panda_config.has_option('panda', 'panda_auth_vo'): + if 'PANDA_AUTH_VO' not in os.environ and panda_config.has_option('panda', 'panda_auth_vo'): self.panda_auth_vo = panda_config.get('panda', 'panda_auth_vo') os.environ['PANDA_AUTH_VO'] = self.panda_auth_vo - if panda_config.has_option('panda', 
'panda_config_root'): + if 'PANDA_CONFIG_ROOT' not in os.environ and panda_config.has_option('panda', 'panda_config_root'): self.panda_config_root = panda_config.get('panda', 'panda_config_root') os.environ['PANDA_CONFIG_ROOT'] = self.panda_config_root - if not self.panda_monitor and 'PANDA_MONITOR_URL' in os.environ and os.environ['PANDA_MONITOR_URL']: - self.panda_monitor = os.environ['PANDA_MONITOR_URL'] - # self.logger.debug("Panda monitor url: %s" % str(self.panda_monitor)) - if not self.panda_url and 'PANDA_URL' in os.environ and os.environ['PANDA_URL']: - self.panda_url = os.environ['PANDA_URL'] - # self.logger.debug("Panda url: %s" % str(self.panda_url)) - if not self.panda_url_ssl and 'PANDA_URL_SSL' in os.environ and os.environ['PANDA_URL_SSL']: - self.panda_url_ssl = os.environ['PANDA_URL_SSL'] - # self.logger.debug("Panda url ssl: %s" % str(self.panda_url_ssl)) - if not self.panda_auth and 'PANDA_AUTH' in os.environ and os.environ['PANDA_AUTH']: - self.panda_auth = os.environ['PANDA_AUTH'] - if not self.panda_auth_vo and 'PANDA_AUTH_VO' in os.environ and os.environ['PANDA_AUTH_VO']: - self.panda_auth_vo = os.environ['PANDA_AUTH_VO'] - if not self.panda_config_root and 'PANDA_CONFIG_ROOT' in os.environ and os.environ['PANDA_CONFIG_ROOT']: - self.panda_config_root = os.environ['PANDA_CONFIG_ROOT'] - def set_agent_attributes(self, attrs, req_attributes=None): if 'life_time' not in attrs[self.class_name] or int(attrs[self.class_name]['life_time']) <= 0: attrs['life_time'] = None @@ -169,14 +186,35 @@ def set_agent_attributes(self, attrs, req_attributes=None): self.poll_panda_jobs_chunk_size = int(self.agent_attributes['poll_panda_jobs_chunk_size']) def depend_on(self, work): + self.logger.debug("checking depending on") + if self.dependency_tasks is None: + self.logger.debug("constructing dependency_tasks set") + dependency_tasks = set([]) + for job in self.dependency_map: + inputs_dependency = job["dependencies"] + + for input_d in inputs_dependency: + task_name = input_d['task'] + dependency_tasks.add(task_name) + self.dependency_tasks = list(dependency_tasks) + + if work.task_name in self.dependency_tasks: + self.logger.debug("finished checking depending on") + return True + else: + self.logger.debug("finished checking depending on") + return False + + def get_ancestry_works(self): + tasks = set([]) for job in self.dependency_map: inputs_dependency = job["dependencies"] for input_d in inputs_dependency: task_name = input_d['task'] - if task_name == work.task_name: - return True - return False + if task_name not in tasks: + tasks.add(task_name) + return list(tasks) def poll_external_collection(self, coll): try: @@ -207,18 +245,19 @@ def poll_external_collection(self, coll): self.logger.error(traceback.format_exc()) raise exceptions.IDDSException('%s: %s' % (str(ex), traceback.format_exc())) - def get_input_collections(self): + def get_input_collections(self, poll_externel=True): """ *** Function called by Transformer agent. 
""" - colls = [self.primary_input_collection] + self.other_input_collections + colls = [self._primary_input_collection] + self._other_input_collections for coll_int_id in colls: coll = self.collections[coll_int_id] # if self.is_internal_collection(coll): # coll = self.poll_internal_collection(coll) # else: # coll = self.poll_external_collection(coll) - coll = self.poll_external_collection(coll) + if poll_externel: + coll = self.poll_external_collection(coll) self.collections[coll_int_id] = coll return super(DomaPanDAWork, self).get_input_collections() @@ -281,6 +320,34 @@ def get_unmapped_jobs(self, mapped_input_output_maps={}): unmapped_jobs.append(job) return unmapped_jobs + def has_dependency(self): + for job in self.dependency_map: + if "dependencies" in job and job["dependencies"]: + return True + return False + + def get_parent_work_names(self): + parent_work_names = [] + for job in self.dependency_map: + if "dependencies" in job and job["dependencies"]: + inputs_dependency = job["dependencies"] + for input_d in inputs_dependency: + task_name = input_d['task'] + if task_name not in parent_work_names: + parent_work_names.append(task_name) + return parent_work_names + + def get_parent_workload_ids(self): + parent_workload_ids = [] + parent_work_names = self.get_parent_work_names() + work_name_to_coll_map = self.get_work_name_to_coll_map() + for work_name in parent_work_names: + if work_name in work_name_to_coll_map: + input_d_coll = work_name_to_coll_map[work_name]['outputs'][0] + if input_d_coll and 'workload_id' in input_d_coll: + parent_workload_ids.append(input_d_coll['workload_id']) + return parent_workload_ids + def get_new_input_output_maps(self, mapped_input_output_maps={}): """ *** Function called by Transformer agent. @@ -374,17 +441,19 @@ def create_processing(self, input_output_maps=[]): in_files.append(job['name']) task_param_map = {} - task_param_map['vo'] = 'wlcg' - if self.queue and len(self.queue) > 0: + task_param_map['vo'] = self.vo + if self.task_queue and len(self.task_queue) > 0: + task_param_map['site'] = self.task_queue + elif self.queue and len(self.queue) > 0: task_param_map['site'] = self.queue - task_param_map['workingGroup'] = 'lsst' + task_param_map['workingGroup'] = self.working_group task_param_map['nFilesPerJob'] = 1 task_param_map['nFiles'] = len(in_files) task_param_map['noInput'] = True task_param_map['pfnList'] = in_files task_param_map['taskName'] = self.task_name - task_param_map['userName'] = 'iDDS' - task_param_map['taskPriority'] = 900 + task_param_map['userName'] = self.username if self.username else 'iDDS' + task_param_map['taskPriority'] = self.task_priority task_param_map['architecture'] = '' task_param_map['transUses'] = '' task_param_map['transHome'] = None @@ -397,21 +466,24 @@ def create_processing(self, input_output_maps=[]): task_param_map['transPath'] = 'https://storage.googleapis.com/drp-us-central1-containers/bash-c' task_param_map['processingType'] = self.processingType task_param_map['prodSourceLabel'] = self.prodSourceLabel + task_param_map['noWaitParent'] = True task_param_map['taskType'] = self.task_type task_param_map['coreCount'] = self.core_count task_param_map['skipScout'] = True task_param_map['cloud'] = self.task_cloud + task_param_map['PandaSite'] = self.task_site if self.task_rss and self.task_rss > 0: task_param_map['ramCount'] = self.task_rss - task_param_map['ramUnit'] = 'MB' + # task_param_map['ramUnit'] = 'MB' + task_param_map['ramUnit'] = 'MBPerCoreFixed' task_param_map['inputPreStaging'] = True 
task_param_map['prestagingRuleID'] = 123 task_param_map['nChunksToWait'] = 1 - task_param_map['maxCpuCount'] = self.maxWalltime + task_param_map['maxCpuCount'] = self.core_count task_param_map['maxWalltime'] = self.maxWalltime - task_param_map['maxFailure'] = self.maxAttempt - task_param_map['maxAttempt'] = self.maxAttempt + task_param_map['maxFailure'] = self.maxAttempt if self.maxAttempt else 5 + task_param_map['maxAttempt'] = self.maxAttempt if self.maxAttempt else 5 task_param_map['log'] = self.task_log task_param_map['jobParameters'] = [ {'type': 'constant', @@ -434,16 +506,34 @@ def submit_panda_task(self, processing): proc = processing['processing_metadata']['processing'] task_param = proc.processing_metadata['task_param'] - return_code = Client.insertTaskParams(task_param, verbose=False) - if return_code[0] == 0: - return return_code[1][1] + if self.has_dependency(): + return_code = Client.insertTaskParams(task_param, verbose=True, parent_tid=self.parent_workload_id) + else: + return_code = Client.insertTaskParams(task_param, verbose=True) + if return_code[0] == 0 and return_code[1][0] is True: + try: + task_id = int(return_code[1][1]) + return task_id, None + except Exception as ex: + self.logger.warn("task id is not retruned: (%s) is not task id: %s" % (return_code[1][1], str(ex))) + # jediTaskID=26468582 + if return_code[1][1] and 'jediTaskID=' in return_code[1][1]: + parts = return_code[1][1].split(" ") + for part in parts: + if 'jediTaskID=' in part: + task_id = int(part.split("=")[1]) + return task_id, None + else: + return None, return_code else: self.logger.warn("submit_panda_task, return_code: %s" % str(return_code)) + return None, return_code except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) # raise exceptions.AgentPluginError('%s: %s' % (str(ex), traceback.format_exc())) - return None + return None, str(ex) + return None, None def submit_processing(self, processing): """ @@ -451,15 +541,15 @@ def submit_processing(self, processing): """ proc = processing['processing_metadata']['processing'] if proc.workload_id: - # if 'task_id' in processing['processing_metadata'] and processing['processing_metadata']['task_id']: pass + return True, proc.workload_id, None else: - task_id = self.submit_panda_task(processing) - # processing['processing_metadata']['task_id'] = task_id - # processing['processing_metadata']['workload_id'] = task_id - proc.workload_id = task_id + task_id, errors = self.submit_panda_task(processing) if task_id: + proc.workload_id = task_id proc.submitted_at = datetime.datetime.utcnow() + return True, task_id, errors + return False, None, errors def get_panda_task_id(self, processing): from pandaclient import Client @@ -475,7 +565,10 @@ def get_panda_task_id(self, processing): task_id = None for req_id in results: task_name = results[req_id]['taskName'] - if proc.workload_id is None and task_name == self.task_name: + local_task_name = proc.task_name + if not local_task_name: + local_task_name = self.task_name + if proc.workload_id is None and task_name == local_task_name: task_id = results[req_id]['jediTaskID'] # processing['processing_metadata']['task_id'] = task_id # processing['processing_metadata']['workload_id'] = task_id @@ -509,9 +602,11 @@ def get_processing_status_from_panda_status(self, task_status): elif task_status in ['finished', 'paused']: # finished, finishing, waiting it to be done processing_status = ProcessingStatus.SubFinished - elif task_status in ['failed', 'aborted', 'broken', 'exhausted']: + elif 
task_status in ['failed', 'aborted', 'exhausted']: # aborting, tobroken processing_status = ProcessingStatus.Failed + elif task_status in ['broken']: + processing_status = ProcessingStatus.Broken else: # finished, finishing, aborting, topreprocess, preprocessing, tobroken # toretry, toincexec, rerefine, paused, throttled, passed @@ -606,6 +701,9 @@ def get_map_id_from_input(self, input_output_maps, input_file): return None def get_content_status_from_panda_status(self, job_info): + if job_info is None: + return ContentStatus.Processing + jobstatus = job_info.jobStatus if jobstatus in ['finished', 'merging']: return ContentStatus.Available @@ -639,16 +737,37 @@ def get_update_contents_from_map_id(self, map_id, input_output_maps, job_info): update_contents.append(content) return update_contents + def get_panda_job_status(self, jobids): + self.logger.debug("get_panda_job_status, jobids[:10]: %s" % str(jobids[:10])) + from pandaclient import Client + ret = Client.getJobStatus(jobids, verbose=0) + if ret[0] == 0: + left_jobids = [] + ret_jobs = [] + jobs_list = ret[1] + for jobid, jobinfo in zip(jobids, jobs_list): + if jobinfo is None: + left_jobids.append(jobid) + else: + ret_jobs.append(jobinfo) + if left_jobids: + ret1 = Client.getFullJobStatus(ids=left_jobids, verbose=False) + if ret1[0] == 0: + left_jobs_list = ret1[1] + ret_jobs = ret_jobs + left_jobs_list + return ret_jobs + return [] + def map_panda_ids(self, unregistered_job_ids, input_output_maps): self.logger.debug("map_panda_ids, unregistered_job_ids[:10]: %s" % str(unregistered_job_ids[:10])) - from pandaclient import Client # updated_map_ids = [] full_update_contents = [] chunksize = 2000 chunks = [unregistered_job_ids[i:i + chunksize] for i in range(0, len(unregistered_job_ids), chunksize)] for chunk in chunks: - jobs_list = Client.getJobStatus(chunk, verbose=0)[1] + # jobs_list = Client.getJobStatus(chunk, verbose=0)[1] + jobs_list = self.get_panda_job_status(chunk) for job_info in jobs_list: if job_info and job_info.Files and len(job_info.Files) > 0: for job_file in job_info.Files: @@ -669,13 +788,13 @@ def map_panda_ids(self, unregistered_job_ids, input_output_maps): def get_status_changed_contents(self, unterminated_job_ids, input_output_maps, panda_id_to_map_ids): self.logger.debug("get_status_changed_contents, unterminated_job_ids[:10]: %s" % str(unterminated_job_ids[:10])) - from pandaclient import Client full_update_contents = [] chunksize = 2000 chunks = [unterminated_job_ids[i:i + chunksize] for i in range(0, len(unterminated_job_ids), chunksize)] for chunk in chunks: - jobs_list = Client.getJobStatus(chunk, verbose=0)[1] + # jobs_list = Client.getJobStatus(chunk, verbose=0)[1] + jobs_list = self.get_panda_job_status(chunk) for job_info in jobs_list: panda_id = job_info.PandaID map_id = panda_id_to_map_ids[panda_id] @@ -695,88 +814,17 @@ def get_final_update_contents(self, input_output_maps): return update_contents - def poll_panda_task_old(self, processing=None, input_output_maps=None): - task_id = None - try: - from pandaclient import Client - - jobs_ids = None - if processing: - proc = processing['processing_metadata']['processing'] - task_id = proc.workload_id - if task_id is None: - task_id = self.get_panda_task_id(processing) - - if task_id: - # ret_ids = Client.getPandaIDsWithTaskID(task_id, verbose=False) - self.logger.debug("poll_panda_task, task_id: %s" % str(task_id)) - task_info = Client.getJediTaskDetails({'jediTaskID': task_id}, True, True, verbose=False) - self.logger.debug("poll_panda_task, 
task_info[0]: %s" % str(task_info[0])) - if task_info[0] != 0: - self.logger.warn("poll_panda_task %s, error getting task status, task_info: %s" % (task_id, str(task_info))) - return ProcessingStatus.Submitting, {} - - task_info = task_info[1] - - processing_status = self.get_processing_status_from_panda_status(task_info["status"]) - - if processing_status in [ProcessingStatus.SubFinished]: - if self.retry_number < self.num_retries: - self.reactivate_processing(processing) - processing_status = ProcessingStatus.Submitted - self.retry_number += 1 - - jobs_ids = task_info['PandaID'] - ret_get_registered_panda_jobids = self.get_registered_panda_jobids(input_output_maps) - terminated_job_ids, unterminated_job_ids, map_id_without_panda_ids, panda_id_to_map_ids = ret_get_registered_panda_jobids - - registered_job_ids = terminated_job_ids + unterminated_job_ids - unregistered_job_ids = [] - for job_id in jobs_ids: - if job_id not in registered_job_ids: - unregistered_job_ids.append(job_id) - - map_update_contents = self.map_panda_ids(unregistered_job_ids, input_output_maps) - status_changed_update_contents = self.get_status_changed_contents(unterminated_job_ids, input_output_maps, panda_id_to_map_ids) - final_update_contents = [] - - if processing_status in [ProcessingStatus.SubFinished, ProcessingStatus.Finished, ProcessingStatus.Failed]: - if (unregistered_job_ids or unterminated_job_ids): - # there are still polling contents, should not terminate the task. - log_warn = "Processing (%s) with panda id (%s) is %s, however there are still unregistered_job_ids(%s) or unterminated_job_ids(%s)" % (processing['processing_id'], - task_id, - processing_status, - str(unregistered_job_ids), - str(unterminated_job_ids)) - log_warn = log_warn + ". Keep the processing status as running now." 
- self.logger.warn(log_warn) - processing_status = ProcessingStatus.Running - else: - final_update_contents = self.get_final_update_contents(input_output_maps) - if final_update_contents: - processing_status = ProcessingStatus.Running - return processing_status, map_update_contents + status_changed_update_contents + final_update_contents - else: - return ProcessingStatus.Failed, {} - except Exception as ex: - msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) - self.logger.error(msg) - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - # raise exceptions.IDDSException(msg) - return ProcessingStatus.Submitting, [] - def poll_panda_jobs(self, job_ids): job_ids = list(job_ids) self.logger.debug("poll_panda_jobs, poll_panda_jobs_chunk_size: %s, job_ids[:10]: %s" % (self.poll_panda_jobs_chunk_size, str(job_ids[:10]))) - from pandaclient import Client # updated_map_ids = [] inputname_jobid_map = {} chunksize = self.poll_panda_jobs_chunk_size chunks = [job_ids[i:i + chunksize] for i in range(0, len(job_ids), chunksize)] for chunk in chunks: - jobs_list = Client.getJobStatus(chunk, verbose=0)[1] + # jobs_list = Client.getJobStatus(chunk, verbose=0)[1] + jobs_list = self.get_panda_job_status(chunk) if jobs_list: self.logger.debug("poll_panda_jobs, input jobs: %s, output_jobs: %s" % (len(chunk), len(jobs_list))) for job_info in jobs_list: @@ -803,32 +851,27 @@ def get_job_maps(self, input_output_maps): for map_id in input_output_maps: inputs = input_output_maps[map_id]['inputs'] outputs = input_output_maps[map_id]['outputs'] - outputs_short = [] for content in outputs: - outputs_short.append({'content_id': content['content_id'], - 'status': content['status'], - 'substatus': content['substatus'], - 'content_metadata': content['content_metadata']}) - - if content['status'] in [ContentStatus.Available]: + if content['substatus'] in [ContentStatus.Available]: if 'panda_id' in content['content_metadata']: finished_jobs.append(content['content_metadata']['panda_id']) - elif content['status'] in [ContentStatus.Failed, ContentStatus.FinalFailed, + elif content['substatus'] in [ContentStatus.Failed, ContentStatus.FinalFailed, ContentStatus.Lost, ContentStatus.Deleted, ContentStatus.Missing]: if 'panda_id' in content['content_metadata']: failed_jobs.append(content['content_metadata']['panda_id']) for content in inputs: inputname_mapid_map[content['name']] = {'map_id': map_id, - 'outputs': outputs_short} + 'outputs': outputs} return finished_jobs + failed_jobs, inputname_mapid_map def get_update_contents(self, inputnames, inputname_mapid_map, inputname_jobid_map): self.logger.debug("get_update_contents, inputnames[:5]: %s" % str(inputnames[:5])) - self.logger.debug("get_update_contents, inputname_mapid_map[:5]: %s" % str({k: inputname_mapid_map[k] for k in inputnames[:5]})) - self.logger.debug("get_update_contents, inputname_jobid_map[:5]: %s" % str({k: inputname_jobid_map[k] for k in inputnames[:5]})) + # self.logger.debug("get_update_contents, inputname_mapid_map[:5]: %s" % str({k: inputname_mapid_map[k] for k in inputnames[:5]})) + self.logger.debug("get_update_contents, inputname_jobid_map[:3]: %s" % str({k: inputname_jobid_map[k] for k in inputnames[:3]})) update_contents = [] + update_contents_full = [] num_updated_contents, num_unupdated_contents = 0, 0 for inputname in inputnames: panda_id_status = inputname_jobid_map[inputname] @@ -838,6 +881,13 @@ def get_update_contents(self, inputnames, inputname_mapid_map, inputname_jobid_m 
contents = map_id_contents['outputs'] for content in contents: if content['substatus'] != panda_status: + # content['status'] = panda_status + content['substatus'] = panda_status + update_contents_full.append(content) + update_content = {'content_id': content['content_id'], + # 'status': panda_status, + 'substatus': panda_status} + # 'content_metadata': content['content_metadata'] if 'panda_id' in content['content_metadata'] and content['content_metadata']['panda_id']: # if content['content_metadata']['panda_id'] != job_info.PandaID: if content['content_metadata']['panda_id'] < panda_id: @@ -847,7 +897,9 @@ def get_update_contents(self, inputnames, inputname_mapid_map, inputname_jobid_m if content['content_metadata']['panda_id'] not in content['content_metadata']['old_panda_id']: content['content_metadata']['old_panda_id'].append(content['content_metadata']['panda_id']) content['content_metadata']['panda_id'] = panda_id + # content['status'] = panda_status content['substatus'] = panda_status + update_content['content_metadata'] = content['content_metadata'] elif content['content_metadata']['panda_id'] > panda_id: if 'old_panda_id' not in content['content_metadata']: content['content_metadata']['old_panda_id'] = [] @@ -855,6 +907,7 @@ def get_update_contents(self, inputnames, inputname_mapid_map, inputname_jobid_m content['content_metadata']['old_panda_id'].append(panda_id) # content['content_metadata']['panda_id'] = content['content_metadata']['panda_id'] # content['substatus'] = panda_status + update_content['content_metadata'] = content['content_metadata'] else: pass # content['content_metadata']['panda_id'] = panda_id @@ -862,16 +915,19 @@ def get_update_contents(self, inputnames, inputname_mapid_map, inputname_jobid_m else: content['content_metadata']['panda_id'] = panda_id content['substatus'] = panda_status + update_content['content_metadata'] = content['content_metadata'] - update_contents.append(content) + update_contents.append(update_content) num_updated_contents += 1 else: - num_unupdated_contents += 1 + # num_unupdated_contents += 1 + pass + self.logger.debug("get_update_contents, num_updated_contents: %s, num_unupdated_contents: %s" % (num_updated_contents, num_unupdated_contents)) - self.logger.debug("get_update_contents, update_contents[:5]: %s" % (str(update_contents[:5]))) - return update_contents + self.logger.debug("get_update_contents, update_contents[:3]: %s" % (str(update_contents[:3]))) + return update_contents, update_contents_full - def poll_panda_task(self, processing=None, input_output_maps=None): + def poll_panda_task(self, processing=None, input_output_maps=None, log_prefix=''): task_id = None try: from pandaclient import Client @@ -884,27 +940,22 @@ def poll_panda_task(self, processing=None, input_output_maps=None): if task_id: # ret_ids = Client.getPandaIDsWithTaskID(task_id, verbose=False) - self.logger.debug("poll_panda_task, task_id: %s" % str(task_id)) - task_info = Client.getJediTaskDetails({'jediTaskID': task_id}, True, True, verbose=False) - self.logger.debug("poll_panda_task, task_info[0]: %s" % str(task_info[0])) + self.logger.debug(log_prefix + "poll_panda_task, task_id: %s" % str(task_id)) + task_info = Client.getJediTaskDetails({'jediTaskID': task_id}, True, True, verbose=True) + self.logger.debug(log_prefix + "poll_panda_task, task_info[0]: %s" % str(task_info[0])) if task_info[0] != 0: - self.logger.warn("poll_panda_task %s, error getting task status, task_info: %s" % (task_id, str(task_info))) - return ProcessingStatus.Submitting, [] + 
self.logger.warn(log_prefix + "poll_panda_task %s, error getting task status, task_info: %s" % (task_id, str(task_info))) + return ProcessingStatus.Running, [], [] task_info = task_info[1] processing_status = self.get_processing_status_from_panda_status(task_info["status"]) - - if processing_status in [ProcessingStatus.SubFinished]: - if self.retry_number < self.num_retries: - self.reactivate_processing(processing) - processing_status = ProcessingStatus.Submitted - self.retry_number += 1 + self.logger.info(log_prefix + "poll_panda_task processing_status: %s" % processing_status) all_jobs_ids = task_info['PandaID'] terminated_jobs, inputname_mapid_map = self.get_job_maps(input_output_maps) - self.logger.debug("poll_panda_task, task_id: %s, all jobs: %s, terminated_jobs: %s" % (str(task_id), len(all_jobs_ids), len(terminated_jobs))) + self.logger.debug(log_prefix + "poll_panda_task, task_id: %s, all jobs: %s, terminated_jobs: %s" % (str(task_id), len(all_jobs_ids), len(terminated_jobs))) all_jobs_ids = set(all_jobs_ids) terminated_jobs = set(terminated_jobs) @@ -913,62 +964,22 @@ def poll_panda_task(self, processing=None, input_output_maps=None): inputname_jobid_map = self.poll_panda_jobs(unterminated_jobs) intersection_keys = set(inputname_mapid_map.keys()) & set(inputname_jobid_map.keys()) - updated_contents = self.get_update_contents(list(intersection_keys), inputname_mapid_map, inputname_jobid_map) - - final_update_contents = [] - if processing_status in [ProcessingStatus.SubFinished, ProcessingStatus.Finished, ProcessingStatus.Failed]: - if updated_contents: - # there are still polling contents, should not terminate the task. - log_warn = "Processing (%s) with panda task id (%s) is %s, however there are still updated_contents[:5]: %s" % (processing['processing_id'], - task_id, - processing_status, - str(updated_contents[:5])) - log_warn = log_warn + ". Keep the processing status as running now." - self.logger.warn(log_warn) - processing_status = ProcessingStatus.Running - elif list(unterminated_jobs): - log_warn = "Processing (%s) with panda task id (%s) is %s, however there are still unterminated_jobs[:5]: %s" % (processing['processing_id'], - task_id, - processing_status, - str(list(unterminated_jobs)[:5])) - log_warn = log_warn + ". Keep the processing status as running now." - self.logger.warn(log_warn) - processing_status = ProcessingStatus.Running - else: - # unsubmitted_inputnames = set(inputname_mapid_map.keys()) - set(inputname_jobid_map.keys()) - # unsubmitted_inputnames = list(unsubmitted_inputnames) - # if unsubmitted_inputnames: - # log_warn = "Processing (%s) with panda task id (%s) is %s, however there are still unsubmitted_inputnames[:5]: %s" % (processing['processing_id'], - # task_id, - # processing_status, - # str(unsubmitted_inputnames[:5])) - # log_warn = log_warn + ". Keep the processing status as running now." 
- # self.logger.warn(log_warn) - # processing_status = ProcessingStatus.Running - - for inputname in inputname_mapid_map: - map_id_contents = inputname_mapid_map[inputname] - contents = map_id_contents['outputs'] - for content in contents: - if (content['substatus'] not in [ContentStatus.Available, ContentStatus.FakeAvailable, ContentStatus.FinalFailed]): - content['content_metadata']['old_final_status'] = content['substatus'] - content['substatus'] = ContentStatus.FinalFailed - # final_update_contents.append(content) # TODO: mark other contents to Missing - - if final_update_contents: - processing_status = ProcessingStatus.Running - return processing_status, updated_contents + final_update_contents + updated_contents, update_contents_full = self.get_update_contents(list(intersection_keys), + inputname_mapid_map, + inputname_jobid_map) + + return processing_status, updated_contents, update_contents_full else: - return ProcessingStatus.Failed, [] + return ProcessingStatus.Running, [], [] except Exception as ex: msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) - self.logger.error(msg) - self.logger.error(ex) + self.logger.error(log_prefix + msg) + self.logger.error(log_prefix + str(ex)) self.logger.error(traceback.format_exc()) # raise exceptions.IDDSException(msg) - return ProcessingStatus.Submitting, [] + return ProcessingStatus.Running, [], [] - def kill_processing(self, processing): + def kill_processing(self, processing, log_prefix=''): try: if processing: from pandaclient import Client @@ -977,11 +988,13 @@ def kill_processing(self, processing): # task_id = processing['processing_metadata']['task_id'] # Client.killTask(task_id) Client.finishTask(task_id, soft=False) + self.logger.info(log_prefix + "finishTask: %s" % task_id) except Exception as ex: - msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) - raise exceptions.IDDSException(msg) + msg = "Failed to kill the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) + # raise exceptions.IDDSException(msg) + self.logger.error(log_prefix + "Failed to finishTask: %s, %s" % (task_id, msg)) - def kill_processing_force(self, processing): + def kill_processing_force(self, processing, log_prefix=''): try: if processing: from pandaclient import Client @@ -990,11 +1003,13 @@ def kill_processing_force(self, processing): # task_id = processing['processing_metadata']['task_id'] Client.killTask(task_id) # Client.finishTask(task_id, soft=True) + self.logger.info(log_prefix + "killTask: %s" % task_id) except Exception as ex: - msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) - raise exceptions.IDDSException(msg) + msg = "Failed to force kill the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) + # raise exceptions.IDDSException(msg) + self.logger.error(log_prefix + "Failed to force kill: %s, %s" % (task_id, msg)) - def reactivate_processing(self, processing): + def reactivate_processing(self, processing, log_prefix=''): try: if processing: from pandaclient import Client @@ -1004,122 +1019,40 @@ def reactivate_processing(self, processing): # Client.retryTask(task_id) status, out = Client.retryTask(task_id, newParams={}) - self.logger.warn("Retry processing(%s) with task id(%s): %s, %s" % (processing['processing_id'], task_id, status, out)) + self.logger.warn(log_prefix + "Resume processing(%s) with task id(%s): %s, %s" % (processing['processing_id'], 
task_id, status, out)) # Client.reactivateTask(task_id) # Client.resumeTask(task_id) except Exception as ex: - msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) - raise exceptions.IDDSException(msg) + msg = "Failed to resume the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) + # raise exceptions.IDDSException(msg) + self.logger.error(log_prefix + msg) + + def abort_processing(self, processing, log_prefix=''): + self.kill_processing_force(processing, log_prefix=log_prefix) + + def resume_processing(self, processing, log_prefix=''): + self.reactivate_processing(processing, log_prefix=log_prefix) - def poll_processing_updates(self, processing, input_output_maps): + def poll_processing_updates(self, processing, input_output_maps, log_prefix=''): """ *** Function called by Carrier agent. """ - updated_contents = [] - update_processing = {} - reset_expired_at = False - reactive_contents = [] - self.logger.debug("poll_processing_updates, input_output_maps.keys[:5]: %s" % str(list(input_output_maps.keys())[:5])) + update_contents = [] + update_contents_full = [] + self.logger.debug(log_prefix + "poll_processing_updates, input_output_maps.keys[:3]: %s" % str(list(input_output_maps.keys())[:3])) if processing: proc = processing['processing_metadata']['processing'] - if proc.tocancel: - self.logger.info("Cancelling processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.kill_processing_force(processing) - # self.kill_processing(processing) - proc.tocancel = False - proc.polling_retries = 0 - elif proc.tosuspend: - self.logger.info("Suspending processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.kill_processing_force(processing) - # self.kill_processing(processing) - proc.tosuspend = False - proc.polling_retries = 0 - elif proc.toresume: - self.logger.info("Resuming processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.reactivate_processing(processing) - reset_expired_at = True - proc.toresume = False - proc.polling_retries = 0 - proc.has_new_updates() - reactive_contents = self.reactive_contents(input_output_maps) - # elif self.is_processing_expired(processing): - elif proc.toexpire: - self.logger.info("Expiring processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.kill_processing(processing) - proc.toexpire = False - proc.polling_retries = 0 - elif proc.tofinish or proc.toforcefinish: - self.logger.info("Finishing processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.kill_processing(processing) - proc.tofinish = False - proc.toforcefinish = False - proc.polling_retries = 0 - elif self.is_all_contents_terminated_and_with_missing(input_output_maps): - self.logger.info("All contents terminated(There are Missing contents). 
Finishing processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.kill_processing(processing) - - processing_status, poll_updated_contents = self.poll_panda_task(processing=processing, input_output_maps=input_output_maps) - self.logger.debug("poll_processing_updates, processing_status: %s" % str(processing_status)) - self.logger.debug("poll_processing_updates, update_contents[:10]: %s" % str(poll_updated_contents[:10])) - - if poll_updated_contents: + + processing_status, update_contents, update_contents_full = self.poll_panda_task(processing=processing, + input_output_maps=input_output_maps, + log_prefix=log_prefix) + # self.logger.debug(log_prefix + "poll_processing_updates, processing_status: %s" % str(processing_status)) + # self.logger.debug(log_prefix + "poll_processing_updates, update_contents[:10]: %s" % str(update_contents[:10])) + + if update_contents: proc.has_new_updates() - for content in poll_updated_contents: - updated_content = {'content_id': content['content_id'], - 'substatus': content['substatus'], - 'content_metadata': content['content_metadata']} - updated_contents.append(updated_content) - - content_substatus = {'finished': 0, 'unfinished': 0} - for map_id in input_output_maps: - outputs = input_output_maps[map_id]['outputs'] - for content in outputs: - if content.get('substatus', ContentStatus.New) != ContentStatus.Available: - content_substatus['unfinished'] += 1 - else: - content_substatus['finished'] += 1 - - if processing_status in [ProcessingStatus.SubFinished, ProcessingStatus.Finished, ProcessingStatus.Failed] and updated_contents: - self.logger.info("Processing %s is terminated, but there are still contents to be flushed. Waiting." % (proc.workload_id)) - # there are still polling contents, should not terminate the task. - processing_status = ProcessingStatus.Running - - if processing_status in [ProcessingStatus.SubFinished] and content_substatus['finished'] > 0 and content_substatus['unfinished'] == 0: - # found that a 'done' panda task has got a 'finished' status. Maybe in this case 'finished' is a transparent status. - if proc.polling_retries is None: - proc.polling_retries = 0 - - if processing_status in [ProcessingStatus.SubFinished, ProcessingStatus.Finished, ProcessingStatus.Failed]: - if proc.polling_retries is not None and proc.polling_retries < 3: - self.logger.info("processing %s polling_retries(%s) < 3, keep running" % (processing['processing_id'], proc.polling_retries)) - processing_status = ProcessingStatus.Running - proc.polling_retries += 1 - else: - proc.polling_retries = 0 - - if proc.in_operation_time(): - processing_status = ProcessingStatus.Running - - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': processing_status}} - if reset_expired_at: - processing['expired_at'] = None - update_processing['parameters']['expired_at'] = None - proc.polling_retries = 0 - # if (processing_status in [ProcessingStatus.SubFinished, ProcessingStatus.Finished, ProcessingStatus.Failed] - # or processing['status'] in [ProcessingStatus.Resuming]): # noqa W503 - # using polling_retries to poll it again when panda may update the status in a delay(when issuing retryTask, panda will not update it without any delay). 
- update_processing['parameters']['status'] = ProcessingStatus.Resuming - proc.status = update_processing['parameters']['status'] - - self.logger.debug("poll_processing_updates, task: %s, update_processing: %s" % - (proc.workload_id, str(update_processing))) - self.logger.debug("poll_processing_updates, task: %s, updated_contents[:100]: %s" % - (proc.workload_id, str(updated_contents[:100]))) - self.logger.debug("poll_processing_updates, task: %s, reactive_contents[:100]: %s" % - (proc.workload_id, str(reactive_contents[:100]))) - return update_processing, updated_contents + reactive_contents, {} + return processing_status, update_contents, {}, update_contents_full, {} def get_status_statistics(self, registered_input_output_maps): status_statistics = {} @@ -1142,9 +1075,9 @@ def syn_work_status(self, registered_input_output_maps, all_updates_flushed=True self.logger.debug("syn_work_status, self.active_processings: %s" % str(self.active_processings)) self.logger.debug("syn_work_status, self.has_new_inputs(): %s" % str(self.has_new_inputs)) self.logger.debug("syn_work_status, coll_metadata_is_open: %s" % - str(self.collections[self.primary_input_collection].coll_metadata['is_open'])) + str(self.collections[self._primary_input_collection].coll_metadata['is_open'])) self.logger.debug("syn_work_status, primary_input_collection_status: %s" % - str(self.collections[self.primary_input_collection].status)) + str(self.collections[self._primary_input_collection].status)) self.logger.debug("syn_work_status(%s): is_processings_terminated: %s" % (str(self.get_processing_ids()), str(self.is_processings_terminated()))) self.logger.debug("syn_work_status(%s): is_input_collections_closed: %s" % (str(self.get_processing_ids()), str(self.is_input_collections_closed()))) @@ -1152,24 +1085,26 @@ def syn_work_status(self, registered_input_output_maps, all_updates_flushed=True self.logger.debug("syn_work_status(%s): has_to_release_inputs: %s" % (str(self.get_processing_ids()), str(self.has_to_release_inputs()))) self.logger.debug("syn_work_status(%s): to_release_input_contents: %s" % (str(self.get_processing_ids()), str(to_release_input_contents))) - if self.is_processings_terminated() and self.is_input_collections_closed() and not self.has_new_inputs and not self.has_to_release_inputs() and not to_release_input_contents: + # if self.is_processings_terminated() and self.is_input_collections_closed() and not self.has_new_inputs and not self.has_to_release_inputs() and not to_release_input_contents: + if self.is_processings_terminated(): # if not self.is_all_outputs_flushed(registered_input_output_maps): if not all_updates_flushed: self.logger.warn("The work processings %s is terminated. but not all outputs are flushed. 
Wait to flush the outputs then finish the transform" % str(self.get_processing_ids())) return - keys = self.status_statistics.keys() - if len(keys) == 1: - if ContentStatus.Available.name in keys: - self.status = WorkStatus.Finished - else: - self.status = WorkStatus.Failed - else: + if self.is_processings_finished(): + self.status = WorkStatus.Finished + elif self.is_processings_subfinished(): self.status = WorkStatus.SubFinished + elif self.is_processings_failed(): + self.status = WorkStatus.Failed + elif self.is_processings_expired(): + self.status = WorkStatus.Expired + elif self.is_processings_cancelled(): + self.status = WorkStatus.Cancelled + elif self.is_processings_suspended(): + self.status = WorkStatus.Suspended elif self.is_processings_running(): self.status = WorkStatus.Running else: self.status = WorkStatus.Transforming - - if self.is_processings_started(): - self.started = True diff --git a/doma/lib/idds/doma/workflowv2/domapandawork.py b/doma/lib/idds/doma/workflowv2/domapandawork.py index e3601785..39fc274e 100644 --- a/doma/lib/idds/doma/workflowv2/domapandawork.py +++ b/doma/lib/idds/doma/workflowv2/domapandawork.py @@ -17,6 +17,7 @@ import datetime import os +import time import traceback from idds.common import exceptions @@ -245,7 +246,7 @@ def poll_external_collection(self, coll): self.logger.error(traceback.format_exc()) raise exceptions.IDDSException('%s: %s' % (str(ex), traceback.format_exc())) - def get_input_collections(self): + def get_input_collections(self, poll_externel=True): """ *** Function called by Transformer agent. """ @@ -256,7 +257,8 @@ def get_input_collections(self): # coll = self.poll_internal_collection(coll) # else: # coll = self.poll_external_collection(coll) - coll = self.poll_external_collection(coll) + if poll_externel: + coll = self.poll_external_collection(coll) self.collections[coll_int_id] = coll return super(DomaPanDAWork, self).get_input_collections() @@ -319,6 +321,34 @@ def get_unmapped_jobs(self, mapped_input_output_maps={}): unmapped_jobs.append(job) return unmapped_jobs + def has_dependency(self): + for job in self.dependency_map: + if "dependencies" in job and job["dependencies"]: + return True + return False + + def get_parent_work_names(self): + parent_work_names = [] + for job in self.dependency_map: + if "dependencies" in job and job["dependencies"]: + inputs_dependency = job["dependencies"] + for input_d in inputs_dependency: + task_name = input_d['task'] + if task_name not in parent_work_names: + parent_work_names.append(task_name) + return parent_work_names + + def get_parent_workload_ids(self): + parent_workload_ids = [] + parent_work_names = self.get_parent_work_names() + work_name_to_coll_map = self.get_work_name_to_coll_map() + for work_name in parent_work_names: + if work_name in work_name_to_coll_map: + input_d_coll = work_name_to_coll_map[work_name]['outputs'][0] + if input_d_coll and 'workload_id' in input_d_coll: + parent_workload_ids.append(input_d_coll['workload_id']) + return parent_workload_ids + def get_new_input_output_maps(self, mapped_input_output_maps={}): """ *** Function called by Transformer agent. 
@@ -437,6 +467,7 @@ def create_processing(self, input_output_maps=[]): task_param_map['transPath'] = 'https://storage.googleapis.com/drp-us-central1-containers/bash-c' task_param_map['processingType'] = self.processingType task_param_map['prodSourceLabel'] = self.prodSourceLabel + task_param_map['noWaitParent'] = True task_param_map['taskType'] = self.task_type task_param_map['coreCount'] = self.core_count task_param_map['skipScout'] = True @@ -444,12 +475,13 @@ def create_processing(self, input_output_maps=[]): task_param_map['PandaSite'] = self.task_site if self.task_rss and self.task_rss > 0: task_param_map['ramCount'] = self.task_rss - task_param_map['ramUnit'] = 'MB' + # task_param_map['ramUnit'] = 'MB' + task_param_map['ramUnit'] = 'MBPerCoreFixed' task_param_map['inputPreStaging'] = True task_param_map['prestagingRuleID'] = 123 task_param_map['nChunksToWait'] = 1 - task_param_map['maxCpuCount'] = self.maxWalltime + task_param_map['maxCpuCount'] = self.core_count task_param_map['maxWalltime'] = self.maxWalltime task_param_map['maxFailure'] = self.maxAttempt if self.maxAttempt else 5 task_param_map['maxAttempt'] = self.maxAttempt if self.maxAttempt else 5 @@ -475,44 +507,57 @@ def submit_panda_task(self, processing): proc = processing['processing_metadata']['processing'] task_param = proc.processing_metadata['task_param'] - return_code = Client.insertTaskParams(task_param, verbose=True) + if 'new_retries' in processing and processing['new_retries']: + new_retries = int(processing['new_retries']) + task_param['taskName'] = task_param['taskName'] + "_" + str(new_retries) + if self.has_dependency(): + parent_tid = None + if self.parent_workload_id and int(self.parent_workload_id) > time.time() - 604800: + parent_tid = self.parent_workload_id + return_code = Client.insertTaskParams(task_param, verbose=True, parent_tid=parent_tid) + else: + return_code = Client.insertTaskParams(task_param, verbose=True) if return_code[0] == 0 and return_code[1][0] is True: - return return_code[1][1] + try: + task_id = int(return_code[1][1]) + return task_id, None + except Exception as ex: + self.logger.warn("task id is not retruned: (%s) is not task id: %s" % (return_code[1][1], str(ex))) + # jediTaskID=26468582 + if return_code[1][1] and 'jediTaskID=' in return_code[1][1]: + parts = return_code[1][1].split(" ") + for part in parts: + if 'jediTaskID=' in part: + task_id = int(part.split("=")[1]) + return task_id, None + else: + return None, return_code else: self.logger.warn("submit_panda_task, return_code: %s" % str(return_code)) + return None, return_code except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) # raise exceptions.AgentPluginError('%s: %s' % (str(ex), traceback.format_exc())) - return None + return None, str(ex) + return None, None def submit_processing(self, processing): """ *** Function called by Carrier agent. 
""" proc = processing['processing_metadata']['processing'] - if proc.workload_id: - # if 'task_id' in processing['processing_metadata'] and processing['processing_metadata']['task_id']: + # if proc.workload_id: + if False: pass + return True, proc.workload_id, None else: - task_id = self.submit_panda_task(processing) - # processing['processing_metadata']['task_id'] = task_id - # processing['processing_metadata']['workload_id'] = task_id - proc.workload_id = task_id + task_id, errors = self.submit_panda_task(processing) if task_id: + proc.workload_id = task_id proc.submitted_at = datetime.datetime.utcnow() - - def resubmit_processing(self, processing): - proc = processing['processing_metadata']['processing'] - proc.workload_id = None - task_param = proc.processing_metadata['task_param'] - if self.retry_number > 0: - proc.task_name = self.task_name + "_" + str(self.retry_number) - task_param['taskName'] = proc.task_name - task_id = self.submit_panda_task(processing) - proc.workload_id = task_id - if task_id: - proc.submitted_at = datetime.datetime.utcnow() + return True, task_id, errors + return False, None, errors def get_panda_task_id(self, processing): from pandaclient import Client @@ -664,6 +709,9 @@ def get_map_id_from_input(self, input_output_maps, input_file): return None def get_content_status_from_panda_status(self, job_info): + if job_info is None: + return ContentStatus.Processing + jobstatus = job_info.jobStatus if jobstatus in ['finished', 'merging']: return ContentStatus.Available @@ -774,77 +822,6 @@ def get_final_update_contents(self, input_output_maps): return update_contents - def poll_panda_task_old(self, processing=None, input_output_maps=None): - task_id = None - try: - from pandaclient import Client - - jobs_ids = None - if processing: - proc = processing['processing_metadata']['processing'] - task_id = proc.workload_id - if task_id is None: - task_id = self.get_panda_task_id(processing) - - if task_id: - # ret_ids = Client.getPandaIDsWithTaskID(task_id, verbose=False) - self.logger.debug("poll_panda_task, task_id: %s" % str(task_id)) - task_info = Client.getJediTaskDetails({'jediTaskID': task_id}, True, True, verbose=False) - self.logger.debug("poll_panda_task, task_info[0]: %s" % str(task_info[0])) - if task_info[0] != 0: - self.logger.warn("poll_panda_task %s, error getting task status, task_info: %s" % (task_id, str(task_info))) - return ProcessingStatus.Submitting, {} - - task_info = task_info[1] - - processing_status = self.get_processing_status_from_panda_status(task_info["status"]) - - if processing_status in [ProcessingStatus.SubFinished]: - if self.retry_number < self.num_retries: - self.reactivate_processing(processing) - processing_status = ProcessingStatus.Submitted - self.retry_number += 1 - - jobs_ids = task_info['PandaID'] - ret_get_registered_panda_jobids = self.get_registered_panda_jobids(input_output_maps) - terminated_job_ids, unterminated_job_ids, map_id_without_panda_ids, panda_id_to_map_ids = ret_get_registered_panda_jobids - - registered_job_ids = terminated_job_ids + unterminated_job_ids - unregistered_job_ids = [] - for job_id in jobs_ids: - if job_id not in registered_job_ids: - unregistered_job_ids.append(job_id) - - map_update_contents = self.map_panda_ids(unregistered_job_ids, input_output_maps) - status_changed_update_contents = self.get_status_changed_contents(unterminated_job_ids, input_output_maps, panda_id_to_map_ids) - final_update_contents = [] - - if processing_status in [ProcessingStatus.SubFinished, ProcessingStatus.Finished, 
ProcessingStatus.Failed]: - if (unregistered_job_ids or unterminated_job_ids): - # there are still polling contents, should not terminate the task. - log_warn = "Processing (%s) with panda id (%s) is %s, however there are still unregistered_job_ids(%s) or unterminated_job_ids(%s)" % (processing['processing_id'], - task_id, - processing_status, - str(unregistered_job_ids), - str(unterminated_job_ids)) - log_warn = log_warn + ". Keep the processing status as running now." - self.logger.warn(log_warn) - processing_status = ProcessingStatus.Running - else: - final_update_contents = self.get_final_update_contents(input_output_maps) - if final_update_contents: - processing_status = ProcessingStatus.Running - return processing_status, map_update_contents + status_changed_update_contents + final_update_contents - else: - return ProcessingStatus.Failed, {} - except Exception as ex: - msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) - self.logger.error(msg) - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - # raise exceptions.IDDSException(msg) - return ProcessingStatus.Submitting, [] - def poll_panda_jobs(self, job_ids): job_ids = list(job_ids) self.logger.debug("poll_panda_jobs, poll_panda_jobs_chunk_size: %s, job_ids[:10]: %s" % (self.poll_panda_jobs_chunk_size, str(job_ids[:10]))) @@ -882,32 +859,27 @@ def get_job_maps(self, input_output_maps): for map_id in input_output_maps: inputs = input_output_maps[map_id]['inputs'] outputs = input_output_maps[map_id]['outputs'] - outputs_short = [] for content in outputs: - outputs_short.append({'content_id': content['content_id'], - 'status': content['status'], - 'substatus': content['substatus'], - 'content_metadata': content['content_metadata']}) - - if content['status'] in [ContentStatus.Available]: + if content['substatus'] in [ContentStatus.Available]: if 'panda_id' in content['content_metadata']: finished_jobs.append(content['content_metadata']['panda_id']) - elif content['status'] in [ContentStatus.Failed, ContentStatus.FinalFailed, - ContentStatus.Lost, ContentStatus.Deleted, - ContentStatus.Missing]: + elif content['substatus'] in [ContentStatus.Failed, ContentStatus.FinalFailed, + ContentStatus.Lost, ContentStatus.Deleted, + ContentStatus.Missing]: if 'panda_id' in content['content_metadata']: failed_jobs.append(content['content_metadata']['panda_id']) for content in inputs: inputname_mapid_map[content['name']] = {'map_id': map_id, - 'outputs': outputs_short} + 'outputs': outputs} return finished_jobs + failed_jobs, inputname_mapid_map def get_update_contents(self, inputnames, inputname_mapid_map, inputname_jobid_map): self.logger.debug("get_update_contents, inputnames[:5]: %s" % str(inputnames[:5])) - self.logger.debug("get_update_contents, inputname_mapid_map[:5]: %s" % str({k: inputname_mapid_map[k] for k in inputnames[:5]})) - self.logger.debug("get_update_contents, inputname_jobid_map[:5]: %s" % str({k: inputname_jobid_map[k] for k in inputnames[:5]})) + # self.logger.debug("get_update_contents, inputname_mapid_map[:5]: %s" % str({k: inputname_mapid_map[k] for k in inputnames[:5]})) + self.logger.debug("get_update_contents, inputname_jobid_map[:3]: %s" % str({k: inputname_jobid_map[k] for k in inputnames[:3]})) update_contents = [] + update_contents_full = [] num_updated_contents, num_unupdated_contents = 0, 0 for inputname in inputnames: panda_id_status = inputname_jobid_map[inputname] @@ -917,6 +889,13 @@ def get_update_contents(self, inputnames, inputname_mapid_map, 
inputname_jobid_m contents = map_id_contents['outputs'] for content in contents: if content['substatus'] != panda_status: + # content['status'] = panda_status + content['substatus'] = panda_status + update_contents_full.append(content) + update_content = {'content_id': content['content_id'], + # 'status': panda_status, + 'substatus': panda_status} + # 'content_metadata': content['content_metadata'] if 'panda_id' in content['content_metadata'] and content['content_metadata']['panda_id']: # if content['content_metadata']['panda_id'] != job_info.PandaID: if content['content_metadata']['panda_id'] < panda_id: @@ -926,7 +905,9 @@ def get_update_contents(self, inputnames, inputname_mapid_map, inputname_jobid_m if content['content_metadata']['panda_id'] not in content['content_metadata']['old_panda_id']: content['content_metadata']['old_panda_id'].append(content['content_metadata']['panda_id']) content['content_metadata']['panda_id'] = panda_id + # content['status'] = panda_status content['substatus'] = panda_status + update_content['content_metadata'] = content['content_metadata'] elif content['content_metadata']['panda_id'] > panda_id: if 'old_panda_id' not in content['content_metadata']: content['content_metadata']['old_panda_id'] = [] @@ -934,6 +915,7 @@ def get_update_contents(self, inputnames, inputname_mapid_map, inputname_jobid_m content['content_metadata']['old_panda_id'].append(panda_id) # content['content_metadata']['panda_id'] = content['content_metadata']['panda_id'] # content['substatus'] = panda_status + update_content['content_metadata'] = content['content_metadata'] else: pass # content['content_metadata']['panda_id'] = panda_id @@ -941,16 +923,19 @@ def get_update_contents(self, inputnames, inputname_mapid_map, inputname_jobid_m else: content['content_metadata']['panda_id'] = panda_id content['substatus'] = panda_status + update_content['content_metadata'] = content['content_metadata'] - update_contents.append(content) + update_contents.append(update_content) num_updated_contents += 1 else: - num_unupdated_contents += 1 + # num_unupdated_contents += 1 + pass + self.logger.debug("get_update_contents, num_updated_contents: %s, num_unupdated_contents: %s" % (num_updated_contents, num_unupdated_contents)) - self.logger.debug("get_update_contents, update_contents[:5]: %s" % (str(update_contents[:5]))) - return update_contents + self.logger.debug("get_update_contents, update_contents[:3]: %s" % (str(update_contents[:3]))) + return update_contents, update_contents_full - def poll_panda_task(self, processing=None, input_output_maps=None): + def poll_panda_task(self, processing=None, input_output_maps=None, log_prefix=''): task_id = None try: from pandaclient import Client @@ -963,35 +948,22 @@ def poll_panda_task(self, processing=None, input_output_maps=None): if task_id: # ret_ids = Client.getPandaIDsWithTaskID(task_id, verbose=False) - self.logger.debug("poll_panda_task, task_id: %s" % str(task_id)) - task_info = Client.getJediTaskDetails({'jediTaskID': task_id}, True, True, verbose=False) - self.logger.debug("poll_panda_task, task_info[0]: %s" % str(task_info[0])) + self.logger.debug(log_prefix + "poll_panda_task, task_id: %s" % str(task_id)) + task_info = Client.getJediTaskDetails({'jediTaskID': task_id}, True, True, verbose=True) + self.logger.debug(log_prefix + "poll_panda_task, task_info[0]: %s" % str(task_info[0])) if task_info[0] != 0: - self.logger.warn("poll_panda_task %s, error getting task status, task_info: %s" % (task_id, str(task_info))) - return 
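# --- Illustrative sketch, not part of the patch ---
# get_update_contents above now returns two lists: compact per-content update
# records for the database and the full content objects for downstream use.
# A condensed restatement of the record it builds for each changed output; the
# helper name is hypothetical, the dictionary keys come from the change.
def build_update_record(content, panda_status, metadata_changed=False):
    record = {'content_id': content['content_id'],
              'substatus': panda_status}
    if metadata_changed:
        # content_metadata is only shipped when the recorded panda_id was updated
        record['content_metadata'] = content['content_metadata']
    return record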
ProcessingStatus.Submitting, [] + self.logger.warn(log_prefix + "poll_panda_task %s, error getting task status, task_info: %s" % (task_id, str(task_info))) + return ProcessingStatus.Running, [], [] task_info = task_info[1] processing_status = self.get_processing_status_from_panda_status(task_info["status"]) + self.logger.info(log_prefix + "poll_panda_task processing_status: %s" % processing_status) - if processing_status in [ProcessingStatus.SubFinished]: - if self.retry_number < self.num_retries: - self.reactivate_processing(processing) - processing_status = ProcessingStatus.Submitted - self.retry_number += 1 - if processing_status in [ProcessingStatus.Broken]: - self.logger.error("poll_panda_task, task_id: %s is broken. retry_number: %s, num_retries: %s" % (str(task_id), self.retry_number, self.num_retries)) - if self.num_retries == 0: - self.num_retries = 1 - if self.retry_number < self.num_retries: - self.retry_number += 1 - self.logger.error("poll_panda_task, task_id: %s is broken. resubmit the task. retry_number: %s, num_retries: %s" % (str(task_id), self.retry_number, self.num_retries)) - self.resubmit_processing(processing) - return ProcessingStatus.Submitting, [] all_jobs_ids = task_info['PandaID'] terminated_jobs, inputname_mapid_map = self.get_job_maps(input_output_maps) - self.logger.debug("poll_panda_task, task_id: %s, all jobs: %s, terminated_jobs: %s" % (str(task_id), len(all_jobs_ids), len(terminated_jobs))) + self.logger.debug(log_prefix + "poll_panda_task, task_id: %s, all jobs: %s, terminated_jobs: %s" % (str(task_id), len(all_jobs_ids), len(terminated_jobs))) all_jobs_ids = set(all_jobs_ids) terminated_jobs = set(terminated_jobs) @@ -1000,62 +972,22 @@ def poll_panda_task(self, processing=None, input_output_maps=None): inputname_jobid_map = self.poll_panda_jobs(unterminated_jobs) intersection_keys = set(inputname_mapid_map.keys()) & set(inputname_jobid_map.keys()) - updated_contents = self.get_update_contents(list(intersection_keys), inputname_mapid_map, inputname_jobid_map) - - final_update_contents = [] - if processing_status in [ProcessingStatus.SubFinished, ProcessingStatus.Finished, ProcessingStatus.Failed]: - if updated_contents: - # there are still polling contents, should not terminate the task. - log_warn = "Processing (%s) with panda task id (%s) is %s, however there are still updated_contents[:5]: %s" % (processing['processing_id'], - task_id, - processing_status, - str(updated_contents[:5])) - log_warn = log_warn + ". Keep the processing status as running now." - self.logger.warn(log_warn) - processing_status = ProcessingStatus.Running - elif list(unterminated_jobs): - log_warn = "Processing (%s) with panda task id (%s) is %s, however there are still unterminated_jobs[:5]: %s" % (processing['processing_id'], - task_id, - processing_status, - str(list(unterminated_jobs)[:5])) - log_warn = log_warn + ". Keep the processing status as running now." - self.logger.warn(log_warn) - processing_status = ProcessingStatus.Running - else: - # unsubmitted_inputnames = set(inputname_mapid_map.keys()) - set(inputname_jobid_map.keys()) - # unsubmitted_inputnames = list(unsubmitted_inputnames) - # if unsubmitted_inputnames: - # log_warn = "Processing (%s) with panda task id (%s) is %s, however there are still unsubmitted_inputnames[:5]: %s" % (processing['processing_id'], - # task_id, - # processing_status, - # str(unsubmitted_inputnames[:5])) - # log_warn = log_warn + ". Keep the processing status as running now." 
- # self.logger.warn(log_warn) - # processing_status = ProcessingStatus.Running - - for inputname in inputname_mapid_map: - map_id_contents = inputname_mapid_map[inputname] - contents = map_id_contents['outputs'] - for content in contents: - if (content['substatus'] not in [ContentStatus.Available, ContentStatus.FakeAvailable, ContentStatus.FinalFailed]): - content['content_metadata']['old_final_status'] = content['substatus'] - content['substatus'] = ContentStatus.FinalFailed - # final_update_contents.append(content) # TODO: mark other contents to Missing - - if final_update_contents: - processing_status = ProcessingStatus.Running - return processing_status, updated_contents + final_update_contents + updated_contents, update_contents_full = self.get_update_contents(list(intersection_keys), + inputname_mapid_map, + inputname_jobid_map) + + return processing_status, updated_contents, update_contents_full else: - return ProcessingStatus.New, [] + return ProcessingStatus.Running, [], [] except Exception as ex: msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) - self.logger.error(msg) - self.logger.error(ex) + self.logger.error(log_prefix + msg) + self.logger.error(log_prefix + str(ex)) self.logger.error(traceback.format_exc()) # raise exceptions.IDDSException(msg) - return ProcessingStatus.Submitting, [] + return ProcessingStatus.Running, [], [] - def kill_processing(self, processing): + def kill_processing(self, processing, log_prefix=''): try: if processing: from pandaclient import Client @@ -1064,11 +996,13 @@ def kill_processing(self, processing): # task_id = processing['processing_metadata']['task_id'] # Client.killTask(task_id) Client.finishTask(task_id, soft=False) + self.logger.info(log_prefix + "finishTask: %s" % task_id) except Exception as ex: - msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) - raise exceptions.IDDSException(msg) + msg = "Failed to kill the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) + # raise exceptions.IDDSException(msg) + self.logger.error(log_prefix + "Failed to finishTask: %s, %s" % (task_id, msg)) - def kill_processing_force(self, processing): + def kill_processing_force(self, processing, log_prefix=''): try: if processing: from pandaclient import Client @@ -1077,11 +1011,13 @@ def kill_processing_force(self, processing): # task_id = processing['processing_metadata']['task_id'] Client.killTask(task_id) # Client.finishTask(task_id, soft=True) + self.logger.info(log_prefix + "killTask: %s" % task_id) except Exception as ex: - msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) - raise exceptions.IDDSException(msg) + msg = "Failed to force kill the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) + # raise exceptions.IDDSException(msg) + self.logger.error(log_prefix + "Failed to force kill: %s, %s" % (task_id, msg)) - def reactivate_processing(self, processing): + def reactivate_processing(self, processing, log_prefix=''): try: if processing: from pandaclient import Client @@ -1091,122 +1027,40 @@ def reactivate_processing(self, processing): # Client.retryTask(task_id) status, out = Client.retryTask(task_id, newParams={}) - self.logger.warn("Retry processing(%s) with task id(%s): %s, %s" % (processing['processing_id'], task_id, status, out)) + self.logger.warn(log_prefix + "Resume processing(%s) with task id(%s): %s, %s" % (processing['processing_id'], 
task_id, status, out)) # Client.reactivateTask(task_id) # Client.resumeTask(task_id) except Exception as ex: - msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) - raise exceptions.IDDSException(msg) + msg = "Failed to resume the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) + # raise exceptions.IDDSException(msg) + self.logger.error(log_prefix + msg) - def poll_processing_updates(self, processing, input_output_maps): + def abort_processing(self, processing, log_prefix=''): + self.kill_processing_force(processing, log_prefix=log_prefix) + + def resume_processing(self, processing, log_prefix=''): + self.reactivate_processing(processing, log_prefix=log_prefix) + + def poll_processing_updates(self, processing, input_output_maps, log_prefix=''): """ *** Function called by Carrier agent. """ - updated_contents = [] - update_processing = {} - reset_expired_at = False - reactive_contents = [] - self.logger.debug("poll_processing_updates, input_output_maps.keys[:5]: %s" % str(list(input_output_maps.keys())[:5])) + update_contents = [] + update_contents_full = [] + self.logger.debug(log_prefix + "poll_processing_updates, input_output_maps.keys[:3]: %s" % str(list(input_output_maps.keys())[:3])) if processing: proc = processing['processing_metadata']['processing'] - if proc.tocancel: - self.logger.info("Cancelling processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.kill_processing_force(processing) - # self.kill_processing(processing) - proc.tocancel = False - proc.polling_retries = 0 - elif proc.tosuspend: - self.logger.info("Suspending processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - # self.kill_processing_force(processing) - self.kill_processing_force(processing) - proc.tosuspend = False - proc.polling_retries = 0 - elif proc.toresume: - self.logger.info("Resuming processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.reactivate_processing(processing) - reset_expired_at = True - proc.toresume = False - proc.polling_retries = 0 - proc.has_new_updates() - reactive_contents = self.reactive_contents(input_output_maps) - # elif self.is_processing_expired(processing): - elif proc.toexpire: - self.logger.info("Expiring processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.kill_processing(processing) - proc.toexpire = False - proc.polling_retries = 0 - elif proc.tofinish or proc.toforcefinish: - self.logger.info("Finishing processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.kill_processing(processing) - proc.tofinish = False - proc.toforcefinish = False - proc.polling_retries = 0 - elif self.is_all_contents_terminated_and_with_missing(input_output_maps): - self.logger.info("All contents terminated(There are Missing contents). 
Finishing processing (processing id: %s, jediTaskId: %s)" % (processing['processing_id'], proc.workload_id)) - self.kill_processing(processing) - - processing_status, poll_updated_contents = self.poll_panda_task(processing=processing, input_output_maps=input_output_maps) - self.logger.debug("poll_processing_updates, processing_status: %s" % str(processing_status)) - self.logger.debug("poll_processing_updates, update_contents[:10]: %s" % str(poll_updated_contents[:10])) - - if poll_updated_contents: + + processing_status, update_contents, update_contents_full = self.poll_panda_task(processing=processing, + input_output_maps=input_output_maps, + log_prefix=log_prefix) + # self.logger.debug(log_prefix + "poll_processing_updates, processing_status: %s" % str(processing_status)) + # self.logger.debug(log_prefix + "poll_processing_updates, update_contents[:10]: %s" % str(update_contents[:10])) + + if update_contents: proc.has_new_updates() - for content in poll_updated_contents: - updated_content = {'content_id': content['content_id'], - 'substatus': content['substatus'], - 'content_metadata': content['content_metadata']} - updated_contents.append(updated_content) - - content_substatus = {'finished': 0, 'unfinished': 0} - for map_id in input_output_maps: - outputs = input_output_maps[map_id]['outputs'] - for content in outputs: - if content.get('substatus', ContentStatus.New) != ContentStatus.Available: - content_substatus['unfinished'] += 1 - else: - content_substatus['finished'] += 1 - - if processing_status in [ProcessingStatus.SubFinished, ProcessingStatus.Finished, ProcessingStatus.Failed] and updated_contents: - self.logger.info("Processing %s is terminated, but there are still contents to be flushed. Waiting." % (proc.workload_id)) - # there are still polling contents, should not terminate the task. - processing_status = ProcessingStatus.Running - - if processing_status in [ProcessingStatus.SubFinished] and content_substatus['finished'] > 0 and content_substatus['unfinished'] == 0: - # found that a 'done' panda task has got a 'finished' status. Maybe in this case 'finished' is a transparent status. - if proc.polling_retries is None: - proc.polling_retries = 0 - - if processing_status in [ProcessingStatus.SubFinished, ProcessingStatus.Finished, ProcessingStatus.Failed]: - if proc.polling_retries is not None and proc.polling_retries < 3: - self.logger.info("processing %s polling_retries(%s) < 3, keep running" % (processing['processing_id'], proc.polling_retries)) - processing_status = ProcessingStatus.Running - proc.polling_retries += 1 - else: - proc.polling_retries = 0 - - if proc.in_operation_time(): - processing_status = ProcessingStatus.Running - - update_processing = {'processing_id': processing['processing_id'], - 'parameters': {'status': processing_status}} - if reset_expired_at: - processing['expired_at'] = None - update_processing['parameters']['expired_at'] = None - proc.polling_retries = 0 - # if (processing_status in [ProcessingStatus.SubFinished, ProcessingStatus.Finished, ProcessingStatus.Failed] - # or processing['status'] in [ProcessingStatus.Resuming]): # noqa W503 - # using polling_retries to poll it again when panda may update the status in a delay(when issuing retryTask, panda will not update it without any delay). 
- update_processing['parameters']['status'] = ProcessingStatus.Resuming - proc.status = update_processing['parameters']['status'] - - self.logger.debug("poll_processing_updates, task: %s, update_processing: %s" % - (proc.workload_id, str(update_processing))) - self.logger.debug("poll_processing_updates, task: %s, updated_contents[:100]: %s" % - (proc.workload_id, str(updated_contents[:100]))) - self.logger.debug("poll_processing_updates, task: %s, reactive_contents[:100]: %s" % - (proc.workload_id, str(reactive_contents[:100]))) - return update_processing, updated_contents + reactive_contents, {} + return processing_status, update_contents, {}, update_contents_full, {} def get_status_statistics(self, registered_input_output_maps): status_statistics = {} @@ -1239,24 +1093,26 @@ def syn_work_status(self, registered_input_output_maps, all_updates_flushed=True self.logger.debug("syn_work_status(%s): has_to_release_inputs: %s" % (str(self.get_processing_ids()), str(self.has_to_release_inputs()))) self.logger.debug("syn_work_status(%s): to_release_input_contents: %s" % (str(self.get_processing_ids()), str(to_release_input_contents))) - if self.is_processings_terminated() and self.is_input_collections_closed() and not self.has_new_inputs and not self.has_to_release_inputs() and not to_release_input_contents: + # if self.is_processings_terminated() and self.is_input_collections_closed() and not self.has_new_inputs and not self.has_to_release_inputs() and not to_release_input_contents: + if self.is_processings_terminated(): # if not self.is_all_outputs_flushed(registered_input_output_maps): if not all_updates_flushed: self.logger.warn("The work processings %s is terminated. but not all outputs are flushed. Wait to flush the outputs then finish the transform" % str(self.get_processing_ids())) return - keys = self.status_statistics.keys() - if len(keys) == 1: - if ContentStatus.Available.name in keys: - self.status = WorkStatus.Finished - else: - self.status = WorkStatus.Failed - else: + if self.is_processings_finished(): + self.status = WorkStatus.Finished + elif self.is_processings_subfinished(): self.status = WorkStatus.SubFinished + elif self.is_processings_failed(): + self.status = WorkStatus.Failed + elif self.is_processings_expired(): + self.status = WorkStatus.Expired + elif self.is_processings_cancelled(): + self.status = WorkStatus.Cancelled + elif self.is_processings_suspended(): + self.status = WorkStatus.Suspended elif self.is_processings_running(): self.status = WorkStatus.Running else: self.status = WorkStatus.Transforming - - if self.is_processings_started(): - self.started = True diff --git a/main/bin/run-idds b/main/bin/run-idds index 0ed63c44..aa1c0777 100755 --- a/main/bin/run-idds +++ b/main/bin/run-idds @@ -13,5 +13,9 @@ echo $SITE_PACKAGES_PATH # PROGRAM=${RootDir}/lib/python3.6/site-packages/idds/agents/main.py PROGRAM=${SITE_PACKAGES_PATH}/idds/agents/main.py +trap 'kill -TERM $PID; wait $PID' TERM + which python python $PROGRAM +PID=$! 
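# --- Illustrative sketch, not part of the patch ---
# The rewritten poll_processing_updates above returns a five-element tuple
# (processing_status, update_contents, {}, update_contents_full, {}); the two
# empty dicts are placeholders interpreted by the carrier-side caller, which is
# not included in this patch. A hypothetical caller unpacking it:
def poll_once(work, processing, input_output_maps, log_prefix=''):
    status, update_contents, new_maps, update_contents_full, params = \
        work.poll_processing_updates(processing, input_output_maps, log_prefix=log_prefix)
    return status, update_contents, update_contents_full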
+wait $PID diff --git a/main/etc/idds/idds.cfg.template b/main/etc/idds/idds.cfg.template index 875400f2..d140b4ef 100755 --- a/main/etc/idds/idds.cfg.template +++ b/main/etc/idds/idds.cfg.template @@ -112,3 +112,7 @@ plugin.receiver.destination = /queue/Consumer.monitor.atlas.idds plugin.receiver.username = atlasidds plugin.receiver.password = ***** plugin.receiver.broker_timeout = 10 + +[cache] +host = localhost +port = 6379 diff --git a/main/etc/idds/supervisord.d/idds.ini b/main/etc/idds/supervisord.d/idds.ini index bd807ea7..280a8d2b 100644 --- a/main/etc/idds/supervisord.d/idds.ini +++ b/main/etc/idds/supervisord.d/idds.ini @@ -5,7 +5,8 @@ environment = RUCIO_AUTH_TYPE=x509_proxy, X509_USER_PROXY=/data/atlpilo1/x509up ;command=/opt/idds/bin/run-idds -command=bash -c "source /etc/profile.d/conda.sh && conda activate /opt/idds && /opt/idds/bin/run-idds" +;command=bash -c "source /etc/profile.d/conda.sh && conda activate /opt/idds && /opt/idds/bin/run-idds" +command=bash -c "trap 'kill -TERM $PID; wait $PID' TERM && source /etc/profile.d/conda.sh && conda activate /opt/idds && /opt/idds/bin/run-idds && PID=$! && wait $PID" process_name=%(process_num)02d user=atlpan childlogdir=/var/log/idds diff --git a/main/etc/sql/oracle_update.sql b/main/etc/sql/oracle_update.sql new file mode 100644 index 00000000..276513b1 --- /dev/null +++ b/main/etc/sql/oracle_update.sql @@ -0,0 +1,110 @@ +-- 2022.08.23 +alter table REQUESTS add (oldstatus NUMBER(2)); +alter table REQUESTS add (new_retries NUMBER(5) DEFAULT 0); +alter table REQUESTS add (update_retries NUMBER(5) DEFAULT 0); +alter table REQUESTS add (max_new_retries NUMBER(5) DEFAULT 3); +alter table REQUESTS add (max_update_retries NUMBER(5) DEFAULT 0); +# alter table REQUESTS add (new_poll_period NUMBER(10) DEFAULT 10); +# alter table REQUESTS add (update_poll_period NUMBER(10) DEFAULT 10); +# alter table REQUESTS drop column new_poll_period +# alter table REQUESTS drop column update_poll_period +alter table REQUESTS add (new_poll_period INTERVAL DAY TO SECOND DEFAULT '00 00:00:01'); +alter table REQUESTS add (update_poll_period INTERVAL DAY TO SECOND DEFAULT '00 00:00:10'); + +alter table TRANSFORMS add (oldstatus NUMBER(2)); +alter table TRANSFORMS add (new_retries NUMBER(5) DEFAULT 0); +alter table TRANSFORMS add (update_retries NUMBER(5) DEFAULT 0); +alter table TRANSFORMS add (max_new_retries NUMBER(5) DEFAULT 3); +alter table TRANSFORMS add (max_update_retries NUMBER(5) DEFAULT 0); +#alter table TRANSFORMS add (new_poll_period NUMBER(10) DEFAULT 10); +#alter table TRANSFORMS add (update_poll_period NUMBER(10) DEFAULT 10); +alter table TRANSFORMS add (errors VARCHAR2(1024)); +# alter table TRANSFORMS drop column new_poll_period +# alter table TRANSFORMS drop column update_poll_period +alter table TRANSFORMS add (new_poll_period INTERVAL DAY TO SECOND DEFAULT '00 00:00:01'); +alter table TRANSFORMS add (update_poll_period INTERVAL DAY TO SECOND DEFAULT '00 00:00:10'); + + +alter table PROCESSINGS add (oldstatus NUMBER(2)); +alter table PROCESSINGS add (new_retries NUMBER(5) DEFAULT 0); +alter table PROCESSINGS add (update_retries NUMBER(5) DEFAULT 0); +alter table PROCESSINGS add (max_new_retries NUMBER(5) DEFAULT 3); +alter table PROCESSINGS add (max_update_retries NUMBER(5) DEFAULT 0); +#alter table PROCESSINGS add (new_poll_period NUMBER(10) DEFAULT 10); +#alter table PROCESSINGS add (update_poll_period NUMBER(10) DEFAULT 10); +alter table PROCESSINGS add (errors VARCHAR2(1024)); +# alter table PROCESSINGS drop column new_poll_period +# 
alter table PROCESSINGS drop column update_poll_period +alter table PROCESSINGS add (new_poll_period INTERVAL DAY TO SECOND DEFAULT '00 00:00:01'); +alter table PROCESSINGS add (update_poll_period INTERVAL DAY TO SECOND DEFAULT '00 00:00:10'); + + +alter table MESSAGES add (retries NUMBER(5) DEFAULT 0); + +-- oracle 11 +CREATE SEQUENCE COMMAND_ID_SEQ MINVALUE 1 INCREMENT BY 1 START WITH 1 NOCACHE NOORDER NOCYCLE; +CREATE TABLE COMMANDS +( + cmd_id NUMBER(12), + request_id NUMBER(12), + workload_id NUMBER(10), + transform_id NUMBER(12), + processing_id NUMBER(12), + cmd_type NUMBER(2), + status NUMBER(2), + substatus NUMBER(2), + locking NUMBER(2), + username VARCHAR2(20), + retries NUMBER(5) DEFAULT 0, + source NUMBER(2), + destination NUMBER(2), + num_contents NUMBER(7), + created_at DATE DEFAULT SYS_EXTRACT_UTC(systimestamp(0)), + updated_at DATE DEFAULT SYS_EXTRACT_UTC(systimestamp(0)), + cmd_content CLOB, + errors VARCHAR2(1024), + CONSTRAINT COMMANDS_PK PRIMARY KEY (cmd_id) -- USING INDEX LOCAL, +); + +CREATE OR REPLACE TRIGGER TRIG_COMMAND_ID + BEFORE INSERT + ON COMMANDS + FOR EACH ROW + BEGIN + :NEW.cmd_id := COMMAND_ID_SEQ.NEXTVAL ; + END; + / + +CREATE INDEX COMMANDS_TYPE_ST_IDX ON COMMANDS (cmd_type, status, destination, request_id); +CREATE INDEX COMMANDS_TYPE_ST_TF_IDX ON COMMANDS (cmd_type, status, destination, transform_id); +CREATE INDEX COMMANDS_TYPE_ST_PR_IDX ON COMMANDS (cmd_type, status, destination, processing_id); + +-- oracle 19 +CREATE SEQUENCE COMMAND_ID_SEQ MINVALUE 1 INCREMENT BY 1 START WITH 1 NOCACHE ORDER NOCYCLE GLOBAL; +CREATE TABLE COMMANDS +( + cmd_id NUMBER(12) DEFAULT ON NULL COMMAND_ID_SEQ.NEXTVAL constraint COMMAND_ID_NN NOT NULL, + request_id NUMBER(12), + workload_id NUMBER(10), + transform_id NUMBER(12), + processing_id NUMBER(12), + cmd_type NUMBER(2), + status NUMBER(2), + substatus NUMBER(2), + locking NUMBER(2), + username VARCHAR2(20), + retries NUMBER(5) DEFAULT 0, + source NUMBER(2), + destination NUMBER(2), + num_contents NUMBER(7), + created_at DATE DEFAULT SYS_EXTRACT_UTC(systimestamp(0)), + updated_at DATE DEFAULT SYS_EXTRACT_UTC(systimestamp(0)), + cmd_content CLOB, + errors VARCHAR2(1024), + CONSTRAINT COMMANDS_PK PRIMARY KEY (cmd_id) -- USING INDEX LOCAL, +); + +CREATE INDEX COMMANDS_TYPE_ST_IDX ON COMMANDS (cmd_type, status, destination, request_id); +CREATE INDEX COMMANDS_TYPE_ST_TF_IDX ON COMMANDS (cmd_type, status, destination, transform_id); +CREATE INDEX COMMANDS_TYPE_ST_PR_IDX ON COMMANDS (cmd_type, status, destination, processing_id); + diff --git a/main/lib/idds/agents/carrier/carrier.py b/main/lib/idds/agents/carrier/carrier.py deleted file mode 100644 index 4a731153..00000000 --- a/main/lib/idds/agents/carrier/carrier.py +++ /dev/null @@ -1,585 +0,0 @@ -#!/usr/bin/env python -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0OA -# -# Authors: -# - Wen Guan, , 2019 - 2021 - -import datetime -import time -import traceback -try: - # python 3 - from queue import Queue -except ImportError: - # Python 2 - from Queue import Queue - -from idds.common import exceptions -from idds.common.constants import (Sections, ProcessingStatus, ProcessingLocking, - MessageStatus, ContentStatus, ContentType, - ContentRelationType) -from idds.common.utils import setup_logging -from idds.core import (transforms as core_transforms, - processings as core_processings) -from idds.agents.common.baseagent import BaseAgent - -setup_logging(__name__) - - -class Carrier(BaseAgent): - """ - Carrier works to submit and running tasks to WFMS. - """ - - def __init__(self, num_threads=1, poll_time_period=10, retrieve_bulk_size=None, - message_bulk_size=1000, **kwargs): - super(Carrier, self).__init__(num_threads=num_threads, **kwargs) - self.config_section = Sections.Carrier - self.poll_time_period = int(poll_time_period) - self.retrieve_bulk_size = int(retrieve_bulk_size) - self.message_bulk_size = int(message_bulk_size) - - self.new_task_queue = Queue() - self.new_output_queue = Queue() - self.running_task_queue = Queue() - self.running_output_queue = Queue() - self.new_processing_size = 0 - self.running_processing_size = 0 - - def show_queue_size(self): - q_str = "new queue size: %s, processing size: %s, output queue size: %s, " % (self.new_task_queue.qsize(), - self.new_processing_size, - self.new_output_queue.qsize()) - q_str += "running queue size: %s, processing size: %s, output queue size: %s" % (self.running_task_queue.qsize(), - self.running_processing_size, - self.running_output_queue.qsize()) - self.logger.debug(q_str) - - def init(self): - status = [ProcessingStatus.New, ProcessingStatus.Submitting, ProcessingStatus.Submitted, - ProcessingStatus.Running, ProcessingStatus.FinishedOnExec] - core_processings.clean_next_poll_at(status) - - def get_new_processings(self): - """ - Get new processing - """ - try: - if self.new_task_queue.qsize() > 0 or self.new_output_queue.qsize() > 0: - return [] - - self.show_queue_size() - - processing_status = [ProcessingStatus.New] - processings = core_processings.get_processings_by_status(status=processing_status, locking=True, bulk_size=self.retrieve_bulk_size) - - self.logger.debug("Main thread get %s [new] processings to process" % len(processings)) - if processings: - self.logger.info("Main thread get %s [new] processings to process" % len(processings)) - return processings - except exceptions.DatabaseException as ex: - if 'ORA-00060' in str(ex): - self.logger.warn("(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") - else: - # raise ex - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - return [] - - def process_new_processing(self, processing): - try: - # transform_id = processing['transform_id'] - # transform = core_transforms.get_transform(transform_id=transform_id) - # work = transform['transform_metadata']['work'] - proc = processing['processing_metadata']['processing'] - work = proc.work - work.set_agent_attributes(self.agent_attributes, processing) - - work.submit_processing(processing) - ret = {'processing_id': processing['processing_id'], - 'status': ProcessingStatus.Submitting, - 'next_poll_at': datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_time_period), - # 'expired_at': work.get_expired_at(processing), - 'processing_metadata': 
processing['processing_metadata']} - if proc.submitted_at: - if not processing['submitted_at'] or processing['submitted_at'] < proc.submitted_at: - ret['submitted_at'] = proc.submitted_at - - # if processing['processing_metadata'] and 'processing' in processing['processing_metadata']: - if proc.workload_id: - ret['workload_id'] = proc.workload_id - except Exception as ex: - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - ret = {'processing_id': processing['processing_id'], - 'status': ProcessingStatus.New, - 'next_poll_at': datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_time_period * 4)} - return ret - - def process_new_processings(self): - ret = [] - while not self.new_task_queue.empty(): - try: - processing = self.new_task_queue.get() - if processing: - self.new_processing_size += 1 - self.logger.info("Main thread processing new processing: %s" % processing) - ret_processing = self.process_new_processing(processing) - self.new_processing_size -= 1 - if ret_processing: - # ret.append(ret_processing) - self.new_output_queue.put(ret_processing) - except Exception as ex: - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - return ret - - def finish_new_processings(self): - while not self.new_output_queue.empty(): - try: - processing = self.new_output_queue.get() - self.logger.info("Main thread submitted new processing: %s" % (processing['processing_id'])) - processing_id = processing['processing_id'] - if 'next_poll_at' not in processing: - processing['next_poll_at'] = datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_time_period) - del processing['processing_id'] - processing['locking'] = ProcessingLocking.Idle - # self.logger.debug("wen: %s" % str(processing)) - - retry = True - retry_num = 0 - while retry: - retry = False - retry_num += 1 - try: - core_processings.update_processing(processing_id=processing_id, parameters=processing) - except exceptions.DatabaseException as ex: - if 'ORA-00060' in str(ex): - self.logger.warn("(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") - if retry_num < 5: - retry = True - time.sleep(60 * retry_num * 2) - else: - raise ex - else: - # self.logger.error(ex) - # self.logger.error(traceback.format_exc()) - raise ex - except Exception as ex: - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - try: - parameters = {'status': ProcessingStatus.Running, - 'next_poll_at': datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_time_period * 4)} - core_processings.update_processing(processing_id=processing_id, parameters=parameters) - except Exception as ex: - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - - def get_running_processings(self): - """ - Get running processing - """ - try: - if self.running_task_queue.qsize() > 0 or self.running_output_queue.qsize() > 0: - return [] - - self.show_queue_size() - - processing_status = [ProcessingStatus.Submitting, ProcessingStatus.Submitted, - ProcessingStatus.Running, ProcessingStatus.FinishedOnExec, - ProcessingStatus.ToCancel, ProcessingStatus.Cancelling, - ProcessingStatus.ToSuspend, ProcessingStatus.Suspending, - ProcessingStatus.ToResume, ProcessingStatus.Resuming, - ProcessingStatus.ToExpire, ProcessingStatus.Expiring, - ProcessingStatus.ToFinish, ProcessingStatus.ToForceFinish] - processings = core_processings.get_processings_by_status(status=processing_status, - # time_period=self.poll_time_period, - locking=True, - with_messaging=True, - 
bulk_size=self.retrieve_bulk_size) - - self.logger.debug("Main thread get %s [submitting + submitted + running] processings to process: %s" % (len(processings), str([processing['processing_id'] for processing in processings]))) - if processings: - self.logger.info("Main thread get %s [submitting + submitted + running] processings to process: %s" % (len(processings), str([processing['processing_id'] for processing in processings]))) - return processings - except exceptions.DatabaseException as ex: - if 'ORA-00060' in str(ex): - self.logger.warn("(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") - else: - # raise ex - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - return [] - - def get_collection_ids(self, collections): - coll_ids = [] - for coll in collections: - coll_ids.append(coll.coll_id) - return coll_ids - - def get_new_contents(self, processing, new_input_output_maps): - new_input_contents, new_output_contents, new_log_contents = [], [], [] - new_input_dependency_contents = [] - for map_id in new_input_output_maps: - inputs = new_input_output_maps[map_id]['inputs'] if 'inputs' in new_input_output_maps[map_id] else [] - inputs_dependency = new_input_output_maps[map_id]['inputs_dependency'] if 'inputs_dependency' in new_input_output_maps[map_id] else [] - outputs = new_input_output_maps[map_id]['outputs'] if 'outputs' in new_input_output_maps[map_id] else [] - logs = new_input_output_maps[map_id]['logs'] if 'logs' in new_input_output_maps[map_id] else [] - - for input_content in inputs: - content = {'transform_id': processing['transform_id'], - 'coll_id': input_content['coll_id'], - 'request_id': processing['request_id'], - 'workload_id': processing['workload_id'], - 'map_id': map_id, - 'scope': input_content['scope'], - 'name': input_content['name'], - 'min_id': input_content['min_id'] if 'min_id' in input_content else 0, - 'max_id': input_content['max_id'] if 'max_id' in input_content else 0, - 'status': input_content['status'] if 'status' in input_content and input_content['status'] is not None else ContentStatus.New, - 'substatus': input_content['substatus'] if 'substatus' in input_content and input_content['substatus'] is not None else ContentStatus.New, - 'path': input_content['path'] if 'path' in input_content else None, - 'content_type': input_content['content_type'] if 'content_type' in input_content else ContentType.File, - 'content_relation_type': ContentRelationType.Input, - 'bytes': input_content['bytes'], - 'adler32': input_content['adler32'], - 'content_metadata': input_content['content_metadata']} - if content['min_id'] is None: - content['min_id'] = 0 - if content['max_id'] is None: - content['max_id'] = 0 - new_input_contents.append(content) - for input_content in inputs_dependency: - content = {'transform_id': processing['transform_id'], - 'coll_id': input_content['coll_id'], - 'request_id': processing['request_id'], - 'workload_id': processing['workload_id'], - 'map_id': map_id, - 'scope': input_content['scope'], - 'name': input_content['name'], - 'min_id': input_content['min_id'] if 'min_id' in input_content else 0, - 'max_id': input_content['max_id'] if 'max_id' in input_content else 0, - 'status': input_content['status'] if 'status' in input_content and input_content['status'] is not None else ContentStatus.New, - 'substatus': input_content['substatus'] if 'substatus' in input_content and input_content['substatus'] is not None else ContentStatus.New, - 'path': input_content['path'] if 'path' in input_content 
else None, - 'content_type': input_content['content_type'] if 'content_type' in input_content else ContentType.File, - 'content_relation_type': ContentRelationType.InputDependency, - 'bytes': input_content['bytes'], - 'adler32': input_content['adler32'], - 'content_metadata': input_content['content_metadata']} - if content['min_id'] is None: - content['min_id'] = 0 - if content['max_id'] is None: - content['max_id'] = 0 - new_input_dependency_contents.append(content) - for output_content in outputs: - content = {'transform_id': processing['transform_id'], - 'coll_id': output_content['coll_id'], - 'request_id': processing['request_id'], - 'workload_id': processing['workload_id'], - 'map_id': map_id, - 'scope': output_content['scope'], - 'name': output_content['name'], - 'min_id': output_content['min_id'] if 'min_id' in output_content else 0, - 'max_id': output_content['max_id'] if 'max_id' in output_content else 0, - 'status': ContentStatus.New, - 'substatus': ContentStatus.New, - 'path': output_content['path'] if 'path' in output_content else None, - 'content_type': output_content['content_type'] if 'content_type' in output_content else ContentType.File, - 'content_relation_type': ContentRelationType.Output, - 'bytes': output_content['bytes'], - 'adler32': output_content['adler32'], - 'content_metadata': output_content['content_metadata']} - if content['min_id'] is None: - content['min_id'] = 0 - if content['max_id'] is None: - content['max_id'] = 0 - new_output_contents.append(content) - for log_content in logs: - content = {'transform_id': processing['transform_id'], - 'coll_id': log_content['coll_id'], - 'request_id': processing['request_id'], - 'workload_id': processing['workload_id'], - 'map_id': map_id, - 'scope': log_content['scope'], - 'name': log_content['name'], - 'min_id': log_content['min_id'] if 'min_id' in log_content else 0, - 'max_id': log_content['max_id'] if 'max_id' in log_content else 0, - 'status': ContentStatus.New, - 'substatus': ContentStatus.New, - 'path': log_content['path'] if 'path' in log_content else None, - 'content_type': log_content['content_type'] if 'content_type' in log_content else ContentType.File, - 'content_relation_type': ContentRelationType.Log, - 'bytes': log_content['bytes'], - 'adler32': log_content['adler32'], - 'content_metadata': log_content['content_metadata']} - if content['min_id'] is None: - content['min_id'] = 0 - if content['max_id'] is None: - content['max_id'] = 0 - new_output_contents.append(content) - return new_input_contents + new_output_contents + new_log_contents + new_input_dependency_contents - - def process_running_processing(self, processing): - try: - transform_id = processing['transform_id'] - # transform = core_transforms.get_transform(transform_id=transform_id) - # work = transform['transform_metadata']['work'] - # work = processing['processing_metadata']['work'] - # work.set_agent_attributes(self.agent_attributes) - if 'processing' not in processing['processing_metadata']: - raise exceptions.ProcessFormatNotSupported - - proc = processing['processing_metadata']['processing'] - work = proc.work - work.set_agent_attributes(self.agent_attributes, processing) - - input_collections = work.get_input_collections() - output_collections = work.get_output_collections() - log_collections = work.get_log_collections() - - input_coll_ids = self.get_collection_ids(input_collections) - output_coll_ids = self.get_collection_ids(output_collections) - log_coll_ids = self.get_collection_ids(log_collections) - - input_output_maps = 
core_transforms.get_transform_input_output_maps(transform_id, - input_coll_ids=input_coll_ids, - output_coll_ids=output_coll_ids, - log_coll_ids=log_coll_ids) - - # processing_substatus = None - is_operation = False - if processing['status'] in [ProcessingStatus.ToCancel]: - work.abort_processing(processing) - is_operation = True - # processing_substatus = ProcessingStatus.Cancelling - if processing['status'] in [ProcessingStatus.ToSuspend]: - work.suspend_processing(processing) - is_operation = True - # processing_substatus = ProcessingStatus.Suspending - if processing['status'] in [ProcessingStatus.ToResume]: - work.resume_processing(processing) - is_operation = True - # processing_substatus = ProcessingStatus.Resuming - if processing['status'] in [ProcessingStatus.ToExpire]: - work.expire_processing(processing) - is_operation = True - # processing_substatus = ProcessingStatus.Expiring - if processing['status'] in [ProcessingStatus.ToFinish]: - work.finish_processing(processing) - is_operation = True - # processing_substatus = ProcessingStatus.Running - if processing['status'] in [ProcessingStatus.ToForceFinish]: - work.finish_processing(processing, forcing=True) - is_operation = True - # processing_substatus = ProcessingStatus.Running - - # work = processing['processing_metadata']['work'] - # outputs = work.poll_processing() - processing_update, content_updates, new_input_output_maps = work.poll_processing_updates(processing, input_output_maps) - new_contents = self.get_new_contents(processing, new_input_output_maps) - if processing_update: - processing_update['parameters']['locking'] = ProcessingLocking.Idle - else: - processing_update = {'processing_id': processing['processing_id'], - 'parameters': {'locking': ProcessingLocking.Idle}} - - # if processing_substatus: - # processing_update['parameters']['substatus'] = processing_substatus - - if not is_operation: - next_poll_at = datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_time_period) - else: - if processing['status'] in [ProcessingStatus.ToResume]: - next_poll_at = datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_operation_time_period * 5) - else: - next_poll_at = datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_operation_time_period) - - if proc.submitted_at: - if not processing['submitted_at'] or processing['submitted_at'] < proc.submitted_at: - processing_update['parameters']['submitted_at'] = proc.submitted_at - - if proc.workload_id: - processing_update['parameters']['workload_id'] = proc.workload_id - - processing_update['parameters']['next_poll_at'] = next_poll_at - # processing_update['parameters']['expired_at'] = work.get_expired_at(processing) - processing_update['parameters']['processing_metadata'] = processing['processing_metadata'] - - ret = {'processing_update': processing_update, - 'content_updates': content_updates, - 'new_contents': new_contents} - - except exceptions.ProcessFormatNotSupported as ex: - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - processing_update = {'processing_id': processing['processing_id'], - 'parameters': {'status': ProcessingStatus.Failed, - 'locking': ProcessingLocking.Idle, - 'next_poll_at': datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_time_period * 4)}} - ret = {'processing_update': processing_update, - 'content_updates': []} - except Exception as ex: - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - processing_update = {'processing_id': processing['processing_id'], - 
'parameters': {'status': ProcessingStatus.Running, - 'locking': ProcessingLocking.Idle, - 'next_poll_at': datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_time_period * 4)}} - ret = {'processing_update': processing_update, - 'content_updates': []} - return ret - - def process_running_processing_message(self, processing, messages): - """ - process running processing message - """ - try: - self.logger.info("process_running_processing_message: processing_id: %s, messages: %s" % (processing['processing_id'], str(messages) if messages else messages)) - msg = messages[0] - message = messages[0]['msg_content'] - if message['command'] == 'update_processing': - parameters = message['parameters'] - parameters['locking'] = ProcessingLocking.Idle - processing_update = {'processing_id': processing['processing_id'], - 'parameters': parameters, - } - update_messages = [{'msg_id': msg['msg_id'], 'status': MessageStatus.Delivered}] - else: - self.logger.error("Unknown message: %s" % str(msg)) - processing_update = {'processing_id': processing['processing_id'], - 'parameters': {'locking': ProcessingLocking.Idle} - } - update_messages = [{'msg_id': msg['msg_id'], 'status': MessageStatus.Failed}] - - ret = {'processing_update': processing_update, - 'content_updates': [], - 'update_messages': update_messages} - except Exception as ex: - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - processing_update = {'processing_id': processing['processing_id'], - 'parameters': {'status': ProcessingStatus.Failed, - 'locking': ProcessingLocking.Idle, - 'errors': {'msg': '%s: %s' % (ex, traceback.format_exc())}}} - ret = {'processing_update': processing_update, - 'content_updates': []} - return ret - - def process_running_processings(self): - ret = [] - while not self.running_task_queue.empty(): - try: - processing = self.running_task_queue.get() - if processing: - self.running_processing_size += 1 - self.logger.debug("Main thread processing running processing: %s" % processing) - self.logger.info("Main thread processing running processing: %s" % processing['processing_id']) - - msgs = self.get_processing_message(processing_id=processing['processing_id'], bulk_size=1) - if msgs: - ret_processing = self.process_running_processing_message(processing, msgs) - else: - ret_processing = self.process_running_processing(processing) - self.running_processing_size -= 1 - if ret_processing: - # ret.append(ret_processing) - self.running_output_queue.put(ret_processing) - except Exception as ex: - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - return ret - - def finish_running_processings(self): - while not self.running_output_queue.empty(): - try: - processing = self.running_output_queue.get() - if processing: - self.logger.info("Main thread processing(processing_id: %s) updates: %s" % (processing['processing_update']['processing_id'], - processing['processing_update']['parameters'])) - - # self.logger.info("Main thread finishing running processing %s" % str(processing)) - - retry = True - retry_num = 0 - while retry: - retry = False - retry_num += 1 - try: - core_processings.update_processing_contents(processing_update=processing.get('processing_update', None), - content_updates=processing.get('content_updates', None), - update_messages=processing.get('update_messages', None), - new_contents=processing.get('new_contents', None)) - except exceptions.DatabaseException as ex: - if 'ORA-00060' in str(ex): - self.logger.warn("(cx_Oracle.DatabaseError) ORA-00060: deadlock detected 
while waiting for resource") - if retry_num < 5: - retry = True - time.sleep(60 * retry_num * 2) - else: - raise ex - else: - # self.logger.error(ex) - # self.logger.error(traceback.format_exc()) - raise ex - except Exception as ex: - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - try: - parameters = {'status': ProcessingStatus.Running, - 'next_poll_at': datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_time_period * 4)} - core_processings.update_processing(processing_id=processing['processing_update']['processing_id'], parameters=parameters) - except Exception as ex: - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - - def clean_locks(self): - self.logger.info("clean locking") - core_processings.clean_locking() - - def run(self): - """ - Main run function. - """ - try: - self.logger.info("Starting main thread") - - self.load_plugins() - self.init() - - self.add_default_tasks() - - task = self.create_task(task_func=self.get_new_processings, task_output_queue=self.new_task_queue, task_args=tuple(), task_kwargs={}, delay_time=60, priority=1) - self.add_task(task) - for _ in range(self.num_threads): - # task = self.create_task(task_func=self.process_new_processings, task_output_queue=self.new_output_queue, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) - task = self.create_task(task_func=self.process_new_processings, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) - self.add_task(task) - task = self.create_task(task_func=self.finish_new_processings, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) - self.add_task(task) - - task = self.create_task(task_func=self.get_running_processings, task_output_queue=self.running_task_queue, task_args=tuple(), task_kwargs={}, delay_time=60, priority=1) - self.add_task(task) - for _ in range(self.num_threads): - # task = self.create_task(task_func=self.process_running_processings, task_output_queue=self.running_output_queue, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) - task = self.create_task(task_func=self.process_running_processings, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) - self.add_task(task) - task = self.create_task(task_func=self.finish_running_processings, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) - self.add_task(task) - - task = self.create_task(task_func=self.clean_locks, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=1800, priority=1) - self.add_task(task) - - self.execute() - except KeyboardInterrupt: - self.stop() - - -if __name__ == '__main__': - agent = Carrier() - agent() diff --git a/main/lib/idds/agents/carrier/finisher.py b/main/lib/idds/agents/carrier/finisher.py new file mode 100644 index 00000000..6d3bfbd3 --- /dev/null +++ b/main/lib/idds/agents/carrier/finisher.py @@ -0,0 +1,333 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2019 - 2022 + +import traceback + +from idds.common.constants import (Sections, ProcessingStatus, ProcessingLocking) +from idds.common.utils import setup_logging, truncate_string +from idds.agents.common.eventbus.event import (EventType, + UpdateProcessingEvent, + UpdateTransformEvent) + +from .utils import (handle_abort_processing, + handle_resume_processing, + is_process_terminated, + sync_processing) +from .poller import Poller + +setup_logging(__name__) + + +class Finisher(Poller): + """ + Finisher works to submit and running tasks to WFMS. + """ + + def __init__(self, num_threads=1, poll_time_period=10, retries=3, retrieve_bulk_size=2, + message_bulk_size=1000, **kwargs): + super(Finisher, self).__init__(num_threads=num_threads, name='Finisher', + poll_time_period=poll_time_period, retries=retries, + retrieve_bulk_size=retrieve_bulk_size, + message_bulk_size=message_bulk_size, **kwargs) + self.config_section = Sections.Carrier + self.poll_time_period = int(poll_time_period) + self.retries = int(retries) + + if hasattr(self, 'finisher_max_number_workers'): + self.max_number_workers = int(self.finisher_max_number_workers) + + def show_queue_size(self): + q_str = "number of processings: %s, max number of processings: %s" % (self.number_workers, self.max_number_workers) + self.logger.debug(q_str) + + def handle_sync_processing(self, processing, log_prefix=""): + """ + process terminated processing + """ + try: + processing, update_collections, messages = sync_processing(processing, self.agent_attributes, logger=self.logger, log_prefix=log_prefix) + + update_processing = {'processing_id': processing['processing_id'], + 'parameters': {'status': processing['status'], + 'locking': ProcessingLocking.Idle}} + ret = {'update_processing': update_processing, + 'update_collections': update_collections, + 'messages': messages} + return ret + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + error = {'sync_err': {'msg': truncate_string('%s' % (ex), length=200)}} + update_processing = {'processing_id': processing['processing_id'], + 'parameters': {'status': ProcessingStatus.Running, + 'locking': ProcessingLocking.Idle, + 'errors': processing['errors'] if processing['errors'] else {}}} + update_processing['parameters']['errors'].update(error) + ret = {'update_processing': update_processing} + return ret + return None + + def process_sync_processing(self, event): + self.number_workers += 1 + try: + if event: + self.logger.info("process_sync_processing: event: %s" % event) + pr = self.get_processing(processing_id=event._processing_id, locking=True) + if not pr: + self.logger.error("Cannot find processing for event: %s" % str(event)) + else: + log_pre = self.get_log_prefix(pr) + + self.logger.info(log_pre + "process_sync_processing") + ret = self.handle_sync_processing(pr, log_prefix=log_pre) + self.logger.info(log_pre + "process_sync_processing result: %s" % str(ret)) + + self.update_processing(ret, pr) + + # no need to update transform + # self.logger.info(log_pre + "UpdateTransformEvent(transform_id: %s)" % pr['transform_id']) + # event = UpdateTransformEvent(publisher_id=self.id, transform_id=pr['transform_id']) + # self.event_bus.send(event) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + self.number_workers -= 1 + + def handle_terminated_processing(self, processing, log_prefix=""): + """ 
+ process terminated processing + """ + try: + processing, update_collections, messages = sync_processing(processing, self.agent_attributes, terminate=True, logger=self.logger, log_prefix=log_prefix) + + update_processing = {'processing_id': processing['processing_id'], + 'parameters': {'status': processing['status'], + 'locking': ProcessingLocking.Idle}} + ret = {'update_processing': update_processing, + 'update_collections': update_collections, + 'messages': messages} + + return ret + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + error = {'term_err': {'msg': truncate_string('%s' % (ex), length=200)}} + update_processing = {'processing_id': processing['processing_id'], + 'parameters': {'status': ProcessingStatus.Running, + 'locking': ProcessingLocking.Idle, + 'errors': processing['errors'] if processing['errors'] else {}}} + update_processing['parameters']['errors'].update(error) + ret = {'update_processing': update_processing} + return ret + return None + + def process_terminated_processing(self, event): + self.number_workers += 1 + try: + if event: + pr = self.get_processing(processing_id=event._processing_id, locking=True) + if not pr: + self.logger.error("Cannot find processing for event: %s" % str(event)) + else: + log_pre = self.get_log_prefix(pr) + + self.logger.info(log_pre + "process_terminated_processing") + ret = self.handle_terminated_processing(pr, log_prefix=log_pre) + self.logger.info(log_pre + "process_terminated_processing result: %s" % str(ret)) + + if pr['status'] == ProcessingStatus.Terminating and is_process_terminated(pr['substatus']): + pr['status'] = pr['substatus'] + + self.update_processing(ret, pr) + self.logger.info(log_pre + "UpdateTransformEvent(transform_id: %s)" % pr['transform_id']) + event = UpdateTransformEvent(publisher_id=self.id, transform_id=pr['transform_id']) + self.event_bus.send(event) + + if pr['status'] not in [ProcessingStatus.Finished, ProcessingStatus.Failed, ProcessingStatus.SubFinished]: + # some files are missing, poll it. 
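# The branch above re-polls a processing that already terminated on the workload
# manager side but is not yet fully finished in iDDS (for example when some output
# files are still missing). An illustrative, self-contained sketch of that decision,
# using a plain set of terminal state names rather than the real ProcessingStatus enum:
TERMINAL_STATES = {'Finished', 'SubFinished', 'Failed'}

def needs_repoll(status):
    # Anything outside the terminal set should get another UpdateProcessingEvent.
    return status not in TERMINAL_STATES

# needs_repoll('Running') -> True, needs_repoll('Finished') -> False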
+ self.logger.info(log_pre + "UpdateProcessingEvent(processing_id: %s)" % pr['processing_id']) + event = UpdateProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id']) + self.event_bus.send(event) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + self.number_workers -= 1 + + def handle_abort_processing(self, processing, log_prefix=""): + """ + process abort processing + """ + try: + processing, update_collections, update_contents = handle_abort_processing(processing, self.agent_attributes, logger=self.logger, log_prefix=log_prefix) + + update_processing = {'processing_id': processing['processing_id'], + 'parameters': {'status': processing['status'], + 'locking': ProcessingLocking.Idle}} + ret = {'update_processing': update_processing, + 'update_collections': update_collections, + 'update_contents': update_contents, + } + return ret + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + error = {'abort_err': {'msg': truncate_string('%s' % (ex), length=200)}} + update_processing = {'processing_id': processing['processing_id'], + 'parameters': {'status': ProcessingStatus.ToCancel, + 'locking': ProcessingLocking.Idle, + 'errors': processing['errors'] if processing['errors'] else {}}} + update_processing['parameters']['errors'].update(error) + ret = {'update_processing': update_processing} + return ret + return None + + def process_abort_processing(self, event): + self.number_workers += 1 + try: + if event: + processing_status = [ProcessingStatus.Finished, ProcessingStatus.Failed, + ProcessingStatus.Lost, ProcessingStatus.Cancelled, + ProcessingStatus.Suspended, ProcessingStatus.Expired, + ProcessingStatus.Broken] + + pr = self.get_processing(processing_id=event._processing_id, locking=True) + + if not pr: + self.logger.error("Cannot find processing for event: %s" % str(event)) + else: + log_pre = self.get_log_prefix(pr) + self.logger.info(log_pre + "process_abort_processing") + + if pr and pr['status'] in processing_status: + update_processing = {'processing_id': pr['processing_id'], + 'parameters': {'locking': ProcessingLocking.Idle, + 'errors': {'abort_err': {'msg': truncate_string("Processing is already terminated. 
Cannot be aborted", length=200)}}}} + ret = {'update_processing': update_processing} + self.logger.info(log_pre + "process_abort_processing result: %s" % str(ret)) + self.update_processing(ret, pr) + elif pr: + ret = self.handle_abort_processing(pr, log_prefix=log_pre) + self.logger.info(log_pre + "process_abort_processing result: %s" % str(ret)) + self.update_processing(ret, pr) + self.logger.info(log_pre + "UpdateTransformEvent(transform_id: %s)" % pr['transform_id']) + event = UpdateTransformEvent(publisher_id=self.id, transform_id=pr['transform_id'], content=event._content) + self.event_bus.send(event) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + self.number_workers -= 1 + + def handle_resume_processing(self, processing, log_prefix=""): + """ + process resume processing + """ + try: + processing, update_collections, update_contents = handle_resume_processing(processing, self.agent_attributes, logger=self.logger, log_prefix=log_prefix) + + update_processing = {'processing_id': processing['processing_id'], + 'parameters': {'status': processing['status'], + 'locking': ProcessingLocking.Idle}} + ret = {'update_processing': update_processing, + 'update_collections': update_collections, + 'update_contents': update_contents, + } + return ret + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + error = {'resume_err': {'msg': truncate_string('%s' % (ex), length=200)}} + update_processing = {'processing_id': processing['processing_id'], + 'parameters': {'status': ProcessingStatus.ToResume, + 'locking': ProcessingLocking.Idle, + 'errors': processing['errors'] if processing['errors'] else {}}} + update_processing['parameters']['errors'].update(error) + ret = {'update_processing': update_processing} + return ret + return None + + def process_resume_processing(self, event): + self.number_workers += 1 + try: + if event: + processing_status = [ProcessingStatus.Finished] + + pr = self.get_processing(processing_id=event._processing_id, locking=True) + + if not pr: + self.logger.error("Cannot find processing for event: %s" % str(event)) + else: + log_pre = self.get_log_prefix(pr) + self.logger.info(log_pre + "process_resume_processing") + + if pr and pr['status'] in processing_status: + update_processing = {'processing_id': pr['processing_id'], + 'parameters': {'locking': ProcessingLocking.Idle, + 'errors': {'abort_err': {'msg': truncate_string("Processing has already finished. 
Cannot be resumed", length=200)}}}} + ret = {'update_processing': update_processing} + + self.logger.info(log_pre + "process_resume_processing result: %s" % str(ret)) + + self.update_processing(ret, pr) + elif pr: + ret = self.handle_resume_processing(pr, log_prefix=log_pre) + self.logger.info(log_pre + "process_resume_processing result: %s" % str(ret)) + + self.update_processing(ret, pr) + + self.logger.info(log_pre + "UpdateTransformEvent(transform_id: %s)" % pr['transform_id']) + event = UpdateTransformEvent(publisher_id=self.id, transform_id=pr['transform_id'], content=event._content) + self.event_bus.send(event) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + self.number_workers -= 1 + + def init_event_function_map(self): + self.event_func_map = { + EventType.SyncProcessing: { + 'pre_check': self.is_ok_to_run_more_processings, + 'exec_func': self.process_sync_processing + }, + EventType.TerminatedProcessing: { + 'pre_check': self.is_ok_to_run_more_processings, + 'exec_func': self.process_terminated_processing + }, + EventType.AbortProcessing: { + 'pre_check': self.is_ok_to_run_more_processings, + 'exec_func': self.process_abort_processing + }, + EventType.ResumeProcessing: { + 'pre_check': self.is_ok_to_run_more_processings, + 'exec_func': self.process_resume_processing + } + } + + def run(self): + """ + Main run function. + """ + try: + self.logger.info("Starting main thread") + + self.load_plugins() + self.init() + + self.add_default_tasks() + + self.init_event_function_map() + + self.execute() + except KeyboardInterrupt: + self.stop() + + +if __name__ == '__main__': + agent = Finisher() + agent() diff --git a/main/lib/idds/agents/carrier/poller.py b/main/lib/idds/agents/carrier/poller.py new file mode 100644 index 00000000..e83fc377 --- /dev/null +++ b/main/lib/idds/agents/carrier/poller.py @@ -0,0 +1,394 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2019 - 2022 + +import datetime +import random +import time +import traceback + +from idds.common import exceptions +from idds.common.constants import Sections, ProcessingStatus, ProcessingLocking +from idds.common.utils import setup_logging, truncate_string +from idds.core import processings as core_processings +from idds.agents.common.baseagent import BaseAgent +from idds.agents.common.eventbus.event import (EventType, + UpdateProcessingEvent, + TriggerProcessingEvent, + SyncProcessingEvent, + TerminatedProcessingEvent) + +from .utils import handle_update_processing, is_process_terminated + +setup_logging(__name__) + + +class Poller(BaseAgent): + """ + Poller works to submit and running tasks to WFMS. 
+ """ + + def __init__(self, num_threads=1, poll_period=10, retries=3, retrieve_bulk_size=2, + name='Poller', message_bulk_size=1000, **kwargs): + super(Poller, self).__init__(num_threads=num_threads, name=name, **kwargs) + self.config_section = Sections.Carrier + self.poll_period = int(poll_period) + self.retries = int(retries) + self.retrieve_bulk_size = int(retrieve_bulk_size) + self.message_bulk_size = int(message_bulk_size) + + if not hasattr(self, 'new_poll_period') or not self.new_poll_period: + self.new_poll_period = self.poll_period + else: + self.new_poll_period = int(self.new_poll_period) + if not hasattr(self, 'update_poll_period') or not self.update_poll_period: + self.update_poll_period = self.poll_period + else: + self.update_poll_period = int(self.update_poll_period) + + if hasattr(self, 'poll_period_increase_rate'): + self.poll_period_increase_rate = float(self.poll_period_increase_rate) + else: + self.poll_period_increase_rate = 2 + + if hasattr(self, 'max_new_poll_period'): + self.max_new_poll_period = int(self.max_new_poll_period) + else: + self.max_new_poll_period = 3600 * 6 + if hasattr(self, 'max_update_poll_period'): + self.max_update_poll_period = int(self.max_update_poll_period) + else: + self.max_update_poll_period = 3600 * 6 + + self.number_workers = 0 + if not hasattr(self, 'max_number_workers') or not self.max_number_workers: + self.max_number_workers = 3 + else: + self.max_number_workers = int(self.max_number_workers) + + def is_ok_to_run_more_processings(self): + if self.number_workers >= self.max_number_workers: + return False + return True + + def show_queue_size(self): + if self.number_workers > 0: + q_str = "number of processings: %s, max number of processings: %s" % (self.number_workers, self.max_number_workers) + self.logger.debug(q_str) + + def init(self): + status = [ProcessingStatus.New, ProcessingStatus.Submitting, ProcessingStatus.Submitted, + ProcessingStatus.Running, ProcessingStatus.FinishedOnExec] + core_processings.clean_next_poll_at(status) + + def get_running_processings(self): + """ + Get running processing + """ + try: + if not self.is_ok_to_run_more_processings(): + return [] + + self.show_queue_size() + + processing_status = [ProcessingStatus.Submitting, ProcessingStatus.Submitted, + ProcessingStatus.Running, ProcessingStatus.FinishedOnExec, + ProcessingStatus.ToCancel, ProcessingStatus.Cancelling, + ProcessingStatus.ToSuspend, ProcessingStatus.Suspending, + ProcessingStatus.ToResume, ProcessingStatus.Resuming, + ProcessingStatus.ToExpire, ProcessingStatus.Expiring, + ProcessingStatus.ToFinish, ProcessingStatus.ToForceFinish, + ProcessingStatus.Terminating] + # next_poll_at = datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_period) + processings = core_processings.get_processings_by_status(status=processing_status, + locking=True, update_poll=True, + not_lock=True, + only_return_id=True, + bulk_size=self.retrieve_bulk_size) + + # self.logger.debug("Main thread get %s [submitting + submitted + running] processings to process" % (len(processings))) + if processings: + self.logger.info("Main thread get [submitting + submitted + running] processings to process: %s" % (str(processings))) + + for pr_id in processings: + self.logger.info("UpdateProcessingEvent(processing_id: %s)" % pr_id) + event = UpdateProcessingEvent(publisher_id=self.id, processing_id=pr_id) + self.event_bus.send(event) + + return processings + except exceptions.DatabaseException as ex: + if 'ORA-00060' in str(ex): + 
self.logger.warn("(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") + else: + # raise ex + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + return [] + + def get_processing(self, processing_id, status=None, locking=False): + try: + return core_processings.get_processing_by_id_status(processing_id=processing_id, status=status, locking=locking) + except exceptions.DatabaseException as ex: + if 'ORA-00060' in str(ex): + self.logger.warn("(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") + else: + # raise ex + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + return None + + def load_poll_period(self, processing, parameters): + if self.new_poll_period and processing['new_poll_period'] != self.new_poll_period: + parameters['new_poll_period'] = self.new_poll_period + if self.update_poll_period and processing['update_poll_period'] != self.update_poll_period: + parameters['update_poll_period'] = self.update_poll_period + return parameters + + def get_log_prefix(self, processing): + return "" % (processing['request_id'], + processing['transform_id'], + processing['processing_id']) + + def update_processing(self, processing, processing_model): + try: + if processing: + log_prefix = self.get_log_prefix(processing_model) + + self.logger.info(log_prefix + "update_processing: %s" % (processing['update_processing']['parameters'])) + + processing['update_processing']['parameters']['locking'] = ProcessingLocking.Idle + # self.logger.debug("wen: %s" % str(processing)) + processing['update_processing']['parameters']['updated_at'] = datetime.datetime.utcnow() + + retry = True + retry_num = 0 + while retry: + retry = False + retry_num += 1 + try: + core_processings.update_processing_contents(update_processing=processing.get('update_processing', None), + update_collections=processing.get('update_collections', None), + update_contents=processing.get('update_contents', None), + messages=processing.get('messages', None), + update_messages=processing.get('update_messages', None), + new_contents=processing.get('new_contents', None)) + except exceptions.DatabaseException as ex: + if 'ORA-00060' in str(ex): + self.logger.warn(log_prefix + "(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") + if retry_num < 5: + retry = True + if retry_num <= 1: + random_sleep = random.randint(1, 10) + elif retry_num <= 2: + random_sleep = random.randint(1, 60) + else: + random_sleep = random.randint(1, 120) + time.sleep(random_sleep) + else: + raise ex + else: + # self.logger.error(ex) + # self.logger.error(traceback.format_exc()) + raise ex + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + try: + processing_id = processing['update_processing']['processing_id'] + + parameters = {'status': processing['update_processing']['parameters']['status'], + 'locking': ProcessingLocking.Idle} + if 'new_retries' in processing['update_processing']['parameters']: + parameters['new_retries'] = processing['update_processing']['parameters']['new_retries'] + if 'update_retries' in processing['update_processing']['parameters']: + parameters['update_retries'] = processing['update_processing']['parameters']['update_retries'] + if 'errors' in processing['update_processing']['parameters']: + parameters['errors'] = processing['update_processing']['parameters']['errors'] + + self.logger.warn(log_prefix + "update_processing exception result: %s" % (parameters)) + 
core_processings.update_processing(processing_id=processing_id, parameters=parameters) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + + def handle_update_processing(self, processing): + try: + log_prefix = self.get_log_prefix(processing) + process_status, new_contents, ret_msgs, update_contents, parameters = handle_update_processing(processing, + self.agent_attributes, + logger=self.logger, + log_prefix=log_prefix) + + proc = processing['processing_metadata']['processing'] + work = proc.work + if work.use_dependency_to_release_jobs(): + new_process_status = ProcessingStatus.Triggering + else: + new_process_status = process_status + if is_process_terminated(process_status): + new_process_status = ProcessingStatus.Terminating + + update_processing = {'processing_id': processing['processing_id'], + 'parameters': {'status': new_process_status, + 'substatus': process_status, + 'locking': ProcessingLocking.Idle}} + + update_processing['parameters'] = self.load_poll_period(processing, update_processing['parameters']) + + if proc.submitted_at: + if not processing['submitted_at'] or processing['submitted_at'] < proc.submitted_at: + update_processing['parameters']['submitted_at'] = proc.submitted_at + + if proc.workload_id: + update_processing['parameters']['workload_id'] = proc.workload_id + + # update_processing['parameters']['expired_at'] = work.get_expired_at(processing) + update_processing['parameters']['processing_metadata'] = processing['processing_metadata'] + + if parameters: + # special parameters such as 'output_metadata' + for p in parameters: + update_processing['parameters'][p] = parameters[p] + + ret = {'update_processing': update_processing, + 'update_contents': update_contents, + 'new_contents': new_contents, + 'messages': ret_msgs, + 'processing_status': new_process_status} + + except exceptions.ProcessFormatNotSupported as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + + retries = processing['update_retries'] + 1 + if not processing['max_update_retries'] or retries < processing['max_update_retries']: + proc_status = ProcessingStatus.Running + else: + proc_status = ProcessingStatus.Failed + error = {'update_err': {'msg': truncate_string('%s' % (ex), length=200)}} + + # increase poll period + update_poll_period = int(processing['update_poll_period'].total_seconds() * self.poll_period_increase_rate) + if update_poll_period > self.max_update_poll_period: + update_poll_period = self.max_update_poll_period + + update_processing = {'processing_id': processing['processing_id'], + 'parameters': {'status': proc_status, + 'locking': ProcessingLocking.Idle, + 'update_retries': retries, + 'update_poll_period': update_poll_period, + 'errors': processing['errors'] if processing['errors'] else {}}} + update_processing['parameters']['errors'].update(error) + + ret = {'update_processing': update_processing, + 'update_contents': []} + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + + retries = processing['update_retries'] + 1 + if not processing['max_update_retries'] or retries < processing['max_update_retries']: + proc_status = ProcessingStatus.Running + else: + proc_status = ProcessingStatus.Failed + error = {'update_err': {'msg': truncate_string('%s' % (ex), length=200)}} + update_processing = {'processing_id': processing['processing_id'], + 'parameters': {'status': proc_status, + 'locking': ProcessingLocking.Idle, + 'update_retries': retries, + 'errors': processing['errors'] 
if processing['errors'] else {}}} + update_processing['parameters']['errors'].update(error) + update_processing['parameters'] = self.load_poll_period(processing, update_processing['parameters']) + + ret = {'update_processing': update_processing, + 'update_contents': []} + return ret + + def process_update_processing(self, event): + self.number_workers += 1 + try: + if event: + self.logger.info("process_update_processing, event: %s" % str(event)) + + pr = self.get_processing(processing_id=event._processing_id, status=None, locking=True) + if not pr: + self.logger.error("Cannot find processing for event: %s" % str(event)) + else: + log_pre = self.get_log_prefix(pr) + + self.logger.info(log_pre + "process_update_processing") + ret = self.handle_update_processing(pr) + # self.logger.info(log_pre + "process_update_processing result: %s" % str(ret)) + + self.update_processing(ret, pr) + + if 'processing_status' in ret and ret['processing_status'] == ProcessingStatus.Triggering: + event_content = {} + if (('update_contents' in ret and ret['update_contents']) or ('new_contents' in ret and ret['new_contents'])): + event_content['has_updates'] = True + if is_process_terminated(pr['substatus']): + event_content['Terminated'] = True + event = TriggerProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id'], content=event_content) + self.event_bus.send(event) + elif 'processing_status' in ret and ret['processing_status'] == ProcessingStatus.Terminating: + self.logger.info(log_pre + "TerminatedProcessingEvent(processing_id: %s)" % pr['processing_id']) + event = TerminatedProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id']) + self.event_bus.send(event) + else: + if (('update_contents' in ret and ret['update_contents']) + or ('new_contents' in ret and ret['new_contents']) # noqa W503 + or ('messages' in ret and ret['messages'])): # noqa E129 + self.logger.info(log_pre + "SyncProcessingEvent(processing_id: %s)" % pr['processing_id']) + event = SyncProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id']) + self.event_bus.send(event) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + self.number_workers -= 1 + + def clean_locks(self): + self.logger.info("clean locking") + core_processings.clean_locking() + + def init_event_function_map(self): + self.event_func_map = { + EventType.UpdateProcessing: { + 'pre_check': self.is_ok_to_run_more_processings, + 'exec_func': self.process_update_processing + } + } + + def run(self): + """ + Main run function. 
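# init_event_function_map above wires each EventType to a pre_check gate and an
# exec_func handler. A self-contained sketch of how such a dispatch table can be
# driven, with made-up event names rather than the real iDDS event bus:
def dispatch(event_type, event, func_map):
    # Look up the handler for this event type and run it only if its
    # pre-check (e.g. a cap on concurrent workers) allows more work.
    entry = func_map.get(event_type)
    if entry and entry['pre_check']():
        return entry['exec_func'](event)
    return None

func_map = {'UpdateProcessing': {'pre_check': lambda: True,
                                 'exec_func': lambda ev: 'processed %s' % ev}}
print(dispatch('UpdateProcessing', {'processing_id': 1}, func_map))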
+ """ + try: + self.logger.info("Starting main thread") + + self.load_plugins() + self.init() + + self.add_default_tasks() + + self.init_event_function_map() + + task = self.create_task(task_func=self.get_running_processings, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=60, priority=1) + self.add_task(task) + + task = self.create_task(task_func=self.clean_locks, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=1800, priority=1) + self.add_task(task) + + self.execute() + except KeyboardInterrupt: + self.stop() + + +if __name__ == '__main__': + agent = Poller() + agent() diff --git a/main/lib/idds/agents/carrier/receiver.py b/main/lib/idds/agents/carrier/receiver.py index d88c8c44..5e95ad83 100644 --- a/main/lib/idds/agents/carrier/receiver.py +++ b/main/lib/idds/agents/carrier/receiver.py @@ -8,11 +8,6 @@ # Authors: # - Wen Guan, , 2019 - 2022 -import json -import logging -import socket -import stomp -import threading import time import traceback try: @@ -22,194 +17,62 @@ # Python 2 from Queue import Queue -from idds.common.cache import get_cache, update_cache -from idds.common.constants import (Sections) +from idds.common.constants import Sections +from idds.common.exceptions import AgentPluginError, IDDSException from idds.common.utils import setup_logging +from idds.common.utils import json_dumps +from idds.core import messages as core_messages, catalog as core_catalog from idds.agents.common.baseagent import BaseAgent +# from idds.agents.common.eventbus.event import TerminatedProcessingEvent +from idds.agents.common.eventbus.event import TriggerProcessingEvent -setup_logging(__name__) -logging.getLogger("stomp").setLevel(logging.CRITICAL) - - -class MessagingListener(stomp.ConnectionListener): - ''' - Messaging Listener - ''' - def __init__(self, broker, output_queue): - ''' - __init__ - ''' - self.__broker = broker - self.__output_queue = output_queue - self.logger = logging.getLogger(self.__class__.__name__) - - def on_error(self, headers, body): - ''' - Error handler - ''' - self.logger.error('[broker] [%s]: %s', self.__broker, body) - - def on_message(self, headers, body): - # self.logger.info('[broker] [%s]: %s', self.__broker, body) - self.__output_queue.put(body) - pass - - -class MessagingReceiver(threading.Thread): - def __init__(self, **kwargs): - # threading.Thread.__init__(self) - super(MessagingReceiver, self).__init__(**kwargs) - - for key in kwargs: - setattr(self, key, kwargs[key]) +from .utils import handle_messages_processing - self.logger = None - self.setup_logger() - self.graceful_stop = threading.Event() - self.output_queue = None - - self.conns = [] - - def get_class_name(self): - return self.__class__.__name__ - - def setup_logger(self): - """ - Setup logger - """ - self.logger = logging.getLogger(self.get_class_name()) - - def stop(self): - self.graceful_stop.set() - - def set_output_queue(self, output_queue): - self.output_queue = output_queue - - def subscribe(self, listener=MessagingListener): - self.conns = [] - - broker_addresses = [] - for b in self.brokers: - try: - addrinfos = socket.getaddrinfo(b, 0, socket.AF_INET, 0, socket.IPPROTO_TCP) - for addrinfo in addrinfos: - b_addr = addrinfo[4][0] - broker_addresses.append(b_addr) - except socket.gaierror as error: - self.logger.error('Cannot resolve hostname %s: %s' % (b, str(error))) - - self.logger.info("Resolved broker addresses: %s" % broker_addresses) - - for broker in broker_addresses: - conn = stomp.Connection12(host_and_ports=[(broker, self.port)], - 
vhost=self.vhost, - keepalive=True) - conn.set_listener('message-receiver', listener(conn.transport._Transport__host_and_ports[0], self.output_queue)) - conn.connect(self.username, self.password, wait=True) - conn.subscribe(destination=self.destination, id='atlas-idds-messaging', ack='auto') - self.conns.append(conn) - - while not self.graceful_stop.is_set(): - try: - for conn in self.conns: - if not conn.is_connected(): - self.logger.info('connecting to %s' % conn.transport._Transport__host_and_ports[0][0]) - conn.set_listener('message-receiver', listener(conn.transport._Transport__host_and_ports[0], self.output_queue)) - # conn.start() - conn.connect(self.username, self.password, wait=True) - conn.subscribe(destination=self.destination, id='atlas-idds-messaging', ack='auto') - time.sleep(1) - except Exception as error: - self.logger.error("Messaging receiver throws an exception: %s, %s" % (error, traceback.format_exc())) - - self.logger.info('receiver graceful stop requested') - - for conn in self.conns: - try: - conn.disconnect() - except Exception: - pass - - def run(self): - try: - self.subscribe() - except Exception as error: - self.logger.error("Messaging receiver throws an exception: %s, %s" % (error, traceback.format_exc())) - - def __call__(self): - self.run() +setup_logging(__name__) class Receiver(BaseAgent): """ - Receiver works to receive messages and then update the processing or content status. + Receiver works to receive workload management messages to update task/job status. """ - def __init__(self, num_threads=1, poll_time_period=10, retrieve_bulk_size=10, pending_time=None, **kwargs): - super(Receiver, self).__init__(num_threads=num_threads, **kwargs) - self.poll_time_period = int(poll_time_period) - self.retrieve_bulk_size = int(retrieve_bulk_size) - self.config_section = Sections.Receiver - if pending_time: - self.pending_time = float(pending_time) - else: - self.pending_time = None - - self.messageing_reciver = None - self.messaging_queue = Queue() - - self.tasks = {} - - if not hasattr(self, 'messaging_brokers'): - raise Exception('messaging brokers is required but not defined.') - else: - self.messaging_brokers = [b.strip() for b in self.messaging_brokers.split(',')] - if not hasattr(self, 'messaging_port'): - raise Exception('messaging port is required but not defined.') - if not hasattr(self, 'messaging_vhost'): - self.messaging_vhost = None - if not hasattr(self, 'messaging_destination'): - raise Exception('messaging destination is required but not defined.') - if not hasattr(self, 'messaging_broker_timeout'): - self.messaging_broker_timeout = 10 - else: - self.messaging_broker_timeout = int(self.messaging_broker_timeout) - - def start_messaging_receiver(self): - kwargs = {'broker': self.messaging_broker, - 'port': self.messaging_port, - 'vhost': self.messaging_vhost, - 'destination': self.messaging_destinaton, - 'broker_timeout': self.messaging_broker_timeout} - self.messageing_reciver = MessagingReceiver(**kwargs) - self.messageing_reciver.setup_output_queue(self.messaging_queue) + def __init__(self, num_threads=1, bulk_message_delay=5, bulk_message_size=2000, + random_delay=None, **kwargs): + super(Receiver, self).__init__(num_threads=num_threads, name='Receiver', **kwargs) + self.config_section = Sections.Carrier + self.bulk_message_delay = int(bulk_message_delay) + self.bulk_message_size = int(bulk_message_size) + self.message_queue = Queue() + + def __del__(self): + self.stop_receiver() + + def start_receiver(self): + if 'receiver' not in self.plugins: + 
raise AgentPluginError('Plugin receiver is required') + self.receiver = self.plugins['receiver'] + + self.logger.info("Starting receiver: %s" % self.receiver) + self.receiver.set_output_queue(self.message_queue) + self.set_logger(self.logger) + self.receiver.start() def stop_receiver(self): - self.messageing_reciver.stop() + if hasattr(self, 'receiver') and self.receiver: + self.logger.info("Stopping receiver: %s" % self.receiver) + self.receiver.stop() - def handle_messages(self): + def get_output_messages(self): + msgs = [] try: - while not self.messaging_queue.empty(): - msg = self.messaging_queue.get(False) + while not self.message_queue.empty(): + msg = self.message_queue.get(False) if msg: - msg = json.loads(msg) - if 'msg_type' in msg: - if msg['msg_type'] == 'job_status' and 'taskid' in msg and 'status' in msg and msg['status'] in ['finished', 'failed']: - taskid = msg['taskid'] - jobid = msg['jobid'] - status = msg['status'] - inputs = msg['inputs'] - if taskid not in self.tasks: - self.tasks[taskid] = [] - self.tasks[taskid].append({'taskid': taskid, 'jobid': jobid, 'status': status, 'inputs': inputs}) - for taskid in self.tasks: - data = get_cache(taskid) - if data: - self.tasks[taskid] = self.tasks[taskid] + data - update_cache(taskid, self.tasks[taskid]) - del self.tasks[taskid] + # self.logger.debug("Received message: %s" % str(msg)) + msgs.append(msg) except Exception as error: - self.logger.error("Failed to handle messages: %s, %s" % (error, traceback.format_exc())) + self.logger.error("Failed to get output messages: %s, %s" % (error, traceback.format_exc())) + return msgs def run(self): """ @@ -217,17 +80,49 @@ def run(self): """ try: self.logger.info("Starting main thread") + self.load_plugins() - self.start_messaging_receiver() + self.start_receiver() - while not self.graceful_stop.is_set(): - self.handle_messages() - time.sleep(self.poll_time_period) + self.add_health_message_task() + log_prefix = "" + + while not self.graceful_stop.is_set(): + try: + time_start = time.time() + output_messages = self.get_output_messages() + update_processings, terminated_processings, update_contents, msgs = handle_messages_processing(output_messages) + + if msgs: + # self.logger.debug(log_prefix + "adding messages[:3]: %s" % json_dumps(msgs[:3])) + core_messages.add_messages(msgs, bulk_size=self.bulk_message_size) + + if update_contents: + self.logger.info(log_prefix + "update_contents[:3]: %s" % json_dumps(update_contents[:3])) + core_catalog.update_contents(update_contents) + + for pr_id in update_processings: + # self.logger.info(log_prefix + "TerminatedProcessingEvent(processing_id: %s)" % pr_id) + # event = TerminatedProcessingEvent(publisher_id=self.id, processing_id=pr_id) + self.logger.info(log_prefix + "TriggerProcessingEvent(processing_id: %s)" % pr_id) + event = TriggerProcessingEvent(publisher_id=self.id, processing_id=pr_id) + self.event_bus.send(event) + + for pr_id in terminated_processings: + self.logger.info(log_prefix + "TriggerProcessingEvent(processing_id: %s)" % pr_id) + event = TriggerProcessingEvent(publisher_id=self.id, processing_id=pr_id, content={'Terminated': True, 'source': 'Receiver'}) + self.event_bus.send(event) + + time_delay = self.bulk_message_delay - (time.time() - time_start) + if time_delay > 0: + time.sleep(time_delay) + except IDDSException as error: + self.logger.error("Main thread IDDSException: %s" % str(error)) + except Exception as error: + self.logger.critical("Main thread exception: %s\n%s" % (str(error), traceback.format_exc())) except 
KeyboardInterrupt: self.stop() - finally: - self.stop() def stop(self): super(Receiver, self).stop() diff --git a/main/lib/idds/agents/carrier/submitter.py b/main/lib/idds/agents/carrier/submitter.py new file mode 100644 index 00000000..bcc33aca --- /dev/null +++ b/main/lib/idds/agents/carrier/submitter.py @@ -0,0 +1,197 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2019 - 2022 + +import traceback + +from idds.common import exceptions +from idds.common.constants import ProcessingStatus, ProcessingLocking +from idds.common.utils import setup_logging, truncate_string +from idds.core import processings as core_processings +from idds.agents.common.eventbus.event import (EventType, + NewProcessingEvent, + UpdateTransformEvent) + +from .utils import handle_new_processing +from .poller import Poller + +setup_logging(__name__) + + +class Submitter(Poller): + """ + Submitter works to submit and running tasks to WFMS. + """ + + def __init__(self, num_threads=1, poll_period=10, retries=3, retrieve_bulk_size=2, + name='Submitter', message_bulk_size=1000, **kwargs): + super(Submitter, self).__init__(num_threads=num_threads, name=name, **kwargs) + + def get_new_processings(self): + """ + Get new processing + """ + try: + if not self.is_ok_to_run_more_processings(): + return [] + + self.show_queue_size() + + processing_status = [ProcessingStatus.New] + processings = core_processings.get_processings_by_status(status=processing_status, locking=True, + not_lock=True, + new_poll=True, only_return_id=True, + bulk_size=self.retrieve_bulk_size) + + # self.logger.debug("Main thread get %s [new] processings to process" % len(processings)) + if processings: + self.logger.info("Main thread get [new] processings to process: %s" % str(processings)) + + for pr_id in processings: + self.logger.info("NewProcessingEvent(processing_id: %s)" % pr_id) + event = NewProcessingEvent(publisher_id=self.id, processing_id=pr_id) + self.event_bus.send(event) + + return processings + except exceptions.DatabaseException as ex: + if 'ORA-00060' in str(ex): + self.logger.warn("(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") + else: + # raise ex + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + return [] + + def handle_new_processing(self, processing): + try: + log_prefix = self.get_log_prefix(processing) + + # transform_id = processing['transform_id'] + # transform = core_transforms.get_transform(transform_id=transform_id) + # work = transform['transform_metadata']['work'] + status, processing, update_colls, new_contents, msgs, errors = handle_new_processing(processing, + self.agent_attributes, + logger=self.logger, + log_prefix=log_prefix) + + if not status: + raise exceptions.ProcessSubmitFailed(errors) + + parameters = {'status': ProcessingStatus.Submitting, + 'locking': ProcessingLocking.Idle, + 'processing_metadata': processing['processing_metadata']} + parameters = self.load_poll_period(processing, parameters) + + proc = processing['processing_metadata']['processing'] + if proc.submitted_at: + if not processing['submitted_at'] or processing['submitted_at'] < proc.submitted_at: + parameters['submitted_at'] = proc.submitted_at + + # if processing['processing_metadata'] and 'processing' in processing['processing_metadata']: + if 
proc.workload_id: + parameters['workload_id'] = proc.workload_id + + update_processing = {'processing_id': processing['processing_id'], + 'parameters': parameters} + ret = {'update_processing': update_processing, + 'update_collections': update_colls, + 'update_contents': [], + 'new_contents': new_contents, + 'messages': msgs, + } + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + + retries = processing['new_retries'] + 1 + if not processing['max_new_retries'] or retries < processing['max_new_retries']: + pr_status = processing['status'] + else: + pr_status = ProcessingStatus.Failed + # increase poll period + new_poll_period = int(processing['new_poll_period'].total_seconds() * self.poll_period_increase_rate) + if new_poll_period > self.max_new_poll_period: + new_poll_period = self.max_new_poll_period + + error = {'submit_err': {'msg': truncate_string('%s' % (ex), length=200)}} + parameters = {'status': pr_status, + 'new_poll_period': new_poll_period, + 'errors': processing['errors'] if processing['errors'] else {}, + 'new_retries': retries} + parameters['errors'].update(error) + + update_processing = {'processing_id': processing['processing_id'], + 'parameters': parameters} + ret = {'update_processing': update_processing, + 'update_contents': []} + return ret + + def process_new_processing(self, event): + self.number_workers += 1 + try: + if event: + # pr_status = [ProcessingStatus.New] + self.logger.info("process_new_processing, event: %s" % str(event)) + pr = self.get_processing(processing_id=event._processing_id, status=None, locking=True) + if not pr: + self.logger.error("Cannot find processing for event: %s" % str(event)) + else: + log_pre = self.get_log_prefix(pr) + self.logger.info(log_pre + "process_new_processing") + ret = self.handle_new_processing(pr) + # self.logger.info(log_pre + "process_new_processing result: %s" % str(ret)) + + self.update_processing(ret, pr) + + self.logger.info(log_pre + "UpdateTransformEvent(transform_id: %s)" % pr['transform_id']) + submit_event_content = {'event': 'submitted'} + event = UpdateTransformEvent(publisher_id=self.id, transform_id=pr['transform_id'], content=submit_event_content) + self.event_bus.send(event) + + # self.logger.info(log_pre + "SyncProcessingEvent(processing_id: %s)" % pr['processing_id']) + # event = SyncProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id']) + # self.event_bus.send(event) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + self.number_workers -= 1 + + def init_event_function_map(self): + self.event_func_map = { + EventType.NewProcessing: { + 'pre_check': self.is_ok_to_run_more_processings, + 'exec_func': self.process_new_processing + } + } + + def run(self): + """ + Main run function. 
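# handle_new_processing above keeps a per-processing retry counter and only marks the
# processing Failed once max_new_retries is exhausted; otherwise it stays in its current
# status and is retried later with a widened poll period. An illustrative sketch of that
# bookkeeping, kept outside the real database layer and using plain string statuses:
def status_after_error(current_status, retries, max_retries, failed_status='Failed'):
    # A max_retries of 0 or None is treated as "retry forever".
    retries += 1
    if not max_retries or retries < max_retries:
        return current_status, retries
    return failed_status, retries

# status_after_error('New', 0, 3) -> ('New', 1); status_after_error('New', 2, 3) -> ('Failed', 3)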
+ """ + try: + self.logger.info("Starting main thread") + + self.load_plugins() + self.init() + + self.add_default_tasks() + + self.init_event_function_map() + + task = self.create_task(task_func=self.get_new_processings, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=60, priority=1) + self.add_task(task) + + self.execute() + except KeyboardInterrupt: + self.stop() + + +if __name__ == '__main__': + agent = Submitter() + agent() diff --git a/main/lib/idds/agents/carrier/trigger.py b/main/lib/idds/agents/carrier/trigger.py new file mode 100644 index 00000000..549b48af --- /dev/null +++ b/main/lib/idds/agents/carrier/trigger.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2019 - 2022 + +import traceback + +from idds.common import exceptions +from idds.common.constants import ProcessingStatus, ProcessingLocking +from idds.common.utils import setup_logging, truncate_string +from idds.core import processings as core_processings +from idds.agents.common.eventbus.event import (EventType, + TriggerProcessingEvent, + TerminatedProcessingEvent, + SyncProcessingEvent) + +from .utils import handle_trigger_processing, is_process_terminated +from .poller import Poller + +setup_logging(__name__) + + +class Trigger(Poller): + """ + Trigger works to trigger to release jobs + """ + + def __init__(self, num_threads=1, poll_period=10, retries=3, retrieve_bulk_size=2, + name='Trigger', message_bulk_size=1000, **kwargs): + super(Trigger, self).__init__(num_threads=num_threads, name=name, **kwargs) + + if hasattr(self, 'trigger_max_number_workers'): + self.max_number_workers = int(self.trigger_max_number_workers) + + def get_trigger_processings(self): + """ + Get trigger processing + """ + try: + if not self.is_ok_to_run_more_processings(): + return [] + self.show_queue_size() + + processing_status = [ProcessingStatus.ToTrigger, ProcessingStatus.Triggering] + processings = core_processings.get_processings_by_status(status=processing_status, + locking=True, update_poll=True, + not_lock=True, + only_return_id=True, + bulk_size=self.retrieve_bulk_size) + if processings: + self.logger.info("Main thread get [ToTrigger, Triggering] processings to process: %s" % (str(processings))) + + for pr_id in processings: + self.logger.info("UpdateProcessingEvent(processing_id: %s)" % pr_id) + event = TriggerProcessingEvent(publisher_id=self.id, processing_id=pr_id) + self.event_bus.send(event) + + return processings + except exceptions.DatabaseException as ex: + if 'ORA-00060' in str(ex): + self.logger.warn("(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") + else: + # raise ex + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + return [] + + def handle_trigger_processing(self, processing): + try: + log_prefix = self.get_log_prefix(processing) + process_status, update_contents, ret_msgs, parameters = handle_trigger_processing(processing, + self.agent_attributes, + logger=self.logger, + log_prefix=log_prefix) + + new_process_status = process_status + if is_process_terminated(process_status): + new_process_status = ProcessingStatus.Terminating + + update_processing = {'processing_id': processing['processing_id'], + 'parameters': {'status': new_process_status, + 'substatus': process_status, + 'locking': 
ProcessingLocking.Idle}} + + if parameters: + # special parameters such as 'output_metadata' + for p in parameters: + update_processing['parameters'][p] = parameters[p] + + ret = {'update_processing': update_processing, + 'update_contents': update_contents, + 'messages': ret_msgs, + 'processing_status': new_process_status} + except exceptions.ProcessFormatNotSupported as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + + retries = processing['update_retries'] + 1 + if not processing['max_update_retries'] or retries < processing['max_update_retries']: + proc_status = ProcessingStatus.Running + else: + proc_status = ProcessingStatus.Failed + error = {'update_err': {'msg': truncate_string('%s' % (ex), length=200)}} + + # increase poll period + update_poll_period = int(processing['update_poll_period'].total_seconds() * self.poll_period_increase_rate) + if update_poll_period > self.max_update_poll_period: + update_poll_period = self.max_update_poll_period + + update_processing = {'processing_id': processing['processing_id'], + 'parameters': {'status': proc_status, + 'locking': ProcessingLocking.Idle, + 'update_retries': retries, + 'update_poll_period': update_poll_period, + 'errors': processing['errors'] if processing['errors'] else {}}} + update_processing['parameters']['errors'].update(error) + + ret = {'update_processing': update_processing, + 'update_contents': []} + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + + retries = processing['update_retries'] + 1 + if not processing['max_update_retries'] or retries < processing['max_update_retries']: + proc_status = ProcessingStatus.Running + else: + proc_status = ProcessingStatus.Failed + error = {'update_err': {'msg': truncate_string('%s' % (ex), length=200)}} + update_processing = {'processing_id': processing['processing_id'], + 'parameters': {'status': proc_status, + 'locking': ProcessingLocking.Idle, + 'update_retries': retries, + 'errors': processing['errors'] if processing['errors'] else {}}} + update_processing['parameters']['errors'].update(error) + update_processing['parameters'] = self.load_poll_period(processing, update_processing['parameters']) + + ret = {'update_processing': update_processing, + 'update_contents': []} + return ret + + def process_trigger_processing(self, event): + self.number_workers += 1 + try: + if event: + # pr_status = [ProcessingStatus.New] + self.logger.info("process_trigger_processing, event: %s" % str(event)) + pr = self.get_processing(processing_id=event._processing_id, status=None, locking=True) + if not pr: + self.logger.error("Cannot find processing for event: %s" % str(event)) + else: + log_pre = self.get_log_prefix(pr) + self.logger.info(log_pre + "process_trigger_processing") + ret = self.handle_trigger_processing(pr) + # self.logger.info(log_pre + "process_trigger_processing result: %s" % str(ret)) + + self.update_processing(ret, pr) + + if (('processing_status' in ret and ret['processing_status'] == ProcessingStatus.Terminating) + or (event._content and 'Terminated' in event._content and event._content['Terminated'])): # noqa W503 + self.logger.info(log_pre + "TerminatedProcessingEvent(processing_id: %s)" % pr['processing_id']) + event = TerminatedProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id'], content=event._content) + self.event_bus.send(event) + else: + if ((event._content and 'has_updates' in event._content and event._content['has_updates']) + or ('update_contents' in ret and ret['update_contents']) 
# noqa W503 + or ('new_contents' in ret and ret['new_contents']) # noqa W503 + or ('messages' in ret and ret['messages'])): # noqa E129 + self.logger.info(log_pre + "SyncProcessingEvent(processing_id: %s)" % pr['processing_id']) + event = SyncProcessingEvent(publisher_id=self.id, processing_id=pr['processing_id']) + self.event_bus.send(event) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + self.number_workers -= 1 + + def init_event_function_map(self): + self.event_func_map = { + EventType.TriggerProcessing: { + 'pre_check': self.is_ok_to_run_more_processings, + 'exec_func': self.process_trigger_processing + } + } + + def run(self): + """ + Main run function. + """ + try: + self.logger.info("Starting main thread") + + self.load_plugins() + self.init() + + self.add_default_tasks() + + self.init_event_function_map() + + task = self.create_task(task_func=self.get_trigger_processings, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=60, priority=1) + self.add_task(task) + + self.execute() + except KeyboardInterrupt: + self.stop() + + +if __name__ == '__main__': + agent = Trigger() + agent() diff --git a/main/lib/idds/agents/carrier/utils.py b/main/lib/idds/agents/carrier/utils.py new file mode 100644 index 00000000..ea5128be --- /dev/null +++ b/main/lib/idds/agents/carrier/utils.py @@ -0,0 +1,1344 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2022 + +import json +import logging + + +from idds.common.constants import (ProcessingStatus, + CollectionStatus, + ContentStatus, ContentType, + ContentRelationType, + WorkStatus, + TransformType, + TransformType2MessageTypeMap, + MessageStatus, MessageSource, + MessageDestination) +from idds.common.utils import setup_logging +from idds.core import (transforms as core_transforms, + processings as core_processings, + catalog as core_catalog) +from idds.agents.common.cache.redis import get_redis_cache + + +setup_logging(__name__) + + +def get_logger(logger=None): + if logger: + return logger + logger = logging.getLogger(__name__) + return logger + + +def get_new_content(request_id, transform_id, workload_id, map_id, input_content, content_relation_type=ContentRelationType.Input): + content = {'transform_id': transform_id, + 'coll_id': input_content['coll_id'], + 'request_id': request_id, + 'workload_id': workload_id, + 'map_id': map_id, + 'scope': input_content['scope'], + 'name': input_content['name'], + 'min_id': input_content['min_id'] if 'min_id' in input_content else 0, + 'max_id': input_content['max_id'] if 'max_id' in input_content else 0, + 'status': input_content['status'] if 'status' in input_content and input_content['status'] is not None else ContentStatus.New, + 'substatus': input_content['substatus'] if 'substatus' in input_content and input_content['substatus'] is not None else ContentStatus.New, + 'path': input_content['path'] if 'path' in input_content else None, + 'content_type': input_content['content_type'] if 'content_type' in input_content else ContentType.File, + 'content_relation_type': content_relation_type, + 'bytes': input_content['bytes'], + 'adler32': input_content['adler32'], + 'content_metadata': input_content['content_metadata']} + if content['min_id'] is None: + content['min_id'] = 0 + if content['max_id'] is None: + 
content['max_id'] = 0 + return content + + +def is_process_terminated(processing_status): + if processing_status in [ProcessingStatus.Finished, ProcessingStatus.Failed, + ProcessingStatus.SubFinished, ProcessingStatus.Cancelled, + ProcessingStatus.Suspended, ProcessingStatus.Expired, + ProcessingStatus.Broken, ProcessingStatus.FinishedOnStep, + ProcessingStatus.FinishedOnExec, ProcessingStatus.FinishedTerm]: + return True + return False + + +def is_all_contents_available(contents): + for content in contents: + if type(content) is dict: + if content['substatus'] not in [ContentStatus.Available, ContentStatus.FakeAvailable]: + return False + else: + # list of content_id, status, + if content[1] not in [ContentStatus.Available, ContentStatus.FakeAvailable]: + return False + return True + + +def is_all_contents_terminated(contents): + for content in contents: + if type(content) is dict: + if content['substatus'] not in [ContentStatus.Available, ContentStatus.FakeAvailable, + ContentStatus.FinalFailed, ContentStatus.Missing]: + return False + else: + if content[1] not in [ContentStatus.Available, ContentStatus.FakeAvailable, + ContentStatus.FinalFailed, ContentStatus.Missing]: + return False + return True + + +def is_input_dependency_terminated(input_dependency): + if type(input_dependency) is dict: + if input_dependency['substatus'] in [ContentStatus.Available, ContentStatus.FakeAvailable, + ContentStatus.FinalFailed, ContentStatus.Missing]: + return True + else: + if input_dependency[1] in [ContentStatus.Available, ContentStatus.FakeAvailable, + ContentStatus.FinalFailed, ContentStatus.Missing]: + return True + return False + + +def is_all_contents_terminated_but_not_available(inputs): + all_contents_available = True + for content in inputs: + if type(content) is dict: + if content['substatus'] not in [ContentStatus.Available, ContentStatus.FakeAvailable, + ContentStatus.FinalFailed, ContentStatus.Missing]: + return False + if content['substatus'] not in [ContentStatus.Available]: + all_contents_available = False + else: + if content[1] not in [ContentStatus.Available, ContentStatus.FakeAvailable, + ContentStatus.FinalFailed, ContentStatus.Missing]: + return False + if content[1] not in [ContentStatus.Available]: + all_contents_available = False + if all_contents_available: + return False + return True + + +def is_all_contents_available_with_status_map(inputs_dependency, content_status_map): + for content_id in inputs_dependency: + status = content_status_map[str(content_id)] + if status not in [ContentStatus.Available, ContentStatus.FakeAvailable]: + return False + return True + + +def is_all_contents_terminated_with_status_map(input_dependency, content_status_map): + for content_id in input_dependency: + status = content_status_map[str(content_id)] + if status not in [ContentStatus.Available, ContentStatus.FakeAvailable, + ContentStatus.FinalFailed, ContentStatus.Missing]: + return False + return True + + +def get_collection_ids(collections): + coll_ids = [coll.coll_id for coll in collections] + return coll_ids + + +def get_input_output_maps(transform_id, work): + # link collections + input_collections = work.get_input_collections() + output_collections = work.get_output_collections() + log_collections = work.get_log_collections() + + # for coll in input_collections + output_collections + log_collections: + # coll_model = core_catalog.get_collection(coll_id=coll.coll_id) + # coll.collection = coll_model + + input_coll_ids = get_collection_ids(input_collections) + output_coll_ids = 
get_collection_ids(output_collections) + log_coll_ids = get_collection_ids(log_collections) + + mapped_input_output_maps = core_transforms.get_transform_input_output_maps(transform_id, + input_coll_ids=input_coll_ids, + output_coll_ids=output_coll_ids, + log_coll_ids=log_coll_ids) + + # work_name_to_coll_map = core_transforms.get_work_name_to_coll_map(request_id=transform['request_id']) + # work.set_work_name_to_coll_map(work_name_to_coll_map) + + # new_input_output_maps = work.get_new_input_output_maps(mapped_input_output_maps) + return mapped_input_output_maps + + +def get_new_contents(request_id, transform_id, workload_id, new_input_output_maps): + new_input_contents, new_output_contents, new_log_contents = [], [], [] + new_input_dependency_contents = [] + for map_id in new_input_output_maps: + inputs = new_input_output_maps[map_id]['inputs'] if 'inputs' in new_input_output_maps[map_id] else [] + inputs_dependency = new_input_output_maps[map_id]['inputs_dependency'] if 'inputs_dependency' in new_input_output_maps[map_id] else [] + outputs = new_input_output_maps[map_id]['outputs'] if 'outputs' in new_input_output_maps[map_id] else [] + logs = new_input_output_maps[map_id]['logs'] if 'logs' in new_input_output_maps[map_id] else [] + + for input_content in inputs: + content = get_new_content(request_id, transform_id, workload_id, map_id, input_content, content_relation_type=ContentRelationType.Input) + new_input_contents.append(content) + for input_content in inputs_dependency: + content = get_new_content(request_id, transform_id, workload_id, map_id, input_content, content_relation_type=ContentRelationType.InputDependency) + new_input_dependency_contents.append(content) + for output_content in outputs: + content = get_new_content(request_id, transform_id, workload_id, map_id, output_content, content_relation_type=ContentRelationType.Output) + new_output_contents.append(content) + for log_content in logs: + content = get_new_content(request_id, transform_id, workload_id, map_id, log_content, content_relation_type=ContentRelationType.Log) + new_log_contents.append(content) + return new_input_contents, new_output_contents, new_log_contents, new_input_dependency_contents + + +def get_update_content(content): + updated_content = {'content_id': content['content_id'], + 'status': content['substatus'], + 'substatus': content['substatus']} + content['status'] = content['substatus'] + return updated_content, content + + +def get_update_contents(request_id, transform_id, workload_id, input_output_maps): + updated_contents = [] + updated_input_contents_full, updated_output_contents_full = [], [] + + for map_id in input_output_maps: + inputs = input_output_maps[map_id]['inputs'] if 'inputs' in input_output_maps[map_id] else [] + inputs_dependency = input_output_maps[map_id]['inputs_dependency'] if 'inputs_dependency' in input_output_maps[map_id] else [] + outputs = input_output_maps[map_id]['outputs'] if 'outputs' in input_output_maps[map_id] else [] + # logs = input_output_maps[map_id]['logs'] if 'logs' in input_output_maps[map_id] else [] + + content_update_status = None + if is_all_contents_available(inputs_dependency): + # logger.debug("all input dependency available: %s, inputs: %s" % (str(inputs_dependency), str(inputs))) + content_update_status = ContentStatus.Available + elif is_all_contents_terminated(inputs_dependency): + # logger.debug("all input dependency terminated: %s, inputs: %s, outputs: %s" % (str(inputs_dependency), str(inputs), str(outputs))) + content_update_status = 
ContentStatus.Missing + + if content_update_status: + for content in inputs: + content['substatus'] = content_update_status + if content['status'] != content['substatus']: + updated_content, content = get_update_content(content) + updated_contents.append(updated_content) + updated_input_contents_full.append(content) + if content_update_status in [ContentStatus.Missing]: + for content in outputs: + content['substatus'] = content_update_status + if content['status'] != content['substatus']: + updated_content, content = get_update_content(content) + updated_contents.append(updated_content) + updated_output_contents_full.append(content) + + for content in outputs: + if content['status'] != content['substatus']: + updated_content, content = get_update_content(content) + updated_contents.append(updated_content) + updated_output_contents_full.append(content) + return updated_contents, updated_input_contents_full, updated_output_contents_full + + +def get_message_type(work_type, input_type='file'): + work_type_value = str(work_type.value) + if work_type_value not in TransformType2MessageTypeMap: + return TransformType2MessageTypeMap['0'][input_type] + else: + return TransformType2MessageTypeMap[work_type_value][input_type] + + +def generate_file_messages(request_id, transform_id, workload_id, work, files, relation_type): + if work: + work_type = work.get_work_type() + else: + work_type = TransformType.Processing + + i_msg_type, i_msg_type_str = get_message_type(work_type, input_type='file') + files_message = [] + for file in files: + file_status = file['substatus'].name + if file['substatus'] == ContentStatus.FakeAvailable: + file_status = ContentStatus.Available.name + file_message = {'scope': file['scope'], + 'name': file['name'], + 'path': file['path'], + 'status': file_status} + files_message.append(file_message) + msg_content = {'msg_type': i_msg_type_str.value, + 'request_id': request_id, + 'workload_id': workload_id, + 'relation_type': relation_type, + 'files': files_message} + num_msg_content = len(files_message) + return i_msg_type, msg_content, num_msg_content + + +def generate_collection_messages(request_id, transform_id, workload_id, work, collection, relation_type): + coll_name = collection.name + if coll_name.endswith(".idds.stagein"): + coll_name = coll_name.replace(".idds.stagein", "") + + i_msg_type, i_msg_type_str = get_message_type(work.get_work_type(), input_type='collection') + msg_content = {'msg_type': i_msg_type_str.value, + 'request_id': request_id, + 'workload_id': workload_id, + 'relation_type': relation_type, + 'collections': [{'scope': collection.scope, + 'name': coll_name, + 'status': collection.status.name}], + 'output': work.get_output_data(), + 'error': work.get_terminated_msg()} + num_msg_content = 1 + return i_msg_type, msg_content, num_msg_content + + +def generate_work_messages(request_id, transform_id, workload_id, work, relation_type): + i_msg_type, i_msg_type_str = get_message_type(work.get_work_type(), input_type='work') + msg_content = {'msg_type': i_msg_type_str.value, + 'request_id': request_id, + 'workload_id': workload_id, + 'relation_type': relation_type, + 'status': work.get_status().name, + 'output': work.get_output_data(), + 'error': work.get_terminated_msg()} + num_msg_content = 1 + return i_msg_type, msg_content, num_msg_content + + +def generate_messages(request_id, transform_id, workload_id, work, msg_type='file', files=[], relation_type='input'): + if msg_type == 'file': + i_msg_type, msg_content, num_msg_content = 
generate_file_messages(request_id, transform_id, workload_id, work, files=files, relation_type=relation_type) + msg = {'msg_type': i_msg_type, + 'status': MessageStatus.New, + 'source': MessageSource.Transformer, + 'destination': MessageDestination.Outside, + 'request_id': request_id, + 'workload_id': workload_id, + 'transform_id': transform_id, + 'num_contents': num_msg_content, + 'msg_content': msg_content} + return [msg] + elif msg_type == 'work': + # link collections + input_collections = work.get_input_collections() + output_collections = work.get_output_collections() + log_collections = work.get_log_collections() + + msg_type_contents = [] + msg_type_content = generate_work_messages(request_id, transform_id, workload_id, work, relation_type='input') + msg_type_contents.append(msg_type_content) + for coll in input_collections: + msg_type_content = generate_collection_messages(request_id, transform_id, workload_id, work, coll, relation_type='input') + msg_type_contents.append(msg_type_content) + for coll in output_collections: + msg_type_content = generate_collection_messages(request_id, transform_id, workload_id, work, coll, relation_type='output') + msg_type_contents.append(msg_type_content) + for coll in log_collections: + msg_type_content = generate_collection_messages(request_id, transform_id, workload_id, work, coll, relation_type='log') + msg_type_contents.append(msg_type_content) + + msgs = [] + for i_msg_type, msg_content, num_msg_content in msg_type_contents: + msg = {'msg_type': i_msg_type, + 'status': MessageStatus.New, + 'source': MessageSource.Transformer, + 'destination': MessageDestination.Outside, + 'request_id': request_id, + 'workload_id': workload_id, + 'transform_id': transform_id, + 'num_contents': num_msg_content, + 'msg_content': msg_content} + msgs.append(msg) + return msgs + + +def handle_new_processing(processing, agent_attributes, logger=None, log_prefix=''): + logger = get_logger(logger) + + proc = processing['processing_metadata']['processing'] + work = proc.work + work.set_agent_attributes(agent_attributes, processing) + transform_id = processing['transform_id'] + + status, workload_id, errors = work.submit_processing(processing) + logger.info(log_prefix + "submit_processing (status: %s, workload_id: %s, errors: %s)" % (status, workload_id, errors)) + + if not status: + logger.error(log_prefix + "Failed to submit processing (status: %s, workload_id: %s, errors: %s)" % (status, workload_id, errors)) + return False, processing, [], [], [], errors + + ret_msgs = [] + new_contents = [] + update_collections = [] + if proc.workload_id: + processing['workload_id'] = proc.workload_id + input_collections = work.get_input_collections() + output_collections = work.get_output_collections() + log_collections = work.get_log_collections() + for coll in input_collections + output_collections + log_collections: + u_coll = {'coll_id': coll.coll_id, 'workload_id': proc.workload_id} + update_collections.append(u_coll) + + if proc.submitted_at: + input_output_maps = get_input_output_maps(transform_id, work) + new_input_output_maps = work.get_new_input_output_maps(input_output_maps) + request_id = processing['request_id'] + transform_id = processing['transform_id'] + workload_id = processing['workload_id'] + ret_new_contents = get_new_contents(request_id, transform_id, workload_id, new_input_output_maps) + new_input_contents, new_output_contents, new_log_contents, new_input_dependency_contents = ret_new_contents + new_contents = new_input_contents + new_output_contents + 
new_log_contents + new_input_dependency_contents
+
+        if new_input_contents:
+            msgs = generate_messages(request_id, transform_id, workload_id, work, msg_type='file', files=new_input_contents, relation_type='input')
+            ret_msgs = ret_msgs + msgs
+        if new_output_contents:
+            msgs = generate_messages(request_id, transform_id, workload_id, work, msg_type='file', files=new_output_contents, relation_type='output')
+            ret_msgs = ret_msgs + msgs
+    return True, processing, update_collections, new_contents, ret_msgs, errors
+
+
+def get_updated_contents_by_request(request_id, transform_id, workload_id, work, terminated=False, logger=None, log_prefix=''):
+    logger = get_logger(logger)
+
+    status_to_check = [ContentStatus.Available, ContentStatus.FakeAvailable, ContentStatus.FinalFailed,
+                       ContentStatus.Missing, ContentStatus.Failed, ContentStatus.Lost,
+                       ContentStatus.Deleted]
+    contents = core_catalog.get_contents_by_request_transform(request_id=request_id, transform_id=transform_id,
+                                                              status=status_to_check, status_updated=True)
+    updated_contents, updated_contents_full_input, updated_contents_full_output = [], [], []
+    updated_contents_full_input_deps = []
+    for content in contents:
+        if (content['status'] != content['substatus']) and content['substatus'] in status_to_check:
+            u_content = {'content_id': content['content_id'],
+                         'status': content['substatus']}
+            updated_contents.append(u_content)
+            if content['content_relation_type'] == ContentRelationType.Output:
+                updated_contents_full_output.append(content)
+            elif content['content_relation_type'] == ContentRelationType.Input:
+                updated_contents_full_input.append(content)
+            elif content['content_relation_type'] == ContentRelationType.InputDependency:
+                updated_contents_full_input_deps.append(content)
+    # logger.debug(log_prefix + "get_updated_contents_by_request: updated_contents[:3]: %s" % str(updated_contents[:3]))
+    return updated_contents, updated_contents_full_input, updated_contents_full_output, updated_contents_full_input_deps
+
+
+def get_transform_dependency_map(transform_id, logger=None, log_prefix=''):
+    cache = get_redis_cache()
+    transform_dependcy_map_key = "transform_dependcy_map_%s" % transform_id
+    transform_dependcy_map = cache.get(transform_dependcy_map_key, default={})
+    return transform_dependcy_map
+
+
+def set_transform_dependency_map(transform_id, transform_dependcy_map, logger=None, log_prefix=''):
+    cache = get_redis_cache()
+    transform_dependcy_map_key = "transform_dependcy_map_%s" % transform_id
+    cache.set(transform_dependcy_map_key, transform_dependcy_map)
+
+
+def get_content_dependcy_map(request_id, logger=None, log_prefix=''):
+    cache = get_redis_cache()
+    content_dependcy_map_key = "request_content_dependcy_map_%s" % request_id
+    content_dependcy_map = cache.get(content_dependcy_map_key, default={})
+
+    request_dependcy_map_key = "request_dependcy_map_%s" % request_id
+    request_dependcy_map = cache.get(request_dependcy_map_key, default=[])
+
+    collection_dependcy_map_key = "request_collections_dependcy_map_%s" % request_id
+    collection_dependcy_map = cache.get(collection_dependcy_map_key, default=[])
+
+    return content_dependcy_map, request_dependcy_map, collection_dependcy_map
+
+
+def set_content_dependcy_map(request_id, content_dependcy_map, request_dependcy_map,
+                             collection_dependcy_map, logger=None, log_prefix=''):
+    cache = get_redis_cache()
+    content_dependcy_map_key = "request_content_dependcy_map_%s" % request_id
+    cache.set(content_dependcy_map_key, content_dependcy_map)
+
+    request_dependcy_map_key = 
"request_dependcy_map_%s" % request_id + cache.set(request_dependcy_map_key, request_dependcy_map) + + collection_dependcy_map_key = "request_collections_dependcy_map_%s" % request_id + cache.set(collection_dependcy_map_key, collection_dependcy_map) + + +def get_content_status_map(request_id, logger=None, log_prefix=''): + cache = get_redis_cache() + content_status_map_key = "request_content_status_map_%s" % request_id + content_status_map = cache.get(content_status_map_key, default={}) + return content_status_map + + +def set_content_status_map(request_id, content_status_map, logger=None, log_prefix=''): + cache = get_redis_cache() + content_status_map_key = "request_content_status_map_%s" % request_id + cache.set(content_status_map_key, content_status_map) + + +def get_input_dependency_map_by_request(request_id, transform_id, workload_id, work, logger=None, log_prefix=''): + logger = get_logger(logger) + + content_dependcy_map, request_dependcy_map, collection_dependcy_map = get_content_dependcy_map(request_id, logger=logger, log_prefix=log_prefix) + content_status_map = get_content_status_map(request_id, logger=logger, log_prefix=log_prefix) + + transform_dependcy_maps = {} + + refresh = False + if not content_dependcy_map or not content_status_map: + refresh = True + elif transform_id and transform_id not in request_dependcy_map: + refresh = True + elif work: + output_collections = work.get_output_collections() + for coll in output_collections: + if coll.coll_id not in collection_dependcy_map: + refresh = True + + for tf_id in request_dependcy_map: + transform_dependcy_maps[str(tf_id)] = get_transform_dependency_map(tf_id, logger=logger, log_prefix=log_prefix) + if not transform_dependcy_maps[str(tf_id)]: + refresh = True + + if refresh: + logger.debug(log_prefix + "refresh content_dependcy_map") + content_dependcy_map = {} + request_dependcy_map = [] + collection_dependcy_map = [] + content_status_map = {} + transform_dependcy_maps = {} + + content_output_name2id = {} + content_input_deps = [] + + contents = core_catalog.get_contents_by_request_transform(request_id=request_id) + # logger.debug("contents: ", contents) + for content in contents: + if content['transform_id'] not in request_dependcy_map: + request_dependcy_map.append(content['transform_id']) + if content['coll_id'] not in collection_dependcy_map: + collection_dependcy_map.append(content['coll_id']) + + content_status_map[str(content['content_id'])] = content['substatus'].value + + str_tf_id = str(content['transform_id']) + str_map_id = str(content['map_id']) + if str_tf_id not in transform_dependcy_maps: + transform_dependcy_maps[str_tf_id] = get_transform_dependency_map(str_tf_id, logger=logger, log_prefix=log_prefix) + if str_map_id not in transform_dependcy_maps[str_tf_id]: + transform_dependcy_maps[str_tf_id][str_map_id] = {'inputs': [], 'outputs': [], 'input_deps': []} + + if content['content_relation_type'] == ContentRelationType.Output: + if content['coll_id'] not in content_output_name2id: + content_output_name2id[content['coll_id']] = {} + collection_dependcy_map.append(content['coll_id']) + content_output_name2id[content['coll_id']][content['name']] = content + # content_id, status + transform_dependcy_maps[str_tf_id][str_map_id]['outputs'].append(content['content_id']) + elif content['content_relation_type'] == ContentRelationType.InputDependency: + content_input_deps.append(content) + # content_id, status + transform_dependcy_maps[str_tf_id][str_map_id]['input_deps'].append(content['content_id']) + elif 
content['content_relation_type'] == ContentRelationType.Input: + # content_id, status + transform_dependcy_maps[str_tf_id][str_map_id]['inputs'].append(content['content_id']) + # logger.debug("content_output_name2id: ", content_output_name2id) + + for content in content_input_deps: + dep_coll_id = content['coll_id'] + if dep_coll_id not in content_output_name2id: + logger.warn(log_prefix + "dep_coll_id: %s contents are not added yet" % dep_coll_id) + else: + dep_content = content_output_name2id[dep_coll_id].get(content['name'], None) + if dep_content: + dep_content_id = str(dep_content['content_id']) + if dep_content_id not in content_dependcy_map: + content_dependcy_map[dep_content_id] = [] + content_dependcy_map[dep_content_id].append((content['content_id'], content['transform_id'], content['map_id'])) + else: + logger.error(log_prefix + "Failed to find input dependcy for content_id: %s" % content['content_id']) + + set_content_dependcy_map(request_id, content_dependcy_map, request_dependcy_map, + collection_dependcy_map, logger=logger, log_prefix=log_prefix) + for str_tf_id in transform_dependcy_maps: + set_transform_dependency_map(str_tf_id, transform_dependcy_maps[str_tf_id], logger=logger, log_prefix=log_prefix) + set_content_status_map(request_id, content_status_map, logger=logger, log_prefix=log_prefix) + + return content_dependcy_map, transform_dependcy_maps, content_status_map + + +def get_content_status_with_status_map(content_ids, content_status_map): + content_id_status = [] + for content_id in content_ids: + status_value = content_status_map[str(content_id)] + status = ContentStatus(status_value) + content_id_status.append((content_id, status)) + return content_id_status + + +def trigger_release_inputs_no_deps(request_id, transform_id, workload_id, work, logger=None, log_prefix=''): + logger = get_logger(logger) + + content_depency_map, transform_dependency_maps, content_status_map = get_input_dependency_map_by_request(request_id, transform_id, workload_id, work, + logger=logger, log_prefix=log_prefix) + # logger.debug(log_prefix + "content_depency_map[:2]: %s" % str({k: content_depency_map[k] for k in list(content_depency_map.keys())[:2]})) + # logger.debug(log_prefix + "transform_dependency_map[:2]: %s" % str({key: transform_dependency_map[key] for k in list(transform_dependency_map.keys())[:2]})) + logger.debug(log_prefix + "transform_dependency_maps.keys[:2]: %s" % str(list(transform_dependency_maps.keys())[:2])) + + update_contents, update_contents_dict = [], {} + update_input_contents_full = {} + update_content_ids = [] + + # release jobs without inputs_dependency + # after reload from the cache, the key will be changed to string. 
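+    # Maps whose 'input_deps' list is empty have nothing to wait for: their inputs are
+    # marked Available directly. The database is then consulted so that contents whose
+    # substatus is already Available are not updated again.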
+ str_transform_id = str(transform_id) + + if str_transform_id not in transform_dependency_maps: + logger.warn(log_prefix + "transform_id %s not in transform_dependency_maps" % transform_id) + else: + transform_dependency_map = transform_dependency_maps[str_transform_id] + for map_id in transform_dependency_map: + inputs_dependency = transform_dependency_map[map_id]['input_deps'] + if len(inputs_dependency) == 0: + inputs = transform_dependency_map[map_id]['inputs'] + for content_id in inputs: + update_content_ids.append(content_id) + u_content = {'content_id': content_id, + # 'status': ContentStatus.Available, + 'substatus': ContentStatus.Available} + # update_contents.append(u_content) + update_contents_dict[content_id] = u_content + + if update_content_ids: + contents = core_catalog.get_contents_by_content_ids(update_content_ids, request_id=request_id) + for content in contents: + content_id = content['content_id'] + if update_contents_dict[content_id]['substatus'] != content['substatus']: + update_contents.append(update_contents_dict[content_id]) + if content['transform_id'] not in update_input_contents_full: + update_input_contents_full[content['transform_id']] = [] + u_content_full = {'content_id': content_id, + 'request_id': content['request_id'], + 'transform_id': content['transform_id'], + 'workload_id': content['workload_id'], + 'status': ContentStatus.Available, + 'substatus': ContentStatus.Available, + 'scope': content['scope'], + 'name': content['name'], + 'path': content['path']} + update_input_contents_full[content['transform_id']].append(u_content_full) + return update_contents, update_input_contents_full + + +def trigger_release_inputs(request_id, transform_id, workload_id, work, updated_contents_full_output, updated_contents_full_input, + updated_contents_full_input_deps, logger=None, log_prefix=''): + logger = get_logger(logger) + + status_to_check = [ContentStatus.Available, ContentStatus.FakeAvailable, ContentStatus.FinalFailed, ContentStatus.Missing] + # status_to_check_fake = [ContentStatus.FakeAvailable, ContentStatus.Missing] + + content_depency_map, transform_dependency_maps, content_status_map = get_input_dependency_map_by_request(request_id, transform_id, workload_id, work, + logger=logger, log_prefix=log_prefix) + # logger.debug(log_prefix + "content_depency_map[:2]: %s" % str({k: content_depency_map[k] for k in list(content_depency_map.keys())[:2]})) + # logger.debug(log_prefix + "transform_dependency_map[:2]: %s" % str({key: transform_dependency_maps[key] for k in list(transform_dependency_maps.keys())[:2]})) + logger.debug(log_prefix + "transform_dependency_maps.keys[:2]: %s" % str(list(transform_dependency_maps.keys())[:2])) + + triggered_map_ids, triggered_map_ids_input = [], [] + update_contents = [] + update_trigger_contents_dict, update_contents_dict = {}, {} + update_input_contents_full = {} + # 1. 
use the outputs to check input_dependency + for content in updated_contents_full_output: + # update the status + u_content = {'content_id': content['content_id'], 'status': content['substatus']} + update_contents.append(u_content) + + if content['substatus'] in status_to_check: + str_content_id = str(content['content_id']) + content_status_map[str_content_id] = content['substatus'].value + + if str_content_id in content_depency_map: + # t_contents are the contents to be triggered + t_contents = content_depency_map[str_content_id] + for t_content in t_contents: + t_content_id, t_transform_id, t_map_id = t_content + + content_status_map[str(t_content_id)] = content['substatus'].value + # update the input_dependency + u_content = {'content_id': t_content_id, + # 'status': content['status'], + 'substatus': content['substatus']} + # update_contents.append(u_content) + # update_contents_ids.append(t_content_id) + update_trigger_contents_dict[t_content_id] = u_content + + t_tf_map_id = (str(t_transform_id), str(t_map_id)) + if t_tf_map_id not in triggered_map_ids: + triggered_map_ids.append(t_tf_map_id) + + # 2. use the inputs to check the outputs + # If the input is missing or fakeavailable, set the outputs missing. + for content in updated_contents_full_input: + # u_content = {'content_id': content['content_id'], 'status': content['substatus']} + # update_contents.append(u_content) + # if content['substatus'] in status_to_check_fake: + if content['substatus'] in status_to_check: + content_status_map[str(content['content_id'])] = content['substatus'].value + t_tf_map_id = (str(content['transform_id']), str(content['map_id'])) + if t_tf_map_id not in triggered_map_ids_input: + triggered_map_ids_input.append(t_tf_map_id) + for t_tf_map_id in triggered_map_ids_input: + transform_id, map_id = t_tf_map_id + inputs = transform_dependency_maps[transform_id][map_id]['inputs'] + inputs_status = get_content_status_with_status_map(inputs, content_status_map) + + if is_all_contents_available(inputs_status): + for content_id, status in inputs_status: + u_content = {'content_id': content_id, + 'status': status, + 'substatus': status} + update_contents.append(u_content) + elif is_all_contents_terminated_but_not_available(inputs_status): + # for this case, will not generate jobs. so we need to set the output status. + for content_id, status in inputs_status: + u_content = {'content_id': content_id, + 'status': status, + 'substatus': status} + update_contents.append(u_content) + # update the outputs + outputs = transform_dependency_maps[transform_id][map_id]['outputs'] + for content_id in outputs: + u_content = {'content_id': content_id, + # 'status': content_update_status, + 'substatus': ContentStatus.Missing} + update_contents.append(u_content) + + # 3. use the input_deps to update the trigger contents + for content in updated_contents_full_input_deps: + if content['substatus'] in status_to_check: + content_status_map[str(content['content_id'])] = content['substatus'].value + # u_content = {'content_id': content['content_id'], + # # 'status': content['status'], + # 'substatus': content['substatus']} + # update_trigger_contents_dict[content['content_id']] = u_content + t_tf_map_id = (str(content['transform_id']), str(content['map_id'])) + if t_tf_map_id not in triggered_map_ids: + triggered_map_ids.append(t_tf_map_id) + + # update the content status + set_content_status_map(request_id, content_status_map, logger=logger, log_prefix=log_prefix) + + # 4. 
use the updated input_dependency to release inputs + input_update_content_ids = [] + for t_tf_map_id in triggered_map_ids: + transform_id, map_id = t_tf_map_id + inputs_dependency = transform_dependency_maps[transform_id][map_id]['input_deps'] + inputs_dependency_status = get_content_status_with_status_map(inputs_dependency, content_status_map) + + content_update_status = None + if is_all_contents_available(inputs_dependency_status): + content_update_status = ContentStatus.Available + elif is_all_contents_terminated(inputs_dependency_status): + content_update_status = ContentStatus.Missing + if content_update_status: + for content_id, status in inputs_dependency_status: + # update input dependencies status from substatus + u_content = {'content_id': content_id, + 'status': status, + 'substatus': status} + update_contents.append(u_content) + if content_id in update_trigger_contents_dict: + del update_trigger_contents_dict[content_id] + + inputs = transform_dependency_maps[transform_id][map_id]['inputs'] + for content_id in inputs: + u_content = {'content_id': content_id, + # 'status': content_update_status, + 'substatus': content_update_status} + update_contents.append(u_content) + update_contents_dict[content_id] = u_content + input_update_content_ids.append(content_id) + + # the input is not triggered. only update the substatus, not update the status + for content_id in update_trigger_contents_dict: + update_contents.append(update_trigger_contents_dict[content_id]) + + if input_update_content_ids: + contents = core_catalog.get_contents_by_content_ids(input_update_content_ids, request_id=request_id) + for content in contents: + content_id = content['content_id'] + # if update_contents_dict[content_id]['status'] != content['status']: + # update_contents.append(update_contents_dict[content_id]) + if True: + if content['transform_id'] not in update_input_contents_full: + update_input_contents_full[content['transform_id']] = [] + u_content_full = {'content_id': content_id, + 'request_id': content['request_id'], + 'transform_id': content['transform_id'], + 'workload_id': content['workload_id'], + 'status': update_contents_dict[content_id]['substatus'], + 'substatus': update_contents_dict[content_id]['substatus'], + 'scope': content['scope'], + 'name': content['name'], + 'path': content['path']} + update_input_contents_full[content['transform_id']].append(u_content_full) + + return update_contents, update_input_contents_full + + +def poll_missing_outputs(input_output_maps): + content_updates_missing, updated_contents_full_missing = [], [] + + for map_id in input_output_maps: + inputs = input_output_maps[map_id]['inputs'] if 'inputs' in input_output_maps[map_id] else [] + # inputs_dependency = input_output_maps[map_id]['inputs_dependency'] if 'inputs_dependency' in input_output_maps[map_id] else [] + outputs = input_output_maps[map_id]['outputs'] if 'outputs' in input_output_maps[map_id] else [] + # logs = input_output_maps[map_id]['logs'] if 'logs' in input_output_maps[map_id] else [] + + content_update_status = None + if is_all_contents_terminated_but_not_available(inputs): + content_update_status = ContentStatus.Missing + + for content in outputs: + content['substatus'] = content_update_status + if content['status'] != content['substatus']: + u_content = {'content_id': content['content_id'], + 'substatus': content['substatus']} + + content_updates_missing.append(u_content) + updated_contents_full_missing.append(content) + + return content_updates_missing, updated_contents_full_missing + + +def 
handle_update_processing(processing, agent_attributes, logger=None, log_prefix=''): + logger = get_logger(logger) + + ret_msgs = [] + new_contents = [] + + request_id = processing['request_id'] + transform_id = processing['transform_id'] + workload_id = processing['workload_id'] + + proc = processing['processing_metadata']['processing'] + work = proc.work + work.set_agent_attributes(agent_attributes, processing) + + input_output_maps = get_input_output_maps(transform_id, work) + logger.debug(log_prefix + "get_input_output_maps: len: %s" % len(input_output_maps)) + logger.debug(log_prefix + "get_input_output_maps.keys[:3]: %s" % str(list(input_output_maps.keys())[:3])) + + new_input_output_maps = work.get_new_input_output_maps(input_output_maps) + logger.debug(log_prefix + "get_new_input_output_maps: len: %s" % len(new_input_output_maps)) + logger.debug(log_prefix + "get_new_input_output_maps.keys[:3]: %s" % str(list(new_input_output_maps.keys())[:3])) + + ret_poll_processing = work.poll_processing_updates(processing, input_output_maps, log_prefix=log_prefix) + process_status, content_updates, new_input_output_maps1, updated_contents_full, parameters = ret_poll_processing + new_input_output_maps.update(new_input_output_maps1) + logger.debug(log_prefix + "poll_processing_updates process_status: %s" % process_status) + logger.debug(log_prefix + "poll_processing_updates content_updates[:3]: %s" % content_updates[:3]) + logger.debug(log_prefix + "poll_processing_updates new_input_output_maps1.keys[:3]: %s" % (list(new_input_output_maps1.keys())[:3])) + logger.debug(log_prefix + "poll_processing_updates updated_contents_full[:3]: %s" % (updated_contents_full[:3])) + + ret_new_contents = get_new_contents(request_id, transform_id, workload_id, new_input_output_maps) + new_input_contents, new_output_contents, new_log_contents, new_input_dependency_contents = ret_new_contents + new_contents = new_input_contents + new_output_contents + new_log_contents + new_input_dependency_contents + + content_updates_missing, updated_contents_full_missing = poll_missing_outputs(input_output_maps) + + if new_input_contents: + msgs = generate_messages(request_id, transform_id, workload_id, work, msg_type='file', + files=new_input_contents, relation_type='input') + ret_msgs = ret_msgs + msgs + if new_output_contents: + msgs = generate_messages(request_id, transform_id, workload_id, work, msg_type='file', + files=new_output_contents, relation_type='output') + ret_msgs = ret_msgs + msgs + + updated_contents_full = updated_contents_full + updated_contents_full_missing + if updated_contents_full: + msgs = generate_messages(request_id, transform_id, workload_id, work, msg_type='file', + files=updated_contents_full, relation_type='output') + ret_msgs = ret_msgs + msgs + + return process_status, new_contents, ret_msgs, content_updates + content_updates_missing, parameters + + +def handle_trigger_processing(processing, agent_attributes, logger=None, log_prefix=''): + logger = get_logger(logger) + + ret_msgs = [] + content_updates = [] + + request_id = processing['request_id'] + transform_id = processing['transform_id'] + workload_id = processing['workload_id'] + + proc = processing['processing_metadata']['processing'] + work = proc.work + work.set_agent_attributes(agent_attributes, processing) + + if not work.use_dependency_to_release_jobs(): + return processing['substatus'], [], [], {} + else: + content_updates_trigger_no_deps, updated_input_contents_no_deps = [], [] + content_updates_trigger_no_deps, 
updated_input_contents_no_deps = trigger_release_inputs_no_deps(request_id, transform_id, workload_id, work, + logger, log_prefix) + logger.debug(log_prefix + "trigger_release_inputs_no_deps: content_updates_trigger_no_deps[:3] %s" % (content_updates_trigger_no_deps[:3])) + # logger.debug(log_prefix + "trigger_release_inputs_no_deps: updated_input_contents_no_deps[:3] %s" % (updated_input_contents_no_deps[:3])) + + content_updates = content_updates + content_updates_trigger_no_deps + if updated_input_contents_no_deps: + for trigger_tf_id in updated_input_contents_no_deps: + logger.debug(log_prefix + "trigger_release_inputs_no_deps: updated_input_contents_no_deps[%s][:3] %s" % (trigger_tf_id, + updated_input_contents_no_deps[trigger_tf_id][:3])) + trigger_req_id = updated_input_contents_no_deps[trigger_tf_id][0]['request_id'] + trigger_workload_id = updated_input_contents_no_deps[trigger_tf_id][0]['workload_id'] + + msgs = generate_messages(trigger_req_id, trigger_tf_id, trigger_workload_id, work, msg_type='file', + files=updated_input_contents_no_deps[trigger_tf_id], relation_type='input') + ret_msgs = ret_msgs + msgs + + is_terminated = is_process_terminated(processing['substatus']) + updated_contents_ret = get_updated_contents_by_request(request_id, transform_id, workload_id, work, terminated=is_terminated, + logger=logger, log_prefix=log_prefix) + + updated_contents, updated_contents_full_input, updated_contents_full_output, updated_contents_full_input_deps = updated_contents_ret + logger.debug(log_prefix + "handle_trigger_processing: updated_contents[:3] %s" % (updated_contents[:3])) + + if updated_contents_full_input: + # if the content is updated by receiver, here is the place to broadcast the messages + msgs = generate_messages(request_id, transform_id, workload_id, work, msg_type='file', + files=updated_contents_full_input, relation_type='input') + ret_msgs = ret_msgs + msgs + if updated_contents_full_output: + # if the content is updated by receiver, here is the place to broadcast the messages + msgs = generate_messages(request_id, transform_id, workload_id, work, msg_type='file', + files=updated_contents_full_output, relation_type='output') + ret_msgs = ret_msgs + msgs + + # not flusht updated_contents here + # content_updates = content_updates + updated_contents + + # updated_contents_full_output_input_deps = updated_contents_full_output + updated_contents_full_input_deps + if updated_contents_full_output or updated_contents_full_input_deps or updated_contents_full_input: + content_updates_trigger, updated_input_contents = trigger_release_inputs(request_id, transform_id, workload_id, work, + updated_contents_full_output, + updated_contents_full_input, + updated_contents_full_input_deps, + logger, log_prefix) + logger.debug(log_prefix + "trigger_release_inputs: content_updates_trigger[:3] %s" % (content_updates_trigger[:3])) + # logger.debug(log_prefix + "trigger_release_inputs: updated_input_contents[:3] %s" % (updated_input_contents[:3])) + + content_updates = content_updates + content_updates_trigger + if updated_input_contents: + for trigger_tf_id in updated_input_contents: + logger.debug(log_prefix + "trigger_release_inputs: updated_input_contents[%s][:3] %s" % (trigger_tf_id, + updated_input_contents[trigger_tf_id][:3])) + trigger_req_id = updated_input_contents[trigger_tf_id][0]['request_id'] + trigger_workload_id = updated_input_contents[trigger_tf_id][0]['workload_id'] + + msgs = generate_messages(trigger_req_id, trigger_tf_id, trigger_workload_id, work, msg_type='file', 
+ files=updated_input_contents[trigger_tf_id], relation_type='input') + ret_msgs = ret_msgs + msgs + return processing['substatus'], content_updates, ret_msgs, {} + + +def get_content_status_from_panda_msg_status(status): + status_map = {'starting': ContentStatus.New, + 'running': ContentStatus.Processing, + 'finished': ContentStatus.Available, + 'failed': ContentStatus.Failed} + if status in status_map: + return status_map[status] + return ContentStatus.New + + +def get_collection_id_transform_id_map(coll_id, request_id, request_ids=[]): + cache = get_redis_cache() + coll_tf_id_map_key = "collection_id_transform_id_map" + coll_tf_id_map = cache.get(coll_tf_id_map_key, default={}) + + if coll_id is None or coll_id not in coll_tf_id_map: + if not request_ids: + request_ids = [] + if request_id not in request_ids: + request_ids.append(request_id) + colls = core_catalog.get_collections_by_request_ids(request_ids) + for coll in colls: + coll_tf_id_map[coll['coll_id']] = (coll['request_id'], coll['transform_id'], coll['workload_id']) + + cache.set(coll_tf_id_map_key, coll_tf_id_map) + + if coll_id is None or coll_id not in coll_tf_id_map: + return None, None, None + return coll_tf_id_map[coll_id] + + +def get_workload_id_transform_id_map(workload_id): + cache = get_redis_cache() + workload_id_transform_id_map_key = "all_worloadid2transformid_map" + workload_id_transform_id_map = cache.get(workload_id_transform_id_map_key, default={}) + + workload_id_transform_id_map_notexist_key = "all_worloadid2transformid_map_notexist" + workload_id_transform_id_map_notexist = cache.get(workload_id_transform_id_map_notexist_key, default=[]) + + if workload_id in workload_id_transform_id_map_notexist: + return None + + request_ids = [] + if not workload_id_transform_id_map or workload_id not in workload_id_transform_id_map: + processing_status = [ProcessingStatus.New, + ProcessingStatus.Submitting, ProcessingStatus.Submitted, + ProcessingStatus.Running, ProcessingStatus.FinishedOnExec, + ProcessingStatus.Cancel, ProcessingStatus.FinishedOnStep, + ProcessingStatus.ToCancel, ProcessingStatus.Cancelling, + ProcessingStatus.ToSuspend, ProcessingStatus.Suspending, + ProcessingStatus.ToResume, ProcessingStatus.Resuming, + ProcessingStatus.ToExpire, ProcessingStatus.Expiring, + ProcessingStatus.ToFinish, ProcessingStatus.ToForceFinish] + + procs = core_processings.get_processings_by_status(status=processing_status) + for proc in procs: + processing = proc['processing_metadata']['processing'] + work = processing.work + if work.use_dependency_to_release_jobs(): + workload_id_transform_id_map[proc['workload_id']] = (proc['request_id'], proc['transform_id'], proc['processing_id']) + if proc['request_id'] not in request_ids: + request_ids.append(proc['request_id']) + + cache.set(workload_id_transform_id_map_key, workload_id_transform_id_map) + + # renew the collection to transform map + if request_ids: + get_collection_id_transform_id_map(coll_id=None, request_id=request_ids[0], request_ids=request_ids) + + # for tasks running in some other instances + if workload_id not in workload_id_transform_id_map: + workload_id_transform_id_map_notexist.append(workload_id) + return None + + return workload_id_transform_id_map[workload_id] + + +def get_input_name_content_id_map(request_id, workload_id, transform_id): + cache = get_redis_cache() + input_name_content_id_map_key = "transform_input_contentid_map_%s" % transform_id + input_name_content_id_map = cache.get(input_name_content_id_map_key, default={}) + + if not 
input_name_content_id_map: + contents = core_catalog.get_contents_by_request_transform(request_id=request_id, transform_id=transform_id) + input_name_content_id_map = {} + for content in contents: + if content['content_relation_type'] == ContentRelationType.Output: + input_name_content_id_map[content['name']] = content['content_id'] + + cache.set(input_name_content_id_map_key, input_name_content_id_map) + return input_name_content_id_map + + +def get_jobid_content_id_map(request_id, workload_id, transform_id, job_id, inputs): + cache = get_redis_cache() + jobid_content_id_map_key = "transform_jobid_contentid_map_%s" % transform_id + jobid_content_id_map = cache.get(jobid_content_id_map_key, default={}) + + to_update_jobid = False + job_id = str(job_id) + if not jobid_content_id_map or job_id not in jobid_content_id_map: + to_update_jobid = True + input_name_content_id_map = get_input_name_content_id_map(request_id, workload_id, transform_id) + for ip in inputs: + if ':' in ip: + pos = ip.find(":") + ip = ip[pos + 1:] + if ip in input_name_content_id_map: + content_id = input_name_content_id_map[ip] + jobid_content_id_map[job_id] = content_id + break + + cache.set(jobid_content_id_map_key, jobid_content_id_map) + return jobid_content_id_map, to_update_jobid + + +def get_content_id_from_job_id(request_id, workload_id, transform_id, job_id, inputs): + jobid_content_id_map, to_update_jobid = get_jobid_content_id_map(request_id, workload_id, transform_id, job_id, inputs) + + if str(job_id) in jobid_content_id_map: + content_id = jobid_content_id_map[str(job_id)] + else: + content_id = None + return content_id, to_update_jobid + + +def handle_messages_processing(messages): + logger = get_logger() + log_prefix = "" + + update_processings = [] + terminated_processings = [] + update_contents = [] + + for ori_msg in messages: + msg = json.loads(ori_msg) + if 'taskid' not in msg or not msg['taskid']: + continue + + workload_id = msg['taskid'] + ret_req_tf_pr_id = get_workload_id_transform_id_map(workload_id) + if not ret_req_tf_pr_id: + # request is submitted by some other instances + continue + + logger.debug(log_prefix + "Received message: %s" % str(ori_msg)) + + if msg['msg_type'] in ['task_status']: + workload_id = msg['taskid'] + status = msg['status'] + if status in ['pending']: # 'prepared' + req_id, tf_id, processing_id = ret_req_tf_pr_id + # new_processings.append((req_id, tf_id, processing_id, workload_id, status)) + if processing_id not in update_processings: + update_processings.append(processing_id) + elif status in ['finished', 'done']: + req_id, tf_id, processing_id = ret_req_tf_pr_id + # update_processings.append((processing_id, status)) + if processing_id not in update_processings: + terminated_processings.append(processing_id) + + if msg['msg_type'] in ['job_status']: + workload_id = msg['taskid'] + job_id = msg['jobid'] + status = msg['status'] + inputs = msg['inputs'] + if inputs and status in ['finished']: + req_id, tf_id, processing_id = ret_req_tf_pr_id + content_id, to_update_jobid = get_content_id_from_job_id(req_id, workload_id, tf_id, job_id, inputs) + if content_id: + if to_update_jobid: + u_content = {'content_id': content_id, + # 'status': get_content_status_from_panda_msg_status(status), + 'substatus': get_content_status_from_panda_msg_status(status), + 'content_metadata': {'panda_id': job_id}} + else: + u_content = {'content_id': content_id, + 'substatus': get_content_status_from_panda_msg_status(status)} + # # 'status': 
get_content_status_from_panda_msg_status(status)} + + update_contents.append(u_content) + if processing_id not in update_processings: + update_processings.append(processing_id) + + return update_processings, terminated_processings, update_contents, [] + + +def sync_collection_status(request_id, transform_id, workload_id, work, input_output_maps=None, + close_collection=False, force_close_collection=False, terminate=False): + if input_output_maps is None: + input_output_maps = get_input_output_maps(transform_id, work) + + all_updates_flushed = True + coll_status = {} + for map_id in input_output_maps: + inputs = input_output_maps[map_id]['inputs'] if 'inputs' in input_output_maps[map_id] else [] + # inputs_dependency = input_output_maps[map_id]['inputs_dependency'] if 'inputs_dependency' in input_output_maps[map_id] else [] + outputs = input_output_maps[map_id]['outputs'] if 'outputs' in input_output_maps[map_id] else [] + logs = input_output_maps[map_id]['logs'] if 'logs' in input_output_maps[map_id] else [] + + for content in inputs + outputs + logs: + if content['coll_id'] not in coll_status: + coll_status[content['coll_id']] = {'total_files': 0, 'processed_files': 0, 'processing_files': 0, 'bytes': 0} + coll_status[content['coll_id']]['total_files'] += 1 + + if content['status'] in [ContentStatus.Available, ContentStatus.Mapped, + ContentStatus.Available.value, ContentStatus.Mapped.value, + ContentStatus.FakeAvailable, ContentStatus.FakeAvailable.value]: + coll_status[content['coll_id']]['processed_files'] += 1 + coll_status[content['coll_id']]['bytes'] += content['bytes'] + else: + coll_status[content['coll_id']]['processing_files'] += 1 + + if content['status'] != content['substatus']: + all_updates_flushed = False + + input_collections = work.get_input_collections(poll_externel=True) + output_collections = work.get_output_collections() + log_collections = work.get_log_collections() + + update_collections = [] + for coll in input_collections + output_collections + log_collections: + if coll.coll_id in coll_status: + if 'total_files' in coll.coll_metadata and coll.coll_metadata['total_files']: + coll.total_files = coll.coll_metadata['total_files'] + else: + coll.total_files = coll_status[coll.coll_id]['total_files'] + coll.processed_files = coll_status[coll.coll_id]['processed_files'] + coll.processing_files = coll_status[coll.coll_id]['processing_files'] + coll.bytes = coll_status[coll.coll_id]['bytes'] + else: + coll.total_files = 0 + coll.processed_files = 0 + coll.processing_files = 0 + + u_coll = {'coll_id': coll.coll_id, + 'total_files': coll.total_files, + 'processed_files': coll.processed_files, + 'processing_files': coll.processing_files, + 'bytes': coll.bytes} + if terminate: + if force_close_collection or close_collection and all_updates_flushed or coll.status == CollectionStatus.Closed: + u_coll['status'] = CollectionStatus.Closed + u_coll['substatus'] = CollectionStatus.Closed + coll.status = CollectionStatus.Closed + coll.substatus = CollectionStatus.Closed + + update_collections.append(u_coll) + return update_collections, all_updates_flushed + + +def sync_work_status(request_id, transform_id, workload_id, work): + input_collections = work.get_input_collections() + output_collections = work.get_output_collections() + log_collections = work.get_log_collections() + + is_all_collections_closed = True + is_all_files_processed = True + is_all_files_failed = True + for coll in input_collections + output_collections + log_collections: + if coll.status != CollectionStatus.Closed: 
+ is_all_collections_closed = False + for coll in output_collections: + if coll.total_files != coll.processed_files: + is_all_files_processed = False + if coll.processed_files > 0: + is_all_files_failed = False + + if is_all_collections_closed: + if is_all_files_failed: + work.status = WorkStatus.Failed + elif is_all_files_processed: + work.status = WorkStatus.Finished + else: + work.status = WorkStatus.SubFinished + + +def sync_processing(processing, agent_attributes, terminate=False, logger=None, log_prefix=""): + logger = get_logger() + + request_id = processing['request_id'] + transform_id = processing['transform_id'] + workload_id = processing['workload_id'] + + proc = processing['processing_metadata']['processing'] + work = proc.work + work.set_agent_attributes(agent_attributes, processing) + + # input_output_maps = get_input_output_maps(transform_id, work) + update_collections, all_updates_flushed = sync_collection_status(request_id, transform_id, workload_id, work, + input_output_maps=None, close_collection=True, + terminate=terminate) + + messages = [] + sync_work_status(request_id, transform_id, workload_id, work) + logger.info(log_prefix + "sync_processing: work status: %s" % work.get_status()) + if terminate and work.is_terminated(): + messages = generate_messages(request_id, transform_id, workload_id, work, msg_type='work') + if work.is_finished(): + processing['status'] = ProcessingStatus.Finished + elif work.is_subfinished(): + processing['status'] = ProcessingStatus.SubFinished + elif work.is_failed(): + processing['status'] = ProcessingStatus.Failed + else: + processing['status'] = ProcessingStatus.SubFinished + return processing, update_collections, messages + + +def handle_abort_processing(processing, agent_attributes, logger=None, log_prefix=''): + logger = get_logger(logger) + + request_id = processing['request_id'] + transform_id = processing['transform_id'] + workload_id = processing['workload_id'] + + proc = processing['processing_metadata']['processing'] + work = proc.work + work.set_agent_attributes(agent_attributes, processing) + + work.abort_processing(processing, log_prefix=log_prefix) + + input_collections = work.get_input_collections() + output_collections = work.get_output_collections() + log_collections = work.get_log_collections() + + # input_output_maps = get_input_output_maps(transform_id, work) + update_collections, all_updates_flushed = sync_collection_status(request_id, transform_id, workload_id, work, + input_output_maps=None, close_collection=True, + force_close_collection=True) + + for coll in input_collections + output_collections + log_collections: + coll.status = CollectionStatus.Closed + coll.substatus = CollectionStatus.Closed + update_contents = [] + + # processing['status'] = ProcessingStatus.Cancelled + return processing, update_collections, update_contents + + +def reactive_contents(request_id, transform_id, workload_id, work, input_output_maps): + updated_contents = [] + contents = core_catalog.get_contents_by_request_transform(request_id=request_id, transform_id=transform_id) + for content in contents: + if content['status'] not in [ContentStatus.Available, ContentStatus.Mapped, + ContentStatus.Available.value, ContentStatus.Mapped.value, + ContentStatus.FakeAvailable, ContentStatus.FakeAvailable.value]: + u_content = {'content_id': content['content_id'], + 'substatus': ContentStatus.New, + 'status': ContentStatus.New} + updated_contents.append(u_content) + return updated_contents + + +def handle_resume_processing(processing, 
agent_attributes, logger=None, log_prefix=''):
+    logger = get_logger(logger)
+
+    request_id = processing['request_id']
+    transform_id = processing['transform_id']
+    workload_id = processing['workload_id']
+
+    proc = processing['processing_metadata']['processing']
+    work = proc.work
+    work.set_agent_attributes(agent_attributes, processing)
+
+    work.resume_processing(processing, log_prefix=log_prefix)
+
+    input_collections = work.get_input_collections()
+    output_collections = work.get_output_collections()
+    log_collections = work.get_log_collections()
+
+    update_collections = []
+    for coll in input_collections + output_collections + log_collections:
+        coll.status = CollectionStatus.Open
+        coll.substatus = CollectionStatus.Open
+        u_collection = {'coll_id': coll.coll_id,
+                        'status': CollectionStatus.Open,
+                        'substatus': CollectionStatus.Open}
+        update_collections.append(u_collection)
+
+    input_output_maps = get_input_output_maps(transform_id, work)
+    update_contents = reactive_contents(request_id, transform_id, workload_id, work, input_output_maps)
+
+    processing['status'] = ProcessingStatus.Processing
+    return processing, update_collections, update_contents
diff --git a/main/lib/idds/agents/clerk/clerk.py b/main/lib/idds/agents/clerk/clerk.py
index 8f6fea2e..a68374ee 100644
--- a/main/lib/idds/agents/clerk/clerk.py
+++ b/main/lib/idds/agents/clerk/clerk.py
@@ -9,27 +9,29 @@
 # - Wen Guan, , 2019 - 2022
 
 import datetime
+import random
 import time
 import traceback
-try:
-    # python 3
-    from queue import Queue
-except ImportError:
-    # Python 2
-    from Queue import Queue
 
 from idds.common import exceptions
 from idds.common.constants import (Sections, RequestStatus, RequestLocking,
-                                   TransformStatus, MessageType, MessageStatus,
-                                   ProcessingStatus,
-                                   MessageSource, MessageDestination,
-                                   ContentStatus, ContentRelationType)
+                                   TransformStatus, CommandType,
+                                   CommandStatus, CommandLocking)
 from idds.common.utils import setup_logging, truncate_string
 from idds.core import (requests as core_requests,
                        transforms as core_transforms,
-                       processings as core_processings,
-                       catalog as core_catalog)
+                       commands as core_commands)
 from idds.agents.common.baseagent import BaseAgent
+from idds.agents.common.eventbus.event import (EventType,
+                                               NewRequestEvent,
+                                               UpdateRequestEvent,
+                                               AbortRequestEvent,
+                                               ResumeRequestEvent,
+                                               NewTransformEvent,
+                                               UpdateTransformEvent,
+                                               AbortTransformEvent,
+                                               ResumeTransformEvent,
+                                               ExpireRequestEvent)
 
 setup_logging(__name__)
 
@@ -39,9 +41,9 @@ class Clerk(BaseAgent):
     Clerk works to process requests and converts requests to transforms.
""" - def __init__(self, num_threads=1, poll_time_period=10, retrieve_bulk_size=10, pending_time=None, **kwargs): - super(Clerk, self).__init__(num_threads=num_threads, **kwargs) - self.poll_time_period = int(poll_time_period) + def __init__(self, num_threads=1, poll_period=10, retrieve_bulk_size=10, pending_time=None, **kwargs): + super(Clerk, self).__init__(num_threads=num_threads, name='Clerk', **kwargs) + self.poll_period = int(poll_period) self.retrieve_bulk_size = int(retrieve_bulk_size) self.config_section = Sections.Clerk if pending_time: @@ -56,48 +58,63 @@ def __init__(self, num_threads=1, poll_time_period=10, retrieve_bulk_size=10, pe else: self.release_helper = False - self.new_task_queue = Queue() - self.new_output_queue = Queue() - self.running_task_queue = Queue() - self.running_output_queue = Queue() - self.new_processing_size = 0 - self.running_processing_size = 0 + if not hasattr(self, 'new_poll_period') or not self.new_poll_period: + self.new_poll_period = self.poll_period + else: + self.new_poll_period = int(self.new_poll_period) + if not hasattr(self, 'update_poll_period') or not self.update_poll_period: + self.update_poll_period = self.poll_period + else: + self.update_poll_period = int(self.update_poll_period) - def show_queue_size(self): - q_str = "new queue size: %s, processing size: %s, output queue size: %s, " % (self.new_task_queue.qsize(), - self.new_processing_size, - self.new_output_queue.qsize()) - q_str += "running queue size: %s, processing size: %s, output queue size: %s" % (self.running_task_queue.qsize(), - self.running_processing_size, - self.running_output_queue.qsize()) - self.logger.debug(q_str) + if hasattr(self, 'poll_period_increase_rate'): + self.poll_period_increase_rate = float(self.poll_period_increase_rate) + else: + self.poll_period_increase_rate = 2 - def generate_transform(self, req, work): - wf = req['request_metadata']['workflow'] + if hasattr(self, 'max_new_poll_period'): + self.max_new_poll_period = int(self.max_new_poll_period) + else: + self.max_new_poll_period = 3600 * 6 + if hasattr(self, 'max_update_poll_period'): + self.max_update_poll_period = int(self.max_update_poll_period) + else: + self.max_update_poll_period = 3600 * 6 - work.set_request_id(req['request_id']) - work.username = req['username'] + if not hasattr(self, 'new_command_poll_period') or not self.new_command_poll_period: + self.new_command_poll_period = 1 + else: + self.new_command_poll_period = int(self.new_command_poll_period) + if not hasattr(self, 'update_command_poll_period') or not self.update_command_poll_period: + self.update_command_poll_period = self.poll_period + else: + self.update_command_poll_period = int(self.update_command_poll_period) - new_transform = {'request_id': req['request_id'], - 'workload_id': req['workload_id'], - 'transform_type': work.get_work_type(), - 'transform_tag': work.get_work_tag(), - 'priority': req['priority'], - 'status': TransformStatus.New, - 'retries': 0, - # 'expired_at': req['expired_at'], - 'expired_at': None, - 'transform_metadata': {'internal_id': work.get_internal_id(), - 'template_work_id': work.get_template_work_id(), - 'sequence_id': work.get_sequence_id(), - 'work_name': work.get_work_name(), - 'work': work, - 'workflow': wf} - # 'running_metadata': {'work_data': new_work.get_running_data()} - # 'collections': related_collections - } + if hasattr(self, 'max_new_retries'): + self.max_new_retries = int(self.max_new_retries) + else: + self.max_new_retries = 3 + if hasattr(self, 'max_update_retries'): + 
self.max_update_retries = int(self.max_update_retries) + else: + # 0 or None means no limitations. + self.max_update_retries = 0 - return new_transform + self.number_workers = 0 + if not hasattr(self, 'max_number_workers') or not self.max_number_workers: + self.max_number_workers = 3 + else: + self.max_number_workers = int(self.max_number_workers) + + def is_ok_to_run_more_requests(self): + if self.number_workers >= self.max_number_workers: + return False + return True + + def show_queue_size(self): + if self.number_workers > 0: + q_str = "number of requests: %s, max number of requests: %s" % (self.number_workers, self.max_number_workers) + self.logger.debug(q_str) def get_new_requests(self): """ @@ -108,18 +125,24 @@ def get_new_requests(self): # reqs_open = core_requests.get_requests_by_status_type(status=req_status, time_period=3600) # self.logger.info("Main thread get %s TransformingOpen requests to process" % len(reqs_open)) - if self.new_task_queue.qsize() > 0 or self.new_output_queue.qsize() > 0: + if not self.is_ok_to_run_more_requests(): return [] self.show_queue_size() req_status = [RequestStatus.New, RequestStatus.Extend] reqs_new = core_requests.get_requests_by_status_type(status=req_status, locking=True, + not_lock=True, + new_poll=True, only_return_id=True, bulk_size=self.retrieve_bulk_size) - self.logger.debug("Main thread get %s [New+Extend] requests to process" % len(reqs_new)) + # self.logger.debug("Main thread get %s [New+Extend] requests to process" % len(reqs_new)) if reqs_new: - self.logger.info("Main thread get %s [New+Extend] requests to process" % len(reqs_new)) + self.logger.info("Main thread get [New+Extend] requests to process: %s" % str(reqs_new)) + + for req_id in reqs_new: + event = NewRequestEvent(publisher_id=self.id, request_id=req_id) + self.event_bus.send(event) return reqs_new except exceptions.DatabaseException as ex: @@ -131,9 +154,166 @@ def get_new_requests(self): self.logger.error(traceback.format_exc()) return [] - def process_new_request(self, req): + def get_running_requests(self): + """ + Get running requests + """ try: - self.logger.info("Processing request(%s)" % (req['request_id'])) + if not self.is_ok_to_run_more_requests(): + return [] + + self.show_queue_size() + + req_status = [RequestStatus.Transforming, RequestStatus.ToCancel, RequestStatus.Cancelling, + RequestStatus.ToSuspend, RequestStatus.Suspending, + RequestStatus.ToExpire, RequestStatus.Expiring, + RequestStatus.ToFinish, RequestStatus.ToForceFinish, + RequestStatus.ToResume, RequestStatus.Resuming] + reqs = core_requests.get_requests_by_status_type(status=req_status, time_period=None, + locking=True, bulk_size=self.retrieve_bulk_size, + not_lock=True, update_poll=True, only_return_id=True) + + # self.logger.debug("Main thread get %s Transforming requests to running" % len(reqs)) + if reqs: + self.logger.info("Main thread get Transforming requests to running: %s" % str(reqs)) + + for req_id in reqs: + event = UpdateRequestEvent(publisher_id=self.id, request_id=req_id) + self.event_bus.send(event) + + return reqs + except exceptions.DatabaseException as ex: + if 'ORA-00060' in str(ex): + self.logger.warn("(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") + else: + # raise ex + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + return [] + + def get_operation_requests(self): + """ + Get running requests + """ + try: + if not self.is_ok_to_run_more_requests(): + return [] + + self.show_queue_size() + + status = [CommandStatus.New] 
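+            # commands are fetched in two passes: ones not yet picked up (New) with
+            # new_command_poll_period, and ones already marked Processing with update_command_poll_period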
+ new_commands = core_commands.get_commands_by_status(status=status, locking=True, period=self.new_command_poll_period) + status = [CommandStatus.Processing] + processing_commands = core_commands.get_commands_by_status(status=status, locking=True, + period=self.update_command_poll_period) + commands = new_commands + processing_commands + + # self.logger.debug("Main thread get %s commands" % len(commands)) + if commands: + self.logger.info("Main thread get %s commands" % len(commands)) + + update_commands = [] + for cmd in commands: + request_id = cmd['request_id'] + cmd_content = cmd['cmd_content'] + cmd_type = cmd['cmd_type'] + cmd_status = cmd['status'] + event_content = {'request_id': request_id, + 'cmd_type': cmd_type, + 'cmd_id': cmd['cmd_id'], + 'cmd_content': cmd_content} + + event = None + if cmd_status in [CommandStatus.New, CommandStatus.Processing]: + if cmd_type in [CommandType.AbortRequest]: + event = AbortRequestEvent(publisher_id=self.id, request_id=request_id, content=event_content) + elif cmd_type in [CommandType.ResumeRequest]: + event = ResumeRequestEvent(publisher_id=self.id, request_id=request_id, content=event_content) + # elif cmd_status in [CommandStatus.Processing]: + # event = UpdateRequestEvent(publisher_id=self.id, request_id=request_id, content=event_content) + + if event: + self.event_bus.send(event) + + u_command = {'cmd_id': cmd['cmd_id'], + 'status': CommandStatus.Processing, + 'locking': CommandLocking.Idle} + update_commands.append(u_command) + else: + u_command = {'cmd_id': cmd['cmd_id'], + 'status': CommandStatus.UnknownCommand, + 'locking': CommandLocking.Idle} + update_commands.append(u_command) + core_commands.update_commands(update_commands) + return commands + except exceptions.DatabaseException as ex: + if 'ORA-00060' in str(ex): + self.logger.warn("(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") + else: + # raise ex + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + return [] + + def get_request(self, request_id, status=None, locking=False): + try: + return core_requests.get_request_by_id_status(request_id=request_id, status=status, locking=locking) + except exceptions.DatabaseException as ex: + if 'ORA-00060' in str(ex): + self.logger.warn("(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") + else: + # raise ex + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + return None + + def load_poll_period(self, req, parameters): + if self.new_poll_period and req['new_poll_period'] != self.new_poll_period: + parameters['new_poll_period'] = self.new_poll_period + if self.update_poll_period and req['update_poll_period'] != self.update_poll_period: + parameters['update_poll_period'] = self.update_poll_period + parameters['max_new_retries'] = req['max_new_retries'] if req['max_new_retries'] is not None else self.max_new_retries + parameters['max_update_retries'] = req['max_update_retries'] if req['max_update_retries'] is not None else self.max_update_retries + return parameters + + def generate_transform(self, req, work): + wf = req['request_metadata']['workflow'] + + work.set_request_id(req['request_id']) + work.username = req['username'] + + new_transform = {'request_id': req['request_id'], + 'workload_id': req['workload_id'], + 'transform_type': work.get_work_type(), + 'transform_tag': work.get_work_tag(), + 'priority': req['priority'], + 'status': TransformStatus.New, + 'retries': 0, + 'new_poll_period': self.new_poll_period, + 'update_poll_period': 
self.update_poll_period, + 'max_new_retries': req['max_new_retries'] if req['max_new_retries'] is not None else self.max_new_retries, + 'max_update_retries': req['max_update_retries'] if req['max_update_retries'] is not None else self.max_update_retries, + # 'expired_at': req['expired_at'], + 'expired_at': None, + 'transform_metadata': {'internal_id': work.get_internal_id(), + 'template_work_id': work.get_template_work_id(), + 'sequence_id': work.get_sequence_id(), + 'work_name': work.get_work_name(), + 'work': work, + 'workflow': wf} + # 'running_metadata': {'work_data': new_work.get_running_data()} + # 'collections': related_collections + } + + return new_transform + + def get_log_prefix(self, req): + return "" % req['request_id'] + + def handle_new_request(self, req): + try: + log_pre = self.get_log_prefix(req) + self.logger.info(log_pre + "Handle new request") workflow = req['request_metadata']['workflow'] # wf = workflow.copy() @@ -149,214 +329,157 @@ def process_new_request(self, req): transform = self.generate_transform(req, work) transforms.append(transform) - self.logger.debug("Processing request(%s): new transforms: %s" % (req['request_id'], - str(transforms))) + self.logger.debug(log_pre + "Processing request(%s): new transforms: %s" % (req['request_id'], + str(transforms))) # processing_metadata = req['processing_metadata'] # processing_metadata = {'workflow_data': wf.get_running_data()} ret_req = {'request_id': req['request_id'], 'parameters': {'status': RequestStatus.Transforming, 'locking': RequestLocking.Idle, - 'next_poll_at': datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_time_period), # 'processing_metadata': processing_metadata, 'request_metadata': req['request_metadata']}, 'new_transforms': transforms} + ret_req['parameters'] = self.load_poll_period(req, ret_req['parameters']) + self.logger.info(log_pre + "Handle new request result: %s" % str(ret_req)) except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) + retries = req['new_retries'] + 1 + if not req['max_new_retries'] or retries < req['max_new_retries']: + req_status = req['status'] + else: + req_status = RequestStatus.Failed + + # increase poll period + new_poll_period = int(req['new_poll_period'].total_seconds() * self.poll_period_increase_rate) + if new_poll_period > self.max_new_poll_period: + new_poll_period = self.max_new_poll_period + + error = {'submit_err': {'msg': truncate_string('%s' % (ex), length=200)}} + ret_req = {'request_id': req['request_id'], - 'parameters': {'status': RequestStatus.Failed, + 'parameters': {'status': req_status, 'locking': RequestLocking.Idle, - 'errors': {'msg': truncate_string('%s: %s' % (ex, traceback.format_exc()), length=800)}}} + 'new_retries': retries, + 'new_poll_period': new_poll_period, + 'errors': req['errors'] if req['errors'] else {}}} + ret_req['parameters']['errors'].update(error) + self.logger.warn(log_pre + "Handle new request error result: %s" % str(ret_req)) return ret_req - def process_new_requests(self): - """ - Process new request - """ - ret = [] - while not self.new_task_queue.empty(): - try: - req = self.new_task_queue.get() - if req: - self.new_processing_size += 1 - self.logger.info("Main thread processing new requst: %s" % req) - ret_req = self.process_new_request(req) - self.new_processing_size -= 1 - if ret_req: - # ret.append(ret_req) - self.new_output_queue.put(ret_req) - except Exception as ex: - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - return ret + def 
update_request(self, req): + new_tf_ids, update_tf_ids = [], [] + try: + log_pre = self.get_log_prefix(req) + self.logger.info(log_pre + "update request: %s" % req) + req['parameters']['locking'] = RequestLocking.Idle + req['parameters']['updated_at'] = datetime.datetime.utcnow() - def finish_new_requests(self): - while not self.new_output_queue.empty(): - try: - req = self.new_output_queue.get() - self.logger.info("Main thread finished processing requst: %s" % req) - req['parameters']['locking'] = RequestLocking.Idle + if 'new_transforms' in req: + new_transforms = req['new_transforms'] + else: + new_transforms = [] - if 'new_transforms' in req: - new_transforms = req['new_transforms'] - else: - new_transforms = [] + if 'update_transforms' in req: + update_transforms = req['update_transforms'] + else: + update_transforms = {} - if 'update_transforms' in req: - update_transforms = req['update_transforms'] - else: - update_transforms = {} - - retry = True - retry_num = 0 - while retry: - retry = False - retry_num += 1 - try: - core_requests.update_request_with_transforms(req['request_id'], req['parameters'], - new_transforms=new_transforms, - update_transforms=update_transforms) - except exceptions.DatabaseException as ex: - if 'ORA-00060' in str(ex): - self.logger.warn("(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") - if retry_num < 5: - retry = True - time.sleep(60 * retry_num * 2) + retry = True + retry_num = 0 + while retry: + retry = False + retry_num += 1 + try: + _, new_tf_ids, update_tf_ids = core_requests.update_request_with_transforms(req['request_id'], req['parameters'], + new_transforms=new_transforms, + update_transforms=update_transforms) + except exceptions.DatabaseException as ex: + if 'ORA-00060' in str(ex): + self.logger.warn("(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") + if retry_num < 5: + retry = True + if retry_num <= 1: + random_sleep = random.randint(1, 10) + elif retry_num <= 2: + random_sleep = random.randint(1, 60) else: - raise ex + random_sleep = random.randint(1, 120) + time.sleep(random_sleep) else: - # self.logger.error(ex) - # self.logger.error(traceback.format_exc()) raise ex + else: + # self.logger.error(ex) + # self.logger.error(traceback.format_exc()) + raise ex + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + try: + req_parameters = {'status': RequestStatus.Transforming, + 'locking': RequestLocking.Idle} + if 'new_retries' in req['parameters']: + req_parameters['new_retries'] = req['parameters']['new_retries'] + if 'update_retries' in req['parameters']: + req_parameters['update_retries'] = req['parameters']['update_retries'] + if 'errors' in req['parameters']: + req_parameters['errors'] = req['parameters']['errors'] + self.logger.warn(log_pre + "Update request in exception: %s" % str(req_parameters)) + core_requests.update_request_with_transforms(req['request_id'], req_parameters) except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) - try: - req_parameters = {'status': RequestStatus.Transforming, - 'locking': RequestLocking.Idle, - 'next_poll_at': datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_time_period)} - core_requests.update_request_with_transforms(req['request_id'], req_parameters) - except Exception as ex: - self.logger.error(ex) - self.logger.error(traceback.format_exc()) + return new_tf_ids, update_tf_ids - def get_running_requests(self): - """ - Get running requests - 
""" + def process_new_request(self, event): + self.number_workers += 1 try: - if self.running_task_queue.qsize() > 0 or self.running_output_queue.qsize() > 0: - return [] - - self.show_queue_size() - - req_status = [RequestStatus.Transforming, RequestStatus.ToCancel, RequestStatus.Cancelling, - RequestStatus.ToSuspend, RequestStatus.Suspending, - RequestStatus.ToExpire, RequestStatus.Expiring, - RequestStatus.ToFinish, RequestStatus.ToForceFinish, - RequestStatus.ToResume, RequestStatus.Resuming] - reqs = core_requests.get_requests_by_status_type(status=req_status, time_period=None, - locking=True, bulk_size=self.retrieve_bulk_size, - with_messaging=True) - - self.logger.debug("Main thread get %s Transforming requests to running" % len(reqs)) - if reqs: - self.logger.info("Main thread get %s Transforming requests to running" % len(reqs)) - return reqs - except exceptions.DatabaseException as ex: - if 'ORA-00060' in str(ex): - self.logger.warn("(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") - else: - # raise ex - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - return [] + if event: + req_status = [RequestStatus.New, RequestStatus.Extend] + req = self.get_request(request_id=event._request_id, status=req_status, locking=True) + if not req: + self.logger.error("Cannot find request for event: %s" % str(event)) + elif req: + log_pre = self.get_log_prefix(req) + ret = self.handle_new_request(req) + new_tf_ids, update_tf_ids = self.update_request(ret) + for tf_id in new_tf_ids: + self.logger.info(log_pre + "NewTransformEvent(transform_id: %s)" % str(tf_id)) + event = NewTransformEvent(publisher_id=self.id, transform_id=tf_id) + self.event_bus.send(event) + time.sleep(1) + for tf_id in update_tf_ids: + self.logger.info(log_pre + "UpdateTransformEvent(transform_id: %s)" % str(tf_id)) + event = UpdateTransformEvent(publisher_id=self.id, transform_id=tf_id) + self.event_bus.send(event) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + self.number_workers -= 1 - def get_message_for_update_request(self, req, req_status): - msg_content = {'command': 'update_request', - 'parameters': {'status': req_status}} - msg = {'msg_type': MessageType.IDDSCommunication, - 'status': MessageStatus.New, - 'destination': MessageDestination.Clerk, - 'source': MessageSource.Clerk, - 'request_id': req['request_id'], - 'workload_id': req['workload_id'], - 'transform_id': None, - 'num_contents': 1, - 'msg_content': msg_content} - return msg - - def get_message_for_update_transform(self, tf, tf_status): - msg_content = {'command': 'update_transform', - 'parameters': {'status': tf_status}} - msg = {'msg_type': MessageType.IDDSCommunication, - 'status': MessageStatus.New, - 'destination': MessageDestination.Transformer, - 'source': MessageSource.Clerk, - 'request_id': tf['request_id'], - 'workload_id': tf['workload_id'], - 'transform_id': tf['transform_id'], - 'num_contents': 1, - 'msg_content': msg_content} - return msg - - def get_message_for_update_processing(self, processing, processing_status): - msg_content = {'command': 'update_processing', - 'parameters': {'status': processing_status}} - msg = {'msg_type': MessageType.IDDSCommunication, - 'status': MessageStatus.New, - 'destination': MessageDestination.Carrier, - 'source': MessageSource.Clerk, - 'request_id': processing['request_id'], - 'workload_id': processing['workload_id'], - 'transform_id': processing['transform_id'], - 'processing_id': processing['processing_id'], 
- 'num_contents': 1, - 'msg_content': msg_content} - return msg - - def get_update_processing_messages(self, msg_id, request_id, parameters): - self.logger.info("get_update_processing_messages: msg_id: %s, request_id: %s, parameters: %s" % (msg_id, request_id, str(parameters))) - processings = core_processings.get_processings(request_id=request_id) - if not processings: - self.logger.error("get_update_processing_messages: msg_id: %s, request_id: %s, no processings. return" % (msg_id, request_id)) - return [] - processing_map = {} - processing_wids = [] - for processing in processings: - processing_map[processing['workload_id']] = processing - processing_wids.append(processing['workload_id']) - - msgs = [] - if type(parameters) in [list, tuple]: - for param in parameters: - workload_id, status = None, None - if 'workload_id' in param: - try: - workload_id = int(param['workload_id']) - except: - pass - if 'status' in param: - status = param['status'] - - if workload_id is None or status is None or status not in [ProcessingStatus.ToCancel, ProcessingStatus.ToSuspend, ProcessingStatus.ToResume]: - self.logger.error("get_update_processing_messages: msg_id %s, workload_id (%s) is None or status is None or status(%s) is not in [ToCancel, ToSuspend, ToResume], ignore." % (msg_id, workload_id, status)) - continue - if workload_id not in processing_map: - self.logger.error("get_update_processing_messages: msg_id %s, workload_id %s cannot be found in request %s(processings: %s)" % (msg_id, workload_id, request_id, str(processing_wids))) - continue - msg = self.get_message_for_update_processing(processing_map[workload_id], status) - msgs.append(msg) - return msgs - - def process_running_request_real(self, req): + def handle_update_request_real(self, req, event): """ process running request """ - self.logger.info("process_running_request: request_id: %s" % req['request_id']) + log_pre = self.get_log_prefix(req) + self.logger.info(log_pre + " handle_update_request: request_id: %s" % req['request_id']) wf = req['request_metadata']['workflow'] + to_abort = False + to_abort_transform_id = None + if (event and event._content and 'cmd_type' in event._content and event._content['cmd_type'] + and event._content['cmd_type'] in [CommandType.AbortRequest, CommandType.ExpireRequest]): # noqa W503 + to_abort = True + self.logger.info(log_pre + "to_abort: %s" % to_abort) + if (event and event._content and 'cmd_content' in event._content and event._content['cmd_content'] + and 'transform_id' in event._content['cmd_content']): # noqa W503 + to_abort_transform_id = event._content['cmd_content']['transform_id'] + self.logger.info(log_pre + "to_abort_transform_id: %s" % to_abort_transform_id) + + if to_abort and not to_abort_transform_id: + wf.to_cancel = True + # current works works = wf.get_all_works() # print(works) @@ -367,11 +490,12 @@ def process_running_request_real(self, req): transform_work = tf['transform_metadata']['work'] # work_status = WorkStatus(tf['status'].value) # work.set_status(work_status) - work.sync_work_data(status=tf['status'], substatus=tf['substatus'], work=transform_work) + work.sync_work_data(status=tf['status'], substatus=tf['substatus'], work=transform_work, workload_id=tf['workload_id']) + self.logger.info(log_pre + "transform status: %s, work status: %s" % (tf['status'], work.status)) wf.refresh_works() new_transforms = [] - if req['status'] in [RequestStatus.Transforming]: + if req['status'] in [RequestStatus.Transforming] and not wf.to_cancel: # new works works = wf.get_new_works() for 
work in works: @@ -380,299 +504,322 @@ def process_running_request_real(self, req): new_work.add_proxy(wf.get_proxy()) new_transform = self.generate_transform(req, new_work) new_transforms.append(new_transform) - self.logger.debug("Processing request(%s): new transforms: %s" % (req['request_id'], str(new_transforms))) + self.logger.debug(log_pre + " Processing request(%s): new transforms: %s" % (req['request_id'], str(new_transforms))) - is_operation = False + req_status = RequestStatus.Transforming if wf.is_terminated(): if wf.is_finished(): req_status = RequestStatus.Finished - elif wf.is_subfinished(): - req_status = RequestStatus.SubFinished - elif wf.is_expired(): - req_status = RequestStatus.Expired - elif wf.is_failed(): - req_status = RequestStatus.Failed - elif wf.is_cancelled(): - req_status = RequestStatus.Cancelled - elif wf.is_suspended(): - req_status = RequestStatus.Suspended else: - req_status = RequestStatus.Failed - req_msg = wf.get_terminated_msg() + if to_abort and not to_abort_transform_id: + req_status = RequestStatus.Cancelled + elif wf.is_expired(): + req_status = RequestStatus.Expired + elif wf.is_subfinished(): + req_status = RequestStatus.SubFinished + elif wf.is_failed(): + req_status = RequestStatus.Failed + else: + req_status = RequestStatus.Failed + # req_msg = wf.get_terminated_msg() else: - req_msg = None - if req['status'] in [RequestStatus.ToSuspend, RequestStatus.Suspending]: - req_status = RequestStatus.Suspending - is_operation = True - elif req['status'] in [RequestStatus.ToCancel, RequestStatus.Cancelling]: - req_status = RequestStatus.Cancelling - is_operation = True - elif wf.is_to_expire(req['expired_at'], self.pending_time, request_id=req['request_id']): + if wf.is_to_expire(req['expired_at'], self.pending_time, request_id=req['request_id']): wf.expired = True - req_status = RequestStatus.ToExpire - is_operation = True - req_msg = "Workflow expired" - else: - req_status = RequestStatus.Transforming - - # processing_metadata['workflow_data'] = wf.get_running_data() - if not is_operation: - next_poll_at = datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_time_period) - else: - next_poll_at = datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_operation_time_period) + event_content = {'request_id': req['request_id'], + 'cmd_type': CommandType.ExpireRequest, + 'cmd_content': {}} + self.logger.debug(log_pre + "ExpireRequestEvent(request_id: %s" % req['request_id']) + event = ExpireRequestEvent(publisher_id=self.id, request_id=req['request_id'], content=event_content) + self.event_bus.send(event) parameters = {'status': req_status, 'locking': RequestLocking.Idle, - 'next_poll_at': next_poll_at, - 'request_metadata': req['request_metadata'], - 'errors': {'msg': truncate_string(req_msg, 800)}} - - new_messages = [] - if req_status == RequestStatus.ToExpire: - # parameters['substatus'] = req_status - new_message = self.get_message_for_update_request(req, req_status) - new_messages.append(new_message) + 'request_metadata': req['request_metadata'] + } + parameters = self.load_poll_period(req, parameters) ret = {'request_id': req['request_id'], 'parameters': parameters, - 'new_messages': new_messages, 'new_transforms': new_transforms} # 'update_transforms': update_transforms} + self.logger.info(log_pre + "Handle update request result: %s" % str(ret)) return ret - def process_running_request_message(self, req, messages): + def handle_update_request(self, req, event): """ - process running request message + process running request 
""" try: - self.logger.info("process_running_request_message: request_id: %s, messages: %s" % (req['request_id'], str(messages) if messages else messages)) - msg = messages[0] - message = messages[0]['msg_content'] - if message['command'] == 'update_request': - parameters_temp = message['parameters'] - parameters = {} - if 'status' not in parameters_temp or parameters_temp['status'] not in [RequestStatus.ToCancel, RequestStatus.ToSuspend, RequestStatus.ToResume]: - self.logger.error("process_running_request_message: message %s parameters cannot be accepted: %s" % (msg['msg_id'], parameters_temp)) - else: - parameters['status'] = parameters_temp['status'] - parameters['locking'] = RequestLocking.Idle - ret_req = {'request_id': req['request_id'], - 'parameters': parameters, - 'update_messages': [{'msg_id': msg['msg_id'], 'status': MessageStatus.Delivered}] - } - elif message['command'] == 'update_processing': - parameters = message['parameters'] - update_processing_messages = self.get_update_processing_messages(msg['msg_id'], req['request_id'], parameters) - ret_req = {'request_id': req['request_id'], - 'parameters': {'locking': RequestLocking.Idle}, - 'new_messages': update_processing_messages, - 'update_messages': [{'msg_id': msg['msg_id'], 'status': MessageStatus.Delivered}] - } - else: - self.logger.error("Unknown message: %s" % str(msg)) - ret_req = {'request_id': req['request_id'], - 'parameters': {'locking': RequestLocking.Idle}, - 'update_messages': [{'msg_id': msg['msg_id'], 'status': MessageStatus.Failed}] - } + # if self.release_helper: + # self.release_inputs(req['request_id']) + ret_req = self.handle_update_request_real(req, event) except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) + retries = req['update_retries'] + 1 + if not req['max_update_retries'] or retries < req['max_update_retries']: + req_status = req['status'] + else: + req_status = RequestStatus.Failed + error = {'update_err': {'msg': truncate_string('%s' % (ex), length=200)}} + + # increase poll period + update_poll_period = int(req['update_poll_period'].total_seconds() * self.poll_period_increase_rate) + if update_poll_period > self.max_update_poll_period: + update_poll_period = self.max_update_poll_period + ret_req = {'request_id': req['request_id'], - 'parameters': {'status': RequestStatus.Failed, + 'parameters': {'status': req_status, 'locking': RequestLocking.Idle, - 'errors': {'msg': truncate_string('%s: %s' % (ex, traceback.format_exc()), length=800)}}} + 'update_retries': retries, + 'update_poll_period': update_poll_period, + 'errors': req['errors'] if req['errors'] else {}}} + ret_req['parameters']['errors'].update(error) + log_pre = self.get_log_prefix(req) + self.logger.warn(log_pre + "Handle new request exception result: %s" % str(ret_req)) return ret_req - def release_inputs(self, request_id): - contents = core_catalog.get_contents(request_id=request_id, status=ContentStatus.Available) - ret_contents = {} - for content in contents: - if content['content_relation_type'] == ContentRelationType.Output: # InputDependency - if content['coll_id'] not in ret_contents: - ret_contents[content['coll_id']] = [] - ret_contents[content['coll_id']].append(content) - - updated_contents = core_transforms.release_inputs_by_collection(ret_contents) - - core_catalog.update_contents(updated_contents) + def process_update_request(self, event): + self.number_workers += 1 + try: + if event: + req_status = [RequestStatus.Transforming, RequestStatus.ToCancel, RequestStatus.Cancelling, + 
RequestStatus.ToSuspend, RequestStatus.Suspending, + RequestStatus.ToExpire, RequestStatus.Expiring, + RequestStatus.ToFinish, RequestStatus.ToForceFinish, + RequestStatus.ToResume, RequestStatus.Resuming] + + req = self.get_request(request_id=event._request_id, status=req_status, locking=True) + if not req: + self.logger.error("Cannot find request for event: %s" % str(event)) + else: + log_pre = self.get_log_prefix(req) + ret = self.handle_update_request(req, event=event) + new_tf_ids, update_tf_ids = self.update_request(ret) + for tf_id in new_tf_ids: + self.logger.info(log_pre + "NewTransformEvent(transform_id: %s)" % tf_id) + event = NewTransformEvent(publisher_id=self.id, transform_id=tf_id, content=event._content) + self.event_bus.send(event) + for tf_id in update_tf_ids: + self.logger.info(log_pre + "UpdateTransformEvent(transform_id: %s)" % tf_id) + event = UpdateTransformEvent(publisher_id=self.id, transform_id=tf_id, content=event._content) + self.event_bus.send(event) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + self.number_workers -= 1 - def process_running_request(self, req): + def handle_abort_request(self, req, event): """ - process running request + process abort request """ try: - if self.release_helper: - self.release_inputs(req['request_id']) - ret_req = self.process_running_request_real(req) + log_pre = self.get_log_prefix(req) + self.logger.info(log_pre + "handle_abort_request event: %s" % str(event)) + + to_abort = False + to_abort_transform_id = None + if (event and event._content and 'cmd_type' in event._content and event._content['cmd_type'] + and event._content['cmd_type'] in [CommandType.AbortRequest, CommandType.ExpireRequest]): # noqa W503 + to_abort = True + self.logger.info(log_pre + "to_abort: %s" % to_abort) + if (event and event._content and 'cmd_content' in event._content and event._content['cmd_content'] + and 'transform_id' in event._content['cmd_content']): # noqa W503 + to_abort_transform_id = event._content['cmd_content']['transform_id'] + self.logger.info(log_pre + "to_abort_transform_id: %s" % to_abort_transform_id) + + if to_abort and to_abort_transform_id: + req_status = req['status'] + else: + wf = req['request_metadata']['workflow'] + wf.to_cancel = True + req_status = RequestStatus.Cancelling + + ret_req = {'request_id': req['request_id'], + 'parameters': {'status': req_status, + 'locking': RequestLocking.Idle, + 'request_metadata': req['request_metadata']}, + } + self.logger.info(log_pre + "handle_abort_request result: %s" % str(ret_req)) + return ret_req except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) + error = {'abort_err': {'msg': truncate_string('%s' % (ex), length=200)}} ret_req = {'request_id': req['request_id'], - 'parameters': {'status': RequestStatus.Failed, + 'parameters': {'status': RequestStatus.ToCancel, 'locking': RequestLocking.Idle, - 'errors': {'msg': truncate_string('%s: %s' % (ex, traceback.format_exc()), length=800)}}} + 'errors': req['errors'] if req['errors'] else {}}} + ret_req['parameters']['errors'].update(error) + self.logger.info(log_pre + "handle_abort_request exception result: %s" % str(ret_req)) return ret_req - def process_operating_request_real(self, req): + def handle_command(self, event, cmd_status, errors=None): + if (event and event._content and 'cmd_id' in event._content and event._content['cmd_id']): + u_command = {'cmd_id': event._content['cmd_id'], + 'status': cmd_status, + 'locking': CommandLocking.Idle} + if 
errors: + u_command['errors'] = errors + core_commands.update_commands([u_command]) + + def process_abort_request(self, event): + self.number_workers += 1 + try: + if event: + req = self.get_request(request_id=event._request_id, locking=True) + if not req: + self.logger.error("Cannot find request for event: %s" % str(event)) + else: + log_pre = self.get_log_prefix(req) + self.logger.info(log_pre + "process_abort_request event: %s" % str(event)) + + if req['status'] in [RequestStatus.Finished, RequestStatus.SubFinished, + RequestStatus.Failed, RequestStatus.Cancelled, + RequestStatus.Suspended, RequestStatus.Expired]: + ret = {'request_id': req['request_id'], + 'parameters': {'locking': RequestLocking.Idle, + 'errors': {'extra_msg': "Request is already terminated. Cannot be aborted"}}} + if req['errors'] and 'msg' in req['errors']: + ret['parameters']['errors']['msg'] = req['errors']['msg'] + self.logger.info(log_pre + "process_abort_request result: %s" % str(ret)) + self.update_request(ret) + self.handle_command(event, cmd_status=CommandStatus.Failed, errors="Request is already terminated. Cannot be aborted") + else: + ret = self.handle_abort_request(req, event) + self.logger.info(log_pre + "process_abort_request result: %s" % str(ret)) + self.update_request(ret) + to_abort_transform_id = None + if event and event._content and event._content['cmd_content'] and 'transform_id' in event._content['cmd_content']: + to_abort_transform_id = event._content['cmd_content']['transform_id'] + + wf = req['request_metadata']['workflow'] + works = wf.get_all_works() + if works: + for work in works: + if not work.is_terminated(): + if not to_abort_transform_id or to_abort_transform_id == work.get_work_id(): + self.logger.info(log_pre + "AbortTransformEvent(transform_id: %s)" % str(work.get_work_id())) + event = AbortTransformEvent(publisher_id=self.id, + transform_id=work.get_work_id(), + content=event._content) + self.event_bus.send(event) + else: + # no works. 
should trigger update request + self.logger.info(log_pre + "UpdateRequestEvent(request_id: %s)" % str(req['request_id'])) + event = UpdateRequestEvent(publisher_id=self.id, request_id=req['request_id'], content=event._content) + self.event_bus.send(event) + + self.handle_command(event, cmd_status=CommandStatus.Processed, errors=None) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + self.number_workers -= 1 + + def handle_resume_request(self, req): """ - process ToCancel/ToSuspend/ToResume/ToExpire request + process resume request """ - if req['status'] == RequestStatus.ToCancel: - tf_status = TransformStatus.ToCancel - req_status = RequestStatus.Cancelling - elif req['status'] == RequestStatus.ToSuspend: - tf_status = TransformStatus.ToSuspend - req_status = RequestStatus.Suspending - elif req['status'] == RequestStatus.ToResume: - tf_status = TransformStatus.ToResume + try: req_status = RequestStatus.Resuming - elif req['status'] == RequestStatus.ToExpire: - tf_status = TransformStatus.ToExpire - req_status = RequestStatus.Expiring - elif req['status'] == RequestStatus.ToFinish: - tf_status = TransformStatus.ToFinish - req_status = RequestStatus.Transforming - elif req['status'] == RequestStatus.ToForceFinish: - tf_status = TransformStatus.ToForceFinish - req_status = RequestStatus.Transforming - - processing_metadata = req['processing_metadata'] - wf = req['request_metadata']['workflow'] - if req['status'] == RequestStatus.ToResume: - wf.resume_works() + processing_metadata = req['processing_metadata'] - new_messages = [] - tfs = core_transforms.get_transforms(request_id=req['request_id']) - for tf in tfs: - try: - # core_transforms.update_transform(transform_id=tf['transform_id'], parameters={'substatus': tf_status}) - msg = self.get_message_for_update_transform(tf, tf_status) - new_messages.append(msg) - if tf_status in [TransformStatus.ToResume]: - # duplicate the messages for ToResume - new_messages.append(msg) - except Exception as ex: - self.logger.warn("Failed to add messages for tranform %s, record it for later update: %s" % (tf['transform_id'], str(ex))) - - # processing_metadata['workflow_data'] = wf.get_running_data() - - ret_req = {'request_id': req['request_id'], - 'parameters': {'status': req_status, - 'next_poll_at': datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_operation_time_period), - 'request_metadata': req['request_metadata'], - 'processing_metadata': processing_metadata, - 'locking': RequestLocking.Idle}, - # 'update_transforms': tfs_status - 'update_transforms': {}, - 'new_messages': new_messages - } - return ret_req + wf = req['request_metadata']['workflow'] + wf.resume_works() - def process_operating_request(self, req): - """ - process ToCancel/ToSuspend/ToResume/ToExpire request - """ - try: - ret_req = self.process_operating_request_real(req) + ret_req = {'request_id': req['request_id'], + 'parameters': {'status': req_status, + 'request_metadata': req['request_metadata'], + 'processing_metadata': processing_metadata, + 'locking': RequestLocking.Idle}, + } + return ret_req except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) + error = {'abort_err': {'msg': truncate_string('%s' % (ex), length=200)}} ret_req = {'request_id': req['request_id'], - 'parameters': {'status': RequestStatus.Failed, + 'parameters': {'status': RequestStatus.ToResume, 'locking': RequestLocking.Idle, - 'errors': {'msg': truncate_string('%s: %s' % (ex, traceback.format_exc()), length=800)}}} + 
'errors': req['errors'] if req['errors'] else {}}} + ret_req['parameters']['errors'].update(error) return ret_req - def process_running_requests(self): - """ - Process running request - """ - ret = [] - while not self.running_task_queue.empty(): - try: - req = self.running_task_queue.get() - if req: - self.running_processing_size += 1 - if req['status'] in [RequestStatus.ToCancel, RequestStatus.ToSuspend, - RequestStatus.ToResume, RequestStatus.ToExpire, - RequestStatus.ToFinish, RequestStatus.ToForceFinish]: - self.logger.info("Main thread processing operating requst: %s" % req) - ret_req = self.process_operating_request(req) + def process_resume_request(self, event): + self.number_workers += 1 + try: + if event: + req = self.get_request(request_id=event._request_id, locking=True) + if not req: + self.logger.error("Cannot find request for event: %s" % str(event)) + else: + log_pre = self.get_log_prefix(req) + self.logger.info(log_pre + "process_resume_request event: %s" % str(event)) + + if req['status'] in [RequestStatus.Finished]: + ret = {'request_id': req['request_id'], + 'parameters': {'locking': RequestLocking.Idle, + 'errors': {'extra_msg': "Request is already finished. Cannot be resumed"}}} + if req['errors'] and 'msg' in req['errors']: + ret['parameters']['errors']['msg'] = req['errors']['msg'] + self.logger.info(log_pre + "process_resume_request result: %s" % str(ret)) + + self.update_request(ret) + self.handle_command(event, cmd_status=CommandStatus.Failed, errors="Request is already finished. Cannot be resumed") else: - msgs = self.get_request_message(request_id=req['request_id'], bulk_size=1) - if msgs: - self.logger.info("Main thread processing running requst with message: %s" % req) - ret_req = self.process_running_request_message(req, msgs) + ret = self.handle_resume_request(req) + self.logger.info(log_pre + "process_resume_request result: %s" % str(ret)) + + self.update_request(ret) + wf = req['request_metadata']['workflow'] + works = wf.get_all_works() + if works: + for work in works: + # if not work.is_finished(): + self.logger.info(log_pre + "ResumeTransformEvent(transform_id: %s)" % str(work.get_work_id())) + event = ResumeTransformEvent(publisher_id=self.id, + transform_id=work.get_work_id(), + content=event._content) + self.event_bus.send(event) else: - self.logger.info("Main thread processing running requst: %s" % req) - ret_req = self.process_running_request(req) - self.running_processing_size -= 1 - - if ret_req: - # ret.append(ret_req) - self.running_output_queue.put(ret_req) - except Exception as ex: - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - return ret - - def finish_running_requests(self): - while not self.running_output_queue.empty(): - try: - req = self.running_output_queue.get() - self.logger.info("finish_running_requests: req: %s" % req) - req['parameters']['locking'] = RequestLocking.Idle - - if 'new_transforms' in req: - new_transforms = req['new_transforms'] - else: - new_transforms = [] + self.logger.info(log_pre + "UpdateRequestEvent(request_id: %s)" % str(req['request_id'])) + event = UpdateRequestEvent(publisher_id=self.id, request_id=req['request_id'], content=event._content) + self.event_bus.send(event) - if 'update_transforms' in req: - update_transforms = req['update_transforms'] - else: - update_transforms = {} - - retry = True - retry_num = 0 - while retry: - retry = False - retry_num += 1 - try: - core_requests.update_request_with_transforms(req['request_id'], req['parameters'], - new_transforms=new_transforms, - 
update_transforms=update_transforms, - new_messages=req.get('new_messages', None), - update_messages=req.get('update_messages', None)) - - except exceptions.DatabaseException as ex: - if 'ORA-00060' in str(ex): - self.logger.warn("(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") - if retry_num < 5: - retry = True - time.sleep(60 * retry_num * 2) - else: - raise ex - else: - # self.logger.error(ex) - # self.logger.error(traceback.format_exc()) - raise ex - except Exception as ex: - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - try: - req_parameters = {'status': RequestStatus.Transforming, - 'locking': RequestLocking.Idle, - 'next_poll_at': datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_time_period)} - core_requests.update_request_with_transforms(req['request_id'], req_parameters) - except Exception as ex: - self.logger.error(ex) - self.logger.error(traceback.format_exc()) + self.handle_command(event, cmd_status=CommandStatus.Processed, errors=None) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + self.number_workers -= 1 def clean_locks(self): self.logger.info("clean locking") core_requests.clean_locking() + def init_event_function_map(self): + self.event_func_map = { + EventType.NewRequest: { + 'pre_check': self.is_ok_to_run_more_requests, + 'exec_func': self.process_new_request + }, + EventType.UpdateRequest: { + 'pre_check': self.is_ok_to_run_more_requests, + 'exec_func': self.process_update_request + }, + EventType.AbortRequest: { + 'pre_check': self.is_ok_to_run_more_requests, + 'exec_func': self.process_abort_request + }, + EventType.ExpireRequest: { + 'pre_check': self.is_ok_to_run_more_requests, + 'exec_func': self.process_abort_request + }, + EventType.ResumeRequest: { + 'pre_check': self.is_ok_to_run_more_requests, + 'exec_func': self.process_resume_request + } + } + def run(self): """ Main run function. 
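The event_func_map registered in init_event_function_map above is consumed by the execute_event_schedule loop added to BaseAgent in the baseagent.py diff further down: for each EventType the pre_check gate (here the worker-count throttle) decides whether an event may be taken from the bus, and exec_func is then submitted to the executor pool. A minimal, self-contained sketch of that dispatch cycle follows; DummyBus and handle_new_request are stand-ins invented for the example, not iDDS APIs.

from enum import Enum
from queue import Empty, Queue


class EventType(Enum):
    NewRequest = 10
    UpdateRequest = 11


class DummyBus:
    """Stand-in event bus: one FIFO queue per event type."""
    def __init__(self):
        self._queues = {t: Queue() for t in EventType}

    def send(self, event_type, payload):
        self._queues[event_type].put(payload)

    def get(self, event_type):
        try:
            return self._queues[event_type].get_nowait()
        except Empty:
            return None


def handle_new_request(request_id):
    print("processing new request", request_id)


number_workers, max_number_workers = 0, 3
event_func_map = {
    EventType.NewRequest: {
        'pre_check': lambda: number_workers < max_number_workers,
        'exec_func': handle_new_request,
    },
}

bus = DummyBus()
bus.send(EventType.NewRequest, 12345)

# One pass of the scheduling loop, mirroring execute_event_schedule():
for event_type, funcs in event_func_map.items():
    if funcs['pre_check']():
        payload = bus.get(event_type)
        if payload is not None:
            funcs['exec_func'](payload)   # the base agent submits this to its executor pool

In the patched agent the bus is the EventBus singleton defined in eventbus.py below and exec_func runs in a thread pool, which is why number_workers is incremented and decremented inside the handlers themselves (process_new_request, process_update_request, process_abort_request, process_resume_request).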
@@ -684,24 +831,14 @@ def run(self): self.add_default_tasks() - task = self.create_task(task_func=self.get_new_requests, task_output_queue=self.new_task_queue, task_args=tuple(), task_kwargs={}, delay_time=60, priority=1) - self.add_task(task) - for _ in range(self.num_threads): - # task = self.create_task(task_func=self.process_new_requests, task_output_queue=self.new_output_queue, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) - task = self.create_task(task_func=self.process_new_requests, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) - self.add_task(task) - task = self.create_task(task_func=self.finish_new_requests, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) - self.add_task(task) + self.init_event_function_map() - task = self.create_task(task_func=self.get_running_requests, task_output_queue=self.running_task_queue, task_args=tuple(), task_kwargs={}, delay_time=60, priority=1) + task = self.create_task(task_func=self.get_new_requests, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=10, priority=1) self.add_task(task) - for _ in range(self.num_threads): - # task = self.create_task(task_func=self.process_running_requests, task_output_queue=self.running_output_queue, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) - task = self.create_task(task_func=self.process_running_requests, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) - self.add_task(task) - task = self.create_task(task_func=self.finish_running_requests, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) + task = self.create_task(task_func=self.get_running_requests, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=60, priority=1) + self.add_task(task) + task = self.create_task(task_func=self.get_operation_requests, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=10, priority=1) self.add_task(task) - task = self.create_task(task_func=self.clean_locks, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=1800, priority=1) self.add_task(task) diff --git a/main/lib/idds/agents/common/baseagent.py b/main/lib/idds/agents/common/baseagent.py index 593796e5..5a5d859c 100644 --- a/main/lib/idds/agents/common/baseagent.py +++ b/main/lib/idds/agents/common/baseagent.py @@ -10,7 +10,9 @@ import os import socket +import traceback import threading +import uuid from idds.common.constants import Sections from idds.common.constants import (MessageType, MessageTypeStr, @@ -20,6 +22,8 @@ from idds.common.utils import setup_logging from idds.core import health as core_health, messages as core_messages from idds.agents.common.timerscheduler import TimerScheduler +from idds.agents.common.eventbus.eventbus import EventBus +from idds.agents.common.cache.redis import get_redis_cache setup_logging(__name__) @@ -30,12 +34,12 @@ class BaseAgent(TimerScheduler, PluginBase): The base IDDS agent class """ - def __init__(self, num_threads=1, **kwargs): - super(BaseAgent, self).__init__(num_threads) + def __init__(self, num_threads=1, name=None, **kwargs): + super(BaseAgent, self).__init__(num_threads, name=name) self.name = self.__class__.__name__ + self.id = str(uuid.uuid4())[:8] self.logger = None - self.setup_logger() - self.set_logger(self.logger) + self.setup_logger(self.logger) self.config_section = Sections.Common @@ -57,6 +61,14 @@ def __init__(self, num_threads=1, **kwargs): 
self.logger.info("agent_attributes: %s" % self.agent_attributes) + self.event_bus = EventBus() + self.event_func_map = {} + + self.cache = get_redis_cache() + + def get_event_bus(self): + self.event_bus + def get_name(self): return self.name @@ -89,6 +101,31 @@ def load_plugins(self): raise AgentPluginError("Plugin %s is defined but it is not defined in plugin_sequence" % plugin_name) """ + def init_event_function_map(self): + self.event_func_map = {} + + def get_event_function_map(self): + return self.event_func_map + + def execute_event_schedule(self): + event_funcs = self.get_event_function_map() + for event_type in event_funcs: + exec_func = event_funcs[event_type]['exec_func'] + pre_check = event_funcs[event_type]['pre_check'] + if pre_check(): + event = self.event_bus.get(event_type) + if event: + self.executors.submit(exec_func, event) + + def execute(self): + while not self.graceful_stop.is_set(): + try: + self.execute_timer_schedule() + self.execute_event_schedule() + self.graceful_stop.wait(0.1) + except Exception as error: + self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) + def run(self): """ Main run function. @@ -101,6 +138,7 @@ def run(self): self.execute() except KeyboardInterrupt: self.stop() + self.event_bus.stop() def __call__(self): self.run() diff --git a/main/lib/idds/agents/conductor/plugins/__init__.py b/main/lib/idds/agents/common/cache/__init__.py similarity index 100% rename from main/lib/idds/agents/conductor/plugins/__init__.py rename to main/lib/idds/agents/common/cache/__init__.py diff --git a/main/lib/idds/agents/common/cache/redis.py b/main/lib/idds/agents/common/cache/redis.py new file mode 100644 index 00000000..cf46b825 --- /dev/null +++ b/main/lib/idds/agents/common/cache/redis.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2022 + +import logging +import uuid +import redis + +from idds.common.constants import Sections +from idds.common.config import config_has_section, config_list_options +from idds.common.utils import json_dumps, json_loads + + +class Singleton(object): + _instance = None + + def __new__(class_, *args, **kwargs): + if not isinstance(class_._instance, class_): + class_._instance = object.__new__(class_, *args, **kwargs) + class_._instance._initialized = False + return class_._instance + + +class RedisCache(Singleton): + """ + Redis cache + """ + + def __init__(self, logger=None): + if not self._initialized: + self._initialized = True + + super(RedisCache, self).__init__() + self._id = str(uuid.uuid4())[:8] + self.logger = logger + self.setup_logger(self.logger) + self.config_section = Sections.Cache + attrs = self.load_attributes() + if 'host' in attrs and attrs['host']: + self.host = attrs['host'] + else: + self.host = 'localhost' + if 'port' in attrs and attrs['port']: + self.port = int(attrs['port']) + else: + self.port = 6379 + self.cache = redis.Redis(host=self.host, port=self.port, db=0) + + def setup_logger(self, logger=None): + """ + Setup logger + """ + if logger: + self.logger = logger + else: + self.logger = logging.getLogger(self.get_class_name()) + + def get_class_name(self): + return self.__class__.__name__ + + def load_attributes(self): + self.logger.info("Loading config for section: %s" % self.config_section) + attrs = {} + if config_has_section(self.config_section): + options = config_list_options(self.config_section) + for option, value in options: + if isinstance(value, str) and value.lower() == 'true': + value = True + if isinstance(value, str) and value.lower() == 'false': + value = False + attrs[option] = value + return attrs + + def set(self, key, value, expire_seconds=21600): + value = json_dumps(value) + self.cache.set(key, value, ex=expire_seconds) + + def get(self, key, default=None): + value = self.cache.get(key) + if value: + value = json_loads(value) + if not value: + return default + return value + + def hset(self, key, value, expire_seconds=21600): + value = json_dumps(value) + self.cache.hset(key, value) + self.cache.expire(key, expire_seconds) + + def hget(self, key, default=None): + value = self.cache.hget(key) + if value: + value = json_loads(value) + if not value: + return default + return value + + +def get_redis_cache(): + cache = RedisCache() + return cache diff --git a/main/lib/idds/agents/common/eventbus/__init__.py b/main/lib/idds/agents/common/eventbus/__init__.py new file mode 100644 index 00000000..865b774e --- /dev/null +++ b/main/lib/idds/agents/common/eventbus/__init__.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2019 diff --git a/main/lib/idds/agents/common/eventbus/event.py b/main/lib/idds/agents/common/eventbus/event.py new file mode 100644 index 00000000..88e36ceb --- /dev/null +++ b/main/lib/idds/agents/common/eventbus/event.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2022 + +import time +import uuid + +from enum import Enum + +from idds.common.utils import json_dumps + + +class EventBusState(Enum): + New = 0 + Master = 1 + Slave = 2 + Unknown = 3 + + +class EventType(Enum): + Event = 0 + StateClaim = 1 + Demand = 2 + + NewRequest = 10 + UpdateRequest = 11 + AbortRequest = 12 + ResumeRequest = 13 + ExpireRequest = 14 + + NewTransform = 20 + UpdateTransform = 21 + AbortTransform = 22 + ResumeTransform = 23 + + NewProcessing = 30 + UpdateProcessing = 31 + AbortProcessing = 32 + ResumeProcessing = 33 + SyncProcessing = 34 + TerminatedProcessing = 35 + TriggerProcessing = 36 + + UpdateCommand = 40 + + +class Event(object): + def __init__(self, publisher_id, event_type=EventType.Event, content=None): + self._id = str(uuid.uuid4()) + self._publisher_id = publisher_id + self._event_type = event_type + self._timestamp = time.time() + self._content = content + + def to_json(self): + ret = {'id': self._id, 'publisher_id': self._publisher_id, + 'event_type': (self._event_type.name, self._event_type.value), + 'timestamp': self._timestamp, + 'content': self._content} + return ret + + def __str__(self): + return json_dumps(self.to_json()) + + +class StateClaimEvent(Event): + def __init__(self, publisher_id, event_bus_state, content=None): + super(StateClaimEvent, self).__init__(publisher_id, event_type=EventType.StateClaim, content=content) + self._event_bus_state = event_bus_state + + def to_json(self): + ret = super(StateClaimEvent, self).to_json() + ret['event_bus_state'] = self._event_bus_state + return ret + + +class DemandEvent(Event): + def __init__(self, publisher_id, demand_type, content=None): + super(DemandEvent, self).__init__(publisher_id, event_type=EventType.Demand, content=content) + self._demand_type = demand_type + + def to_json(self): + ret = super(DemandEvent, self).to_json() + ret['demand_type'] = self._demand_type + return ret + + +class NewRequestEvent(Event): + def __init__(self, publisher_id, request_id, content=None): + super(NewRequestEvent, self).__init__(publisher_id, event_type=EventType.NewRequest, content=content) + self._request_id = request_id + + def to_json(self): + ret = super(NewRequestEvent, self).to_json() + ret['request_id'] = self._request_id + return ret + + +class UpdateRequestEvent(Event): + def __init__(self, publisher_id, request_id, content=None): + super(UpdateRequestEvent, self).__init__(publisher_id, event_type=EventType.UpdateRequest, content=content) + self._request_id = request_id + + def to_json(self): + ret = super(UpdateRequestEvent, self).to_json() + ret['request_id'] = self._request_id + return ret + + +class AbortRequestEvent(Event): + def __init__(self, publisher_id, request_id, content=None): + super(AbortRequestEvent, self).__init__(publisher_id, event_type=EventType.AbortRequest, content=content) + self._request_id = request_id + + def to_json(self): + ret = super(AbortRequestEvent, self).to_json() + ret['request_id'] = self._request_id + return ret + + +class ResumeRequestEvent(Event): + def __init__(self, publisher_id, request_id, content=None): + super(ResumeRequestEvent, self).__init__(publisher_id, event_type=EventType.ResumeRequest, content=content) + self._request_id = request_id + + def to_json(self): + ret = super(ResumeRequestEvent, self).to_json() + ret['request_id'] = self._request_id + return ret + + +class ExpireRequestEvent(Event): + def __init__(self, publisher_id, 
request_id, content=None): + super(ExpireRequestEvent, self).__init__(publisher_id, event_type=EventType.ExpireRequest, content=content) + self._request_id = request_id + + def to_json(self): + ret = super(ExpireRequestEvent, self).to_json() + ret['request_id'] = self._request_id + return ret + + +class UpdateCommandEvent(Event): + def __init__(self, publisher_id, command_id, content=None): + super(UpdateCommandEvent, self).__init__(publisher_id, event_type=EventType.UpdateCommand, content=content) + self._command_id = command_id + + def to_json(self): + ret = super(UpdateCommandEvent, self).to_json() + ret['command_id'] = self._command_id + return ret + + +class NewTransformEvent(Event): + def __init__(self, publisher_id, transform_id, content=None): + super(NewTransformEvent, self).__init__(publisher_id, event_type=EventType.NewTransform, content=content) + self._transform_id = transform_id + + def to_json(self): + ret = super(NewTransformEvent, self).to_json() + ret['transform_id'] = self._transform_id + return ret + + +class UpdateTransformEvent(Event): + def __init__(self, publisher_id, transform_id, content=None): + super(UpdateTransformEvent, self).__init__(publisher_id, event_type=EventType.UpdateTransform, content=content) + self._transform_id = transform_id + + def to_json(self): + ret = super(UpdateTransformEvent, self).to_json() + ret['transform_id'] = self._transform_id + return ret + + +class AbortTransformEvent(Event): + def __init__(self, publisher_id, transform_id, content=None): + super(AbortTransformEvent, self).__init__(publisher_id, event_type=EventType.AbortTransform, content=content) + self._transform_id = transform_id + + def to_json(self): + ret = super(AbortTransformEvent, self).to_json() + ret['transform_id'] = self._transform_id + return ret + + +class ResumeTransformEvent(Event): + def __init__(self, publisher_id, transform_id, content=None): + super(ResumeTransformEvent, self).__init__(publisher_id, event_type=EventType.ResumeTransform, content=content) + self._transform_id = transform_id + + def to_json(self): + ret = super(ResumeTransformEvent, self).to_json() + ret['transform_id'] = self._transform_id + return ret + + +class NewProcessingEvent(Event): + def __init__(self, publisher_id, processing_id, content=None): + super(NewProcessingEvent, self).__init__(publisher_id, event_type=EventType.NewProcessing, content=content) + self._processing_id = processing_id + + def to_json(self): + ret = super(NewProcessingEvent, self).to_json() + ret['processing_id'] = self._processing_id + return ret + + +class UpdateProcessingEvent(Event): + def __init__(self, publisher_id, processing_id, content=None): + super(UpdateProcessingEvent, self).__init__(publisher_id, event_type=EventType.UpdateProcessing, content=content) + self._processing_id = processing_id + + def to_json(self): + ret = super(UpdateProcessingEvent, self).to_json() + ret['processing_id'] = self._processing_id + return ret + + +class AbortProcessingEvent(Event): + def __init__(self, publisher_id, processing_id, content=None): + super(AbortProcessingEvent, self).__init__(publisher_id, event_type=EventType.AbortProcessing, content=content) + self._processing_id = processing_id + + def to_json(self): + ret = super(AbortProcessingEvent, self).to_json() + ret['processing_id'] = self._processing_id + return ret + + +class ResumeProcessingEvent(Event): + def __init__(self, publisher_id, processing_id, content=None): + super(ResumeProcessingEvent, self).__init__(publisher_id, 
event_type=EventType.ResumeProcessing, content=content) + self._processing_id = processing_id + + def to_json(self): + ret = super(ResumeProcessingEvent, self).to_json() + ret['processing_id'] = self._processing_id + return ret + + +class SyncProcessingEvent(Event): + def __init__(self, publisher_id, processing_id, content=None): + super(SyncProcessingEvent, self).__init__(publisher_id, event_type=EventType.SyncProcessing, content=content) + self._processing_id = processing_id + + def to_json(self): + ret = super(SyncProcessingEvent, self).to_json() + ret['processing_id'] = self._processing_id + return ret + + +class TerminatedProcessingEvent(Event): + def __init__(self, publisher_id, processing_id, content=None): + super(TerminatedProcessingEvent, self).__init__(publisher_id, event_type=EventType.TerminatedProcessing, content=content) + self._processing_id = processing_id + + def to_json(self): + ret = super(TerminatedProcessingEvent, self).to_json() + ret['processing_id'] = self._processing_id + return ret + + +class TriggerProcessingEvent(Event): + def __init__(self, publisher_id, processing_id, content=None): + super(TriggerProcessingEvent, self).__init__(publisher_id, event_type=EventType.TriggerProcessing, content=content) + self._processing_id = processing_id + + def to_json(self): + ret = super(TriggerProcessingEvent, self).to_json() + ret['processing_id'] = self._processing_id + return ret diff --git a/main/lib/idds/agents/common/eventbus/eventbus.py b/main/lib/idds/agents/common/eventbus/eventbus.py new file mode 100644 index 00000000..45764ef1 --- /dev/null +++ b/main/lib/idds/agents/common/eventbus/eventbus.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2022 + +import logging +import uuid + +from idds.common.constants import Sections +from idds.common.config import config_has_section, config_list_options + +from .localeventbusbackend import LocalEventBusBackend + + +class Singleton(object): + _instance = None + + def __new__(class_, *args, **kwargs): + if not isinstance(class_._instance, class_): + class_._instance = object.__new__(class_, *args, **kwargs) + class_._instance._initialized = False + return class_._instance + + +class EventBus(Singleton): + """ + Event Bus + """ + + def __init__(self, logger=None): + if not self._initialized: + self._initialized = True + + super(EventBus, self).__init__() + self._id = str(uuid.uuid4())[:8] + self.setup_logger(logger) + self.config_section = Sections.EventBus + attrs = self.load_attributes() + if 'backend' in attrs and attrs['backend'] == 'message': + # ToBeDone + # self.backend = MsgEventBusBackend(**attrs) + pass + else: + self.backend = LocalEventBusBackend(logger=self.logger, **attrs) + + def setup_logger(self, logger=None): + """ + Setup logger + """ + if logger: + self.logger = logger + else: + self.logger = logging.getLogger(self.get_class_name()) + + def get_class_name(self): + return self.__class__.__name__ + + def load_attributes(self): + self.logger.info("Loading config for section: %s" % self.config_section) + attrs = {} + if config_has_section(self.config_section): + options = config_list_options(self.config_section) + for option, value in options: + if isinstance(value, str) and value.lower() == 'true': + value = True + if isinstance(value, str) and value.lower() == 'false': + value = False + attrs[option] = value + return attrs + + def publish_event(self, event): + self.backend.send(event) + + def get_event(self, event_type): + # demand_event = DemandEvent(event._event_type, self._id) + event = self.backend.get(event_type, wait=10) + return event + + def get(self, event_type): + return self.get_event(event_type) + + def send(self, event): + return self.publish_event(event) + + def stop(self): + self.backend.stop() diff --git a/main/lib/idds/agents/common/eventbus/localeventbusbackend.py b/main/lib/idds/agents/common/eventbus/localeventbusbackend.py new file mode 100644 index 00000000..5a276a6a --- /dev/null +++ b/main/lib/idds/agents/common/eventbus/localeventbusbackend.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
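Putting the bus and the event classes together, the intended call pattern for an agent is roughly the following. This is a sketch under assumptions: it presumes the patch is installed, that the optional [EventBus] configuration section may be absent (load_attributes() then returns an empty dict and the local backend is chosen), and it uses an arbitrary publisher_id string in place of an agent's real id.

from idds.agents.common.eventbus.eventbus import EventBus
from idds.agents.common.eventbus.event import EventType, NewTransformEvent

bus = EventBus()                                   # singleton: later EventBus() calls return this instance

# A producer (e.g. the Transformer main loop) queues work by id only.
bus.send(NewTransformEvent(publisher_id='transformer-01', transform_id=42))

# A consumer pulls the next event of that type; None is returned when the queue is empty.
event = bus.get(EventType.NewTransform)
if event:
    print(event.to_json()['transform_id'])         # -> 42

bus.stop()                                         # signals the local backend thread to stop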
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2022 + +import logging +import time +import threading +import traceback +import uuid + +from .event import StateClaimEvent, EventBusState + + +class LocalEventBusBackend(threading.Thread): + """ + Local Event Bus Backend + """ + + def __init__(self, logger=None, **kwargs): + super(LocalEventBusBackend, self).__init__() + self._id = str(uuid.uuid4())[:8] + self._state_claim_wait = 60 + self._state_claim = StateClaimEvent(self._id, EventBusState.New, time.time()) + + self.graceful_stop = threading.Event() + + self._events = {} + self._events_index = {} + + self._lock = threading.RLock() + + self.setup_logger(logger) + + def setup_logger(self, logger=None): + """ + Setup logger + """ + if logger: + self.logger = logger + else: + self.logger = logging.getLogger(self.get_class_name()) + + def get_class_name(self): + return self.__class__.__name__ + + def stop(self, signum=None, frame=None): + self.graceful_stop.set() + + def send(self, event): + with self._lock: + if event._event_type not in self._events: + self._events[event._event_type] = {} + self._events_index[event._event_type] = [] + self._events[event._event_type][event._id] = event + self._events_index[event._event_type].append(event._id) + + def get(self, event_type, wait=0): + with self._lock: + if event_type in self._events_index and self._events_index[event_type]: + event_id = self._events_index[event_type].pop(0) + event = self._events[event_type][event_id] + del self._events[event_type][event_id] + return event + return None + + def execute(self): + while not self.graceful_stop.is_set(): + try: + self.graceful_stop.wait(1) + except Exception as error: + self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) + + def run(self): + self.execute() diff --git a/main/lib/idds/agents/common/plugins/__init__.py b/main/lib/idds/agents/common/plugins/__init__.py new file mode 100644 index 00000000..865b774e --- /dev/null +++ b/main/lib/idds/agents/common/plugins/__init__.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
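LocalEventBusBackend is, at its core, one FIFO queue per event type guarded by an RLock: send() indexes the event by its id and appends the id to the per-type list, while get() pops the oldest id (the wait argument is currently unused). A stripped-down, self-contained model of that behaviour, using a tiny stand-in event so it runs without an idds installation:

import threading
import uuid


class FifoBackend(object):
    """Stand-in mirroring LocalEventBusBackend.send()/get() semantics."""

    def __init__(self):
        self._events = {}          # event_type -> {event_id: event}
        self._events_index = {}    # event_type -> [event_id, ...] in arrival order
        self._lock = threading.RLock()

    def send(self, event):
        with self._lock:
            self._events.setdefault(event._event_type, {})[event._id] = event
            self._events_index.setdefault(event._event_type, []).append(event._id)

    def get(self, event_type):
        with self._lock:
            if self._events_index.get(event_type):
                event_id = self._events_index[event_type].pop(0)
                return self._events[event_type].pop(event_id)
            return None


class StubEvent(object):
    def __init__(self, event_type, payload):
        self._id = str(uuid.uuid4())
        self._event_type = event_type
        self.payload = payload


backend = FifoBackend()
backend.send(StubEvent('NewTransform', 1))
backend.send(StubEvent('NewTransform', 2))
print(backend.get('NewTransform').payload)    # 1 -- oldest event first
print(backend.get('NewTransform').payload)    # 2
print(backend.get('NewTransform'))            # None -- queue drained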
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2019 diff --git a/main/lib/idds/agents/conductor/plugins/messaging.py b/main/lib/idds/agents/common/plugins/messaging.py similarity index 93% rename from main/lib/idds/agents/conductor/plugins/messaging.py rename to main/lib/idds/agents/common/plugins/messaging.py index 191830fa..28949b6f 100644 --- a/main/lib/idds/agents/conductor/plugins/messaging.py +++ b/main/lib/idds/agents/common/plugins/messaging.py @@ -34,26 +34,27 @@ def __init__(self, broker, output_queue): ''' __init__ ''' + self.name = "MessagingListener" self.__broker = broker self.__output_queue = output_queue self.logger = logging.getLogger(self.__class__.__name__) - def on_error(self, headers, body): + def on_error(self, frame): ''' Error handler ''' - self.logger.error('[broker] [%s]: %s', self.__broker, body) + self.logger.error('[broker] [%s]: %s', self.__broker, frame.body) - def on_message(self, headers, body): + def on_message(self, frame): # self.logger.info('[broker] [%s]: %s', self.__broker, body) - self.__output_queue.put(body) + self.__output_queue.put(frame.body) pass class MessagingSender(PluginBase, threading.Thread): - def __init__(self, **kwargs): - threading.Thread.__init__(self) - super(MessagingSender, self).__init__(**kwargs) + def __init__(self, name="MessagingSender", **kwargs): + threading.Thread.__init__(self, name=name) + super(MessagingSender, self).__init__(name=name, **kwargs) self.setup_logger() self.graceful_stop = threading.Event() @@ -147,8 +148,8 @@ def __call__(self): class MessagingReceiver(MessagingSender): - def __init__(self, **kwargs): - super(MessagingReceiver, self).__init__(**kwargs) + def __init__(self, name="MessagingReceiver", **kwargs): + super(MessagingReceiver, self).__init__(name=name, **kwargs) self.listener = None def get_listener(self, broker): diff --git a/main/lib/idds/agents/common/timerscheduler.py b/main/lib/idds/agents/common/timerscheduler.py index 86a40225..c99a4f5d 100644 --- a/main/lib/idds/agents/common/timerscheduler.py +++ b/main/lib/idds/agents/common/timerscheduler.py @@ -22,13 +22,14 @@ class TimerScheduler(threading.Thread): The base class to schedule Task which will be executed after some time """ - def __init__(self, num_threads, logger=None): - super(TimerScheduler, self).__init__() + def __init__(self, num_threads, name=None, logger=None): + super(TimerScheduler, self).__init__(name=name) self.num_threads = int(num_threads) if self.num_threads < 1: self.num_threads = 1 self.graceful_stop = threading.Event() - self.executors = futures.ThreadPoolExecutor(max_workers=self.num_threads) + self.executors = futures.ThreadPoolExecutor(max_workers=self.num_threads, + thread_name_prefix=name) self._task_queue = [] self._lock = threading.RLock() @@ -90,3 +91,11 @@ def execute_once(self): self.executors.submit(self.execute_task, task) except Exception as error: self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) + + def execute_timer_schedule(self): + try: + task = self.get_ready_task() + if task: + self.executors.submit(self.execute_task, task) + except Exception as error: + self.logger.critical("Caught an exception: %s\n%s" % (str(error), traceback.format_exc())) diff --git a/main/lib/idds/agents/conductor/conductor.py b/main/lib/idds/agents/conductor/conductor.py index 77959391..58787ae5 100644 --- a/main/lib/idds/agents/conductor/conductor.py +++ b/main/lib/idds/agents/conductor/conductor.py @@ -8,7 
+8,6 @@ # Authors: # - Wen Guan, , 2019 -import random import time import traceback try: @@ -34,8 +33,8 @@ class Conductor(BaseAgent): """ def __init__(self, num_threads=1, retrieve_bulk_size=1000, threshold_to_release_messages=None, - random_delay=None, **kwargs): - super(Conductor, self).__init__(num_threads=num_threads, **kwargs) + random_delay=None, delay=60, replay_times=3, **kwargs): + super(Conductor, self).__init__(num_threads=num_threads, name='Conductor', **kwargs) self.config_section = Sections.Conductor self.retrieve_bulk_size = int(retrieve_bulk_size) self.message_queue = Queue() @@ -50,6 +49,12 @@ def __init__(self, num_threads=1, retrieve_bulk_size=1000, threshold_to_release_ self.random_delay = int(random_delay) if self.random_delay < 5: self.random_delay = 5 + if delay is None: + delay = 60 + self.delay = int(delay) + if replay_times is None: + replay_times = 3 + self.replay_times = int(replay_times) def __del__(self): self.stop_notifier() @@ -62,17 +67,30 @@ def get_messages(self): bulk_size=self.retrieve_bulk_size, destination=MessageDestination.Outside) - self.logger.debug("Main thread get %s new messages" % len(messages)) + # self.logger.debug("Main thread get %s new messages" % len(messages)) if messages: self.logger.info("Main thread get %s new messages" % len(messages)) - return messages + retry_messages = [] + for retry in range(1, self.replay_times + 1): + delay = int(self.delay) * (retry ** 3) + + messages_d = core_messages.retrieve_messages(status=MessageStatus.Delivered, + retries=retry, delay=delay, + bulk_size=self.retrieve_bulk_size, + destination=MessageDestination.Outside) + if messages_d: + self.logger.info("Main thread get %s retries messages" % len(messages_d)) + retry_messages += messages_d + + return messages + retry_messages def clean_messages(self, msgs): # core_messages.delete_messages(msgs) to_updates = [] for msg in msgs: to_updates.append({'msg_id': msg['msg_id'], + 'retries': msg['retries'] + 1, 'status': MessageStatus.Delivered}) core_messages.update_messages(to_updates) @@ -112,22 +130,18 @@ def run(self): self.start_notifier() - self.add_health_message_task() + # self.add_health_message_task() while not self.graceful_stop.is_set(): # execute timer task self.execute_once() - reach_threshold = False try: num_contents = 0 messages = self.get_messages() for message in messages: num_contents += message['num_contents'] self.message_queue.put(message) - if self.threshold_to_release_messages and num_contents > self.threshold_to_release_messages: - reach_threshold = True - break while not self.message_queue.empty(): time.sleep(1) output_messages = self.get_output_messages() @@ -136,10 +150,7 @@ def run(self): self.logger.error("Main thread IDDSException: %s" % str(error)) except Exception as error: self.logger.critical("Main thread exception: %s\n%s" % (str(error), traceback.format_exc())) - if self.random_delay is None or not reach_threshold: - time.sleep(5) - else: - time.sleep(random.randint(5, self.random_delay)) + # time.sleep(random.randint(5, self.random_delay)) except KeyboardInterrupt: self.stop() diff --git a/main/lib/idds/agents/main.py b/main/lib/idds/agents/main.py index b386bb5f..f7de3440 100755 --- a/main/lib/idds/agents/main.py +++ b/main/lib/idds/agents/main.py @@ -32,7 +32,11 @@ 'marshaller': ['idds.agents.marshaller.marshaller.Marshaller', Sections.Marshaller], 'transformer': ['idds.agents.transformer.transformer.Transformer', Sections.Transformer], 'transporter': ['idds.agents.transporter.transporter.Transporter', 
Sections.Transporter], - 'carrier': ['idds.agents.carrier.carrier.Carrier', Sections.Carrier], + 'submitter': ['idds.agents.carrier.submitter.Submitter', Sections.Carrier], + 'poller': ['idds.agents.carrier.poller.Poller', Sections.Carrier], + 'receiver': ['idds.agents.carrier.receiver.Receiver', Sections.Carrier], + 'trigger': ['idds.agents.carrier.trigger.Trigger', Sections.Carrier], + 'finisher': ['idds.agents.carrier.finisher.Finisher', Sections.Carrier], 'conductor': ['idds.agents.conductor.conductor.Conductor', Sections.Conductor], 'consumer': ['idds.agents.conductor.consumer.Consumer', Sections.Consumer] } @@ -114,7 +118,7 @@ def stop(signum=None, frame=None): [thr.stop() for thr in RUNNING_AGENTS if thr and thr.is_alive()] stop_time = time.time() while len(RUNNING_AGENTS): - [thr.join(timeout=3.14) for thr in RUNNING_AGENTS if thr and thr.is_alive()] + [thr.join(timeout=1) for thr in RUNNING_AGENTS if thr and thr.is_alive()] RUNNING_AGENTS = [thr for thr in RUNNING_AGENTS if thr and thr.is_alive()] if time.time() > stop_time + 180: break @@ -123,7 +127,9 @@ def stop(signum=None, frame=None): [thr.terminate() for thr in RUNNING_AGENTS if thr and thr.is_alive()] while len(RUNNING_AGENTS): - [thr.join(timeout=3.14) for thr in RUNNING_AGENTS if thr and thr.is_alive()] + logging.info("Still running agents: %s" % str(RUNNING_AGENTS)) + [thr.terminate() for thr in RUNNING_AGENTS if thr and thr.is_alive()] + [thr.join(timeout=1) for thr in RUNNING_AGENTS if thr and thr.is_alive()] RUNNING_AGENTS = [thr for thr in RUNNING_AGENTS if thr and thr.is_alive()] diff --git a/main/lib/idds/agents/transformer/transformer.py b/main/lib/idds/agents/transformer/transformer.py index 1ca4b084..a54a4cba 100644 --- a/main/lib/idds/agents/transformer/transformer.py +++ b/main/lib/idds/agents/transformer/transformer.py @@ -6,30 +6,29 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 - 2021 +# - Wen Guan, , 2019 - 2022 import copy import datetime +import random import time import traceback -try: - # python 3 - from queue import Queue -except ImportError: - # Python 2 - from Queue import Queue from idds.common import exceptions -from idds.common.constants import (Sections, TransformStatus, TransformLocking, TransformType, - ContentRelationType, CollectionStatus, - ContentType, ContentStatus, - ProcessingStatus, MessageType, MessageTypeStr, - MessageStatus, MessageSource, MessageDestination) -from idds.common.utils import setup_logging +from idds.common.constants import (Sections, TransformStatus, TransformLocking, + CommandType, ProcessingStatus) +from idds.common.utils import setup_logging, truncate_string from idds.core import (transforms as core_transforms, - processings as core_processings, - catalog as core_catalog) + processings as core_processings) from idds.agents.common.baseagent import BaseAgent +from idds.agents.common.eventbus.event import (EventType, + NewTransformEvent, + UpdateTransformEvent, + AbortProcessingEvent, + ResumeProcessingEvent, + UpdateRequestEvent, + NewProcessingEvent, + UpdateProcessingEvent) setup_logging(__name__) @@ -39,51 +38,88 @@ class Transformer(BaseAgent): Transformer works to process transforms. 
""" - def __init__(self, num_threads=1, poll_time_period=1800, retrieve_bulk_size=10, + def __init__(self, num_threads=1, poll_period=1800, retries=3, retrieve_bulk_size=10, message_bulk_size=10000, **kwargs): - super(Transformer, self).__init__(num_threads=num_threads, **kwargs) + super(Transformer, self).__init__(num_threads=num_threads, name='Transformer', **kwargs) self.config_section = Sections.Transformer - self.poll_time_period = int(poll_time_period) + self.poll_period = int(poll_period) + self.retries = int(retries) self.retrieve_bulk_size = int(retrieve_bulk_size) self.message_bulk_size = int(message_bulk_size) - if not hasattr(self, 'retries') or not self.retries: - self.retries = 100 + if not hasattr(self, 'new_poll_period') or not self.new_poll_period: + self.new_poll_period = self.poll_period else: - self.retries = int(self.retries) + self.new_poll_period = int(self.new_poll_period) + if not hasattr(self, 'update_poll_period') or not self.update_poll_period: + self.update_poll_period = self.poll_period + else: + self.update_poll_period = int(self.update_poll_period) - self.new_task_queue = Queue() - self.new_output_queue = Queue() - self.running_task_queue = Queue() - self.running_output_queue = Queue() - self.new_processing_size = 0 - self.running_processing_size = 0 + if not hasattr(self, 'new_poll_period') or not self.new_poll_period: + self.new_poll_period = self.poll_period + else: + self.new_poll_period = int(self.new_poll_period) + if not hasattr(self, 'update_poll_period') or not self.update_poll_period: + self.update_poll_period = self.poll_period + else: + self.update_poll_period = int(self.update_poll_period) + + if hasattr(self, 'poll_period_increase_rate'): + self.poll_period_increase_rate = float(self.poll_period_increase_rate) + else: + self.poll_period_increase_rate = 2 + + if hasattr(self, 'max_new_poll_period'): + self.max_new_poll_period = int(self.max_new_poll_period) + else: + self.max_new_poll_period = 3600 * 6 + if hasattr(self, 'max_update_poll_period'): + self.max_update_poll_period = int(self.max_update_poll_period) + else: + self.max_update_poll_period = 3600 * 6 + + self.number_workers = 0 + if not hasattr(self, 'max_number_workers') or not self.max_number_workers: + self.max_number_workers = 3 + else: + self.max_number_workers = int(self.max_number_workers) + + def is_ok_to_run_more_transforms(self): + if self.number_workers >= self.max_number_workers: + return False + return True def show_queue_size(self): - q_str = "new queue size: %s, processing size: %s, output queue size: %s, " % (self.new_task_queue.qsize(), - self.new_processing_size, - self.new_output_queue.qsize()) - q_str += "running queue size: %s, processing size: %s, output queue size: %s" % (self.running_task_queue.qsize(), - self.running_processing_size, - self.running_output_queue.qsize()) - self.logger.debug(q_str) + if self.number_workers > 0: + q_str = "number of transforms: %s, max number of transforms: %s" % (self.number_workers, self.max_number_workers) + self.logger.debug(q_str) def get_new_transforms(self): """ Get new transforms to process """ try: - if self.new_task_queue.qsize() > 0 or self.new_output_queue.qsize() > 0: + if not self.is_ok_to_run_more_transforms(): return [] self.show_queue_size() transform_status = [TransformStatus.New, TransformStatus.Ready, TransformStatus.Extend] - transforms_new = core_transforms.get_transforms_by_status(status=transform_status, locking=True, bulk_size=self.retrieve_bulk_size) + # next_poll_at = datetime.datetime.utcnow() + 
datetime.timedelta(seconds=self.poll_period) + transforms_new = core_transforms.get_transforms_by_status(status=transform_status, locking=True, + not_lock=True, + new_poll=True, only_return_id=True, + bulk_size=self.retrieve_bulk_size) - self.logger.debug("Main thread get %s New+Ready+Extend transforms to process" % len(transforms_new)) + # self.logger.debug("Main thread get %s New+Ready+Extend transforms to process" % len(transforms_new)) if transforms_new: - self.logger.info("Main thread get %s New+Ready+Extend transforms to process" % len(transforms_new)) + self.logger.info("Main thread get New+Ready+Extend transforms to process: %s" % str(transforms_new)) + + for tf_id in transforms_new: + event = NewTransformEvent(publisher_id=self.id, transform_id=tf_id) + self.event_bus.send(event) + return transforms_new except exceptions.DatabaseException as ex: if 'ORA-00060' in str(ex): @@ -94,216 +130,92 @@ def get_new_transforms(self): self.logger.error(traceback.format_exc()) return [] - def get_new_contents(self, transform, new_input_output_maps): - new_input_contents, new_output_contents, new_log_contents = [], [], [] - new_input_dependency_contents = [] - for map_id in new_input_output_maps: - inputs = new_input_output_maps[map_id]['inputs'] if 'inputs' in new_input_output_maps[map_id] else [] - inputs_dependency = new_input_output_maps[map_id]['inputs_dependency'] if 'inputs_dependency' in new_input_output_maps[map_id] else [] - outputs = new_input_output_maps[map_id]['outputs'] if 'outputs' in new_input_output_maps[map_id] else [] - logs = new_input_output_maps[map_id]['logs'] if 'logs' in new_input_output_maps[map_id] else [] - - for input_content in inputs: - content = {'transform_id': transform['transform_id'], - 'coll_id': input_content['coll_id'], - 'request_id': transform['request_id'], - 'workload_id': transform['workload_id'], - 'map_id': map_id, - 'scope': input_content['scope'], - 'name': input_content['name'], - 'min_id': input_content['min_id'] if 'min_id' in input_content else 0, - 'max_id': input_content['max_id'] if 'max_id' in input_content else 0, - 'status': input_content['status'] if 'status' in input_content and input_content['status'] is not None else ContentStatus.New, - 'substatus': input_content['substatus'] if 'substatus' in input_content and input_content['substatus'] is not None else ContentStatus.New, - 'path': input_content['path'] if 'path' in input_content else None, - 'content_type': input_content['content_type'] if 'content_type' in input_content else ContentType.File, - 'content_relation_type': ContentRelationType.Input, - 'bytes': input_content['bytes'], - 'adler32': input_content['adler32'], - 'content_metadata': input_content['content_metadata']} - if content['min_id'] is None: - content['min_id'] = 0 - if content['max_id'] is None: - content['max_id'] = 0 - new_input_contents.append(content) - for input_content in inputs_dependency: - content = {'transform_id': transform['transform_id'], - 'coll_id': input_content['coll_id'], - 'request_id': transform['request_id'], - 'workload_id': transform['workload_id'], - 'map_id': map_id, - 'scope': input_content['scope'], - 'name': input_content['name'], - 'min_id': input_content['min_id'] if 'min_id' in input_content else 0, - 'max_id': input_content['max_id'] if 'max_id' in input_content else 0, - 'status': input_content['status'] if 'status' in input_content and input_content['status'] is not None else ContentStatus.New, - 'substatus': input_content['substatus'] if 'substatus' in input_content and 
input_content['substatus'] is not None else ContentStatus.New, - 'path': input_content['path'] if 'path' in input_content else None, - 'content_type': input_content['content_type'] if 'content_type' in input_content else ContentType.File, - 'content_relation_type': ContentRelationType.InputDependency, - 'bytes': input_content['bytes'], - 'adler32': input_content['adler32'], - 'content_metadata': input_content['content_metadata']} - if content['min_id'] is None: - content['min_id'] = 0 - if content['max_id'] is None: - content['max_id'] = 0 - new_input_dependency_contents.append(content) - for output_content in outputs: - content = {'transform_id': transform['transform_id'], - 'coll_id': output_content['coll_id'], - 'request_id': transform['request_id'], - 'workload_id': transform['workload_id'], - 'map_id': map_id, - 'scope': output_content['scope'], - 'name': output_content['name'], - 'min_id': output_content['min_id'] if 'min_id' in output_content else 0, - 'max_id': output_content['max_id'] if 'max_id' in output_content else 0, - 'status': ContentStatus.New, - 'substatus': ContentStatus.New, - 'path': output_content['path'] if 'path' in output_content else None, - 'content_type': output_content['content_type'] if 'content_type' in output_content else ContentType.File, - 'content_relation_type': ContentRelationType.Output, - 'bytes': output_content['bytes'], - 'adler32': output_content['adler32'], - 'content_metadata': output_content['content_metadata']} - if content['min_id'] is None: - content['min_id'] = 0 - if content['max_id'] is None: - content['max_id'] = 0 - new_output_contents.append(content) - for log_content in logs: - content = {'transform_id': transform['transform_id'], - 'coll_id': log_content['coll_id'], - 'request_id': transform['request_id'], - 'workload_id': transform['workload_id'], - 'map_id': map_id, - 'scope': log_content['scope'], - 'name': log_content['name'], - 'min_id': log_content['min_id'] if 'min_id' in log_content else 0, - 'max_id': log_content['max_id'] if 'max_id' in log_content else 0, - 'status': ContentStatus.New, - 'substatus': ContentStatus.New, - 'path': log_content['path'] if 'path' in log_content else None, - 'content_type': log_content['content_type'] if 'content_type' in log_content else ContentType.File, - 'content_relation_type': ContentRelationType.Log, - 'bytes': log_content['bytes'], - 'adler32': log_content['adler32'], - 'content_metadata': log_content['content_metadata']} - if content['min_id'] is None: - content['min_id'] = 0 - if content['max_id'] is None: - content['max_id'] = 0 - new_output_contents.append(content) - return new_input_contents, new_output_contents, new_log_contents, new_input_dependency_contents - - def is_all_inputs_dependency_available(self, inputs_dependency): - for content in inputs_dependency: - if content['status'] not in [ContentStatus.Available, ContentStatus.FakeAvailable]: - return False - return True + def get_running_transforms(self): + """ + Get running transforms + """ + try: + if not self.is_ok_to_run_more_transforms(): + return [] - def is_all_inputs_dependency_terminated(self, inputs_dependency): - for content in inputs_dependency: - if content['status'] not in [ContentStatus.Available, ContentStatus.FakeAvailable, - ContentStatus.FinalFailed, ContentStatus.Missing]: - return False - return True + self.show_queue_size() - def is_input_dependency_terminated(self, input_dependency): - if input_dependency['status'] in [ContentStatus.Available, ContentStatus.FakeAvailable, - ContentStatus.FinalFailed, 
ContentStatus.Missing]: - return True - return False - - def get_updated_contents(self, transform, registered_input_output_maps): - updated_contents = [] - updated_input_contents_full, updated_output_contents_full = [], [] - - for map_id in registered_input_output_maps: - inputs = registered_input_output_maps[map_id]['inputs'] if 'inputs' in registered_input_output_maps[map_id] else [] - outputs = registered_input_output_maps[map_id]['outputs'] if 'outputs' in registered_input_output_maps[map_id] else [] - inputs_dependency = registered_input_output_maps[map_id]['inputs_dependency'] if 'inputs_dependency' in registered_input_output_maps[map_id] else [] - - if self.is_all_inputs_dependency_available(inputs_dependency): - # self.logger.debug("all input dependency available: %s, inputs: %s" % (str(inputs_dependency), str(inputs))) - for content in inputs: - content['substatus'] = ContentStatus.Available - if content['status'] != content['substatus']: - updated_content = {'content_id': content['content_id'], - 'status': content['substatus'], - 'substatus': content['substatus']} - content['status'] = content['substatus'] - updated_contents.append(updated_content) - updated_input_contents_full.append(content) - elif self.is_all_inputs_dependency_terminated(inputs_dependency): - # self.logger.debug("all input dependency terminated: %s, inputs: %s, outputs: %s" % (str(inputs_dependency), str(inputs), str(outputs))) - for content in inputs: - content['substatus'] = ContentStatus.Missing - if content['status'] != content['substatus']: - updated_content = {'content_id': content['content_id'], - 'status': content['substatus'], - 'substatus': content['substatus']} - content['status'] = content['substatus'] - updated_contents.append(updated_content) - updated_input_contents_full.append(content) - for content in outputs: - content['substatus'] = ContentStatus.Missing - if content['status'] != content['substatus']: - content['status'] = content['substatus'] - updated_content = {'content_id': content['content_id'], - 'status': content['substatus'], - 'substatus': content['substatus']} - updated_contents.append(updated_content) - updated_output_contents_full.append(content) - - for content in outputs: - if content['status'] != content['substatus']: - updated_content = {'content_id': content['content_id'], - 'status': content['substatus']} - content['status'] = content['substatus'] - updated_contents.append(updated_content) - updated_output_contents_full.append(content) - return updated_contents, updated_input_contents_full, updated_output_contents_full - - def trigger_release_inputs(self, updated_output_contents, work, input_output_maps, final=False): - to_release_inputs = {} - for map_id in input_output_maps: - outputs = input_output_maps[map_id]['outputs'] if 'outputs' in input_output_maps[map_id] else [] - for content in outputs: - if (content['status'] in [ContentStatus.Available, ContentStatus.FakeAvailable, ContentStatus.FinalFailed, ContentStatus.Missing] - or content['substatus'] in [ContentStatus.Available, ContentStatus.FakeAvailable, ContentStatus.FinalFailed, ContentStatus.Missing]): # noqa W503 - if content['coll_id'] not in to_release_inputs: - to_release_inputs[content['coll_id']] = [] - to_release_inputs[content['coll_id']].append(content) - - # updated_contents = core_transforms.release_inputs(to_release_inputs) - updated_contents = core_transforms.release_inputs_by_collection(to_release_inputs, final=final) - # self.logger.debug("trigger_release_inputs, to_release_inputs: %s" % 
str(to_release_inputs)) - self.logger.debug("trigger_release_inputs, updated_contents[:10]: %s" % str(updated_contents[:10])) - return updated_contents - - def poll_inputs_dependency(self, transform, registered_input_output_maps): - unfinished_inputs = {} - for map_id in registered_input_output_maps: - inputs_dependency = registered_input_output_maps[map_id]['inputs_dependency'] if 'inputs_dependency' in registered_input_output_maps[map_id] else [] - for content in inputs_dependency: - if (content['status'] not in [ContentStatus.Available, ContentStatus.FakeAvailable, ContentStatus.FinalFailed, ContentStatus.Missing] - and content['substatus'] not in [ContentStatus.Available, ContentStatus.FakeAvailable, ContentStatus.FinalFailed, ContentStatus.Missing]): # noqa W503 - if content['coll_id'] not in unfinished_inputs: - unfinished_inputs[content['coll_id']] = [] - unfinished_inputs[content['coll_id']].append(content) - - # updated_contents = core_transforms.release_inputs(to_release_inputs) - updated_contents = core_transforms.poll_inputs_dependency_by_collection(unfinished_inputs) - self.logger.debug("poll_inputs_dependency, updated_contents[:10]: %s" % str(updated_contents[:10])) - return updated_contents - - def process_new_transform_real(self, transform): + transform_status = [TransformStatus.Transforming, + TransformStatus.ToCancel, TransformStatus.Cancelling, + TransformStatus.ToSuspend, TransformStatus.Suspending, + TransformStatus.ToExpire, TransformStatus.Expiring, + TransformStatus.ToResume, TransformStatus.Resuming, + TransformStatus.ToFinish, TransformStatus.ToForceFinish] + transforms = core_transforms.get_transforms_by_status(status=transform_status, + period=None, + locking=True, + not_lock=True, + update_poll=True, only_return_id=True, + bulk_size=self.retrieve_bulk_size) + + # self.logger.debug("Main thread get %s transforming transforms to process" % len(transforms)) + if transforms: + self.logger.info("Main thread get transforming transforms to process: %s" % str(transforms)) + + for tf_id in transforms: + event = UpdateTransformEvent(publisher_id=self.id, transform_id=tf_id) + self.event_bus.send(event) + + return transforms + except exceptions.DatabaseException as ex: + if 'ORA-00060' in str(ex): + self.logger.warn("(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") + else: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + return [] + + def get_transform(self, transform_id, status=None, locking=False): + try: + return core_transforms.get_transform_by_id_status(transform_id=transform_id, status=status, locking=locking) + except exceptions.DatabaseException as ex: + if 'ORA-00060' in str(ex): + self.logger.warn("(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") + else: + # raise ex + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + return None + + def load_poll_period(self, transform, parameters): + if self.new_poll_period and transform['new_poll_period'] != self.new_poll_period: + parameters['new_poll_period'] = self.new_poll_period + if self.update_poll_period and transform['update_poll_period'] != self.update_poll_period: + parameters['update_poll_period'] = self.update_poll_period + return parameters + + def generate_processing_model(self, transform): + new_processing_model = {} + new_processing_model['transform_id'] = transform['transform_id'] + new_processing_model['request_id'] = transform['request_id'] + new_processing_model['workload_id'] = 
transform['workload_id'] + new_processing_model['status'] = ProcessingStatus.New + # new_processing_model['expired_at'] = work.get_expired_at(None) + new_processing_model['expired_at'] = transform['expired_at'] + + new_processing_model['new_poll_period'] = transform['new_poll_period'] + new_processing_model['update_poll_period'] = transform['update_poll_period'] + new_processing_model['max_new_retries'] = transform['max_new_retries'] + new_processing_model['max_update_retries'] = transform['max_update_retries'] + return new_processing_model + + def get_log_prefix(self, transform): + if transform: + return "<request_id=%s, transform_id=%s>" % (transform['request_id'], transform['transform_id']) + self.logger.error("get_log_prefix transform is empty: %s" % str(transform)) + return "" + + def handle_new_transform_real(self, transform): """ Process new transform """ - self.logger.info("process_new_transform: transform_id: %s" % transform['transform_id']) + log_pre = self.get_log_prefix(transform) + self.logger.info(log_pre + "handle_new_transform: transform_id: %s" % transform['transform_id']) work = transform['transform_metadata']['work'] work.set_work_id(transform['transform_id']) @@ -312,925 +224,515 @@ def process_new_transform_real(self, transform): work_name_to_coll_map = core_transforms.get_work_name_to_coll_map(request_id=transform['request_id']) work.set_work_name_to_coll_map(work_name_to_coll_map) - # check contents - new_input_output_maps = work.get_new_input_output_maps(mapped_input_output_maps={}) - - new_input_contents, new_output_contents, new_log_contents, new_input_dependency_contents = self.get_new_contents(transform, new_input_output_maps) - new_contents = [] - if new_input_contents: - new_contents = new_contents + new_input_contents - if new_output_contents: - new_contents = new_contents + new_output_contents - if new_log_contents: - new_contents = new_contents + new_log_contents - if new_input_dependency_contents: - new_contents = new_contents + new_input_dependency_contents - # create processing new_processing_model = None - processing = work.get_processing(new_input_output_maps, without_creating=False) - self.logger.debug("work get_processing with creating: %s" % processing) + processing = work.get_processing(input_output_maps=[], without_creating=False) + self.logger.debug(log_pre + "work get_processing with creating: %s" % processing) if processing and not processing.processing_id: - new_processing_model = {} - new_processing_model['transform_id'] = transform['transform_id'] - new_processing_model['request_id'] = transform['request_id'] - new_processing_model['workload_id'] = transform['workload_id'] - new_processing_model['status'] = ProcessingStatus.New - # new_processing_model['expired_at'] = work.get_expired_at(None) - new_processing_model['expired_at'] = transform['expired_at'] - - # if 'processing_metadata' not in processing: - # processing['processing_metadata'] = {} - # if 'processing_metadata' not in new_processing_model: - # new_processing_model['processing_metadata'] = {} - # new_processing_model['processing_metadata'] = processing.processing_metadata + new_processing_model = self.generate_processing_model(transform) proc_work = copy.deepcopy(work) proc_work.clean_work() processing.work = proc_work new_processing_model['processing_metadata'] = {'processing': processing} - msgs = [] - self.logger.info("generate_message: %s" % transform['transform_id']) - if new_input_contents: - msg = self.generate_message(transform, files=new_input_contents, msg_type='file', relation_type='input') - 
msgs.append(msg) - if new_output_contents: - msg = self.generate_message(transform, files=new_output_contents, msg_type='file', relation_type='output') - msgs.append(msg) - transform_parameters = {'status': TransformStatus.Transforming, 'locking': TransformLocking.Idle, 'workload_id': transform['workload_id'], - 'next_poll_at': datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_time_period), - # 'next_poll_at': datetime.datetime.utcnow(), 'transform_metadata': transform['transform_metadata']} - if new_contents: - work.has_new_updates() + transform_parameters = self.load_poll_period(transform, transform_parameters) + + if new_processing_model is not None: + if 'new_poll_period' in transform_parameters: + new_processing_model['new_poll_period'] = transform_parameters['new_poll_period'] + if 'update_poll_period' in transform_parameters: + new_processing_model['update_poll_period'] = transform_parameters['update_poll_period'] + if 'max_new_retries' in transform_parameters: + new_processing_model['max_new_retries'] = transform_parameters['max_new_retries'] + if 'max_update_retries' in transform_parameters: + new_processing_model['max_update_retries'] = transform_parameters['max_update_retries'] ret = {'transform': transform, 'transform_parameters': transform_parameters, - 'new_contents': new_contents, - # 'update_contents': updated_contents + to_release_input_contents, - 'messages': msgs, 'new_processing': new_processing_model } return ret - def process_new_transform(self, transform): + def handle_new_transform(self, transform): """ Process new transform """ try: - ret = self.process_new_transform_real(transform) + log_pre = self.get_log_prefix(transform) + ret = self.handle_new_transform_real(transform) + self.logger.info(log_pre + "handle_new_transform result: %s" % str(ret)) except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) - if transform['retries'] > self.retries: - tf_status = TransformStatus.Failed + retries = transform['new_retries'] + 1 + if not transform['max_new_retries'] or retries < transform['max_new_retries']: + tf_status = transform['status'] else: - tf_status = TransformStatus.Transforming + tf_status = TransformStatus.Failed + + # increase poll period + new_poll_period = int(transform['new_poll_period'].total_seconds() * self.poll_period_increase_rate) + if new_poll_period > self.max_new_poll_period: + new_poll_period = self.max_new_poll_period - wait_times = max(4, transform['retries']) + error = {'submit_err': {'msg': truncate_string('%s' % (ex), length=200)}} transform_parameters = {'status': tf_status, - 'next_poll_at': datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_time_period * wait_times), - 'retries': transform['retries'] + 1, + 'new_retries': retries, + 'new_poll_period': new_poll_period, + 'errors': transform['errors'] if transform['errors'] else {}, 'locking': TransformLocking.Idle} + transform_parameters['errors'].update(error) ret = {'transform': transform, 'transform_parameters': transform_parameters} + self.logger.info(log_pre + "handle_new_transform exception result: %s" % str(ret)) return ret - def process_new_transforms(self): - ret = [] - while not self.new_task_queue.empty(): - try: - transform = self.new_task_queue.get() - if transform: - self.new_processing_size += 1 - self.logger.info("Main thread processing new transform: %s" % transform) - ret_transform = self.process_new_transform(transform) - self.new_processing_size -= 1 - if ret_transform: - 
self.new_output_queue.put(ret_transform) - # ret.append(ret_transform) - except Exception as ex: - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - return ret - - def finish_new_transforms(self): - while not self.new_output_queue.empty(): - try: - ret = self.new_output_queue.get() - self.logger.info("Main thread finishing processing transform: %s" % ret['transform']) - if ret: - retry = True - retry_num = 0 - while retry: - retry = False - retry_num += 1 - try: - # self.logger.debug("wen: %s" % str(ret['output_contents'])) - core_transforms.add_transform_outputs(transform=ret['transform'], - transform_parameters=ret['transform_parameters'], - input_collections=ret.get('input_collections', None), - output_collections=ret.get('output_collections', None), - log_collections=ret.get('log_collections', None), - new_contents=ret.get('new_contents', None), - update_input_collections=ret.get('update_input_collections', None), - update_output_collections=ret.get('update_output_collections', None), - update_log_collections=ret.get('update_log_collections', None), - update_contents=ret.get('update_contents', None), - messages=ret.get('messages', None), - new_processing=ret.get('new_processing', None), - message_bulk_size=self.message_bulk_size) - except exceptions.DatabaseException as ex: - if 'ORA-00060' in str(ex): - self.logger.warn("(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") - if retry_num < 5: - retry = True - time.sleep(60 * retry_num * 2) + def update_transform(self, ret): + new_pr_ids, update_pr_ids = [], [] + try: + if ret: + log_pre = self.get_log_prefix(ret['transform']) + self.logger.info(log_pre + "Update transform: %s" % str(ret)) + + ret['transform_parameters']['locking'] = TransformLocking.Idle + ret['transform_parameters']['updated_at'] = datetime.datetime.utcnow() + + retry = True + retry_num = 0 + while retry: + retry = False + retry_num += 1 + try: + # self.logger.debug("wen: %s" % str(ret['output_contents'])) + new_pr_ids, update_pr_ids = core_transforms.add_transform_outputs(transform=ret['transform'], + transform_parameters=ret['transform_parameters'], + input_collections=ret.get('input_collections', None), + output_collections=ret.get('output_collections', None), + log_collections=ret.get('log_collections', None), + new_contents=ret.get('new_contents', None), + update_input_collections=ret.get('update_input_collections', None), + update_output_collections=ret.get('update_output_collections', None), + update_log_collections=ret.get('update_log_collections', None), + update_contents=ret.get('update_contents', None), + messages=ret.get('messages', None), + update_messages=ret.get('update_messages', None), + new_processing=ret.get('new_processing', None), + update_processing=ret.get('update_processing', None), + message_bulk_size=self.message_bulk_size) + except exceptions.DatabaseException as ex: + if 'ORA-00060' in str(ex): + self.logger.warn("(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") + if retry_num < 5: + retry = True + if retry_num <= 1: + random_sleep = random.randint(1, 10) + elif retry_num <= 2: + random_sleep = random.randint(1, 60) else: - raise ex + random_sleep = random.randint(1, 120) + time.sleep(random_sleep) else: raise ex - # self.logger.error(ex) - # self.logger.error(traceback.format_exc()) + else: + raise ex + # self.logger.error(ex) + # self.logger.error(traceback.format_exc()) + except Exception as ex: + self.logger.error(ex) + 
self.logger.error(traceback.format_exc()) + try: + transform_parameters = {'status': TransformStatus.Transforming, + 'locking': TransformLocking.Idle} + if 'new_retries' in ret['transform_parameters']: + transform_parameters['new_retries'] = ret['transform_parameters']['new_retries'] + if 'update_retries' in ret['transform_parameters']: + transform_parameters['update_retries'] = ret['transform_parameters']['update_retries'] + if 'errors' in ret['transform_parameters']: + transform_parameters['errors'] = ret['transform_parameters']['errors'] + + log_pre = self.get_log_prefix(ret['transform']) + self.logger.warn(log_pre + "update transform exception result: %s" % str(transform_parameters)) + + new_pr_ids, update_pr_ids = core_transforms.add_transform_outputs(transform=ret['transform'], + transform_parameters=transform_parameters) except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) - try: - transform_parameters = {'status': TransformStatus.Transforming, - 'next_poll_at': datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_time_period), - 'retries': ret['transform']['retries'] + 1, - 'locking': TransformLocking.Idle} - core_transforms.add_transform_outputs(transform=ret['transform'], - transform_parameters=transform_parameters) - except Exception as ex: - self.logger.error(ex) - self.logger.error(traceback.format_exc()) + return new_pr_ids, update_pr_ids - def get_running_transforms(self): - """ - Get running transforms - """ + def process_new_transform(self, event): + self.number_workers += 1 try: - if self.running_task_queue.qsize() > 0 or self.running_output_queue.qsize() > 0: - return [] - - self.show_queue_size() - - transform_status = [TransformStatus.Transforming, - TransformStatus.ToCancel, TransformStatus.Cancelling, - TransformStatus.ToSuspend, TransformStatus.Suspending, - TransformStatus.ToExpire, TransformStatus.Expiring, - TransformStatus.ToResume, TransformStatus.Resuming, - TransformStatus.ToFinish, TransformStatus.ToForceFinish] - transforms = core_transforms.get_transforms_by_status(status=transform_status, - period=None, - locking=True, - with_messaging=True, - bulk_size=self.retrieve_bulk_size) - - self.logger.debug("Main thread get %s transforming transforms to process" % len(transforms)) - if transforms: - self.logger.info("Main thread get %s transforming transforms to process" % len(transforms)) - return transforms - except exceptions.DatabaseException as ex: - if 'ORA-00060' in str(ex): - self.logger.warn("(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") - else: - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - return [] - - def get_collection_ids(self, collections): - coll_ids = [] - for coll in collections: - coll_ids.append(coll.coll_id) - return coll_ids - - def get_message_type(self, transform_type, input_type='file'): - if transform_type in [TransformType.StageIn, TransformType.StageIn.value]: - if input_type == 'work': - msg_type_str = MessageTypeStr.StageInWork - msg_type = MessageType.StageInWork - elif input_type == 'collection': - msg_type_str = MessageTypeStr.StageInCollection - msg_type = MessageType.StageInCollection - else: - msg_type_str = MessageTypeStr.StageInFile - msg_type = MessageType.StageInFile - elif transform_type in [TransformType.ActiveLearning, TransformType.ActiveLearning.value]: - if input_type == 'work': - msg_type_str = MessageTypeStr.ActiveLearningWork - msg_type = MessageType.ActiveLearningWork - elif input_type == 'collection': - 
msg_type_str = MessageTypeStr.ActiveLearningCollection - msg_type = MessageType.ActiveLearningCollection - else: - msg_type_str = MessageTypeStr.ActiveLearningFile - msg_type = MessageType.ActiveLearningFile - elif transform_type in [TransformType.HyperParameterOpt, TransformType.HyperParameterOpt.value]: - if input_type == 'work': - msg_type_str = MessageTypeStr.HyperParameterOptWork - msg_type = MessageType.HyperParameterOptWork - elif input_type == 'collection': - msg_type_str = MessageTypeStr.HyperParameterOptCollection - msg_type = MessageType.HyperParameterOptCollection - else: - msg_type_str = MessageTypeStr.HyperParameterOptFile - msg_type = MessageType.HyperParameterOptFile - elif transform_type in [TransformType.Processing, TransformType.Processing.value]: - if input_type == 'work': - msg_type_str = MessageTypeStr.ProcessingWork - msg_type = MessageType.ProcessingWork - elif input_type == 'collection': - msg_type_str = MessageTypeStr.ProcessingCollection - msg_type = MessageType.ProcessingCollection - else: - msg_type_str = MessageTypeStr.ProcessingFile - msg_type = MessageType.ProcessingFile - else: - if input_type == 'work': - msg_type_str = MessageTypeStr.UnknownWork - msg_type = MessageType.UnknownWork - elif input_type == 'collection': - msg_type_str = MessageTypeStr.UnknownCollection - msg_type = MessageType.UnknownCollection - else: - msg_type_str = MessageTypeStr.UnknownFile - msg_type = MessageType.UnknownFile - return msg_type, msg_type_str.value - - def generate_message(self, transform, work=None, collection=None, files=None, msg_type='file', relation_type='input'): - if msg_type == 'work': - if not work: - return None - elif msg_type == 'collection': - if not collection: - return None - if not work: - work = transform['transform_metadata']['work'] - else: - if not files: - return None - - request_id = transform['request_id'] - workload_id = transform['workload_id'] - i_msg_type, i_msg_type_str = None, None - - if msg_type == 'work': - i_msg_type, i_msg_type_str = self.get_message_type(transform['transform_type'], input_type='work') - msg_content = {'msg_type': i_msg_type_str, - 'request_id': request_id, - 'workload_id': workload_id, - 'relation_type': relation_type, - 'status': transform['status'].name, - 'output': work.get_output_data(), - 'error': work.get_terminated_msg()} - num_msg_content = 1 - elif msg_type == 'collection': - # fix for old requests - coll_name = collection.name - if coll_name.endswith(".idds.stagein"): - coll_name = coll_name.replace(".idds.stagein", "") - - i_msg_type, i_msg_type_str = self.get_message_type(transform['transform_type'], input_type='collection') - msg_content = {'msg_type': i_msg_type_str, - 'request_id': request_id, - 'workload_id': workload_id, - 'relation_type': relation_type, - 'collections': [{'scope': collection.scope, - 'name': coll_name, - 'status': collection.status.name}], - 'output': work.get_output_data(), - 'error': work.get_terminated_msg()} - num_msg_content = 1 - else: - i_msg_type, i_msg_type_str = self.get_message_type(transform['transform_type'], input_type='file') - files_message = [] - for file in files: - file_status = file['status'].name - if file['status'] == ContentStatus.FakeAvailable: - file_status = ContentStatus.Available.name - file_message = {'scope': file['scope'], - 'name': file['name'], - 'path': file['path'], - 'status': file_status} - files_message.append(file_message) - msg_content = {'msg_type': i_msg_type_str, - 'request_id': request_id, - 'workload_id': workload_id, - 'relation_type': 
relation_type, - 'files': files_message} - num_msg_content = len(files_message) - - msg = {'msg_type': i_msg_type, - 'status': MessageStatus.New, - 'source': MessageSource.Transformer, - 'destination': MessageDestination.Outside, - 'request_id': request_id, - 'workload_id': workload_id, - 'transform_id': transform['transform_id'], - 'num_contents': num_msg_content, - 'msg_content': msg_content} - return msg - - def syn_collection_status(self, input_collections, output_collections, log_collections, registered_input_output_maps): - all_updates_flushed, output_statistics = True, {} - - input_status, output_status, log_status = {}, {}, {} - for map_id in registered_input_output_maps: - inputs = registered_input_output_maps[map_id]['inputs'] if 'inputs' in registered_input_output_maps[map_id] else [] - outputs = registered_input_output_maps[map_id]['outputs'] if 'outputs' in registered_input_output_maps[map_id] else [] - logs = registered_input_output_maps[map_id]['logs'] if 'logs' in registered_input_output_maps[map_id] else [] - - for content in inputs: - if content['coll_id'] not in input_status: - input_status[content['coll_id']] = {'total_files': 0, 'processed_files': 0, 'processing_files': 0, 'bytes': 0} - input_status[content['coll_id']]['total_files'] += 1 - - if content['status'] in [ContentStatus.Available, ContentStatus.Mapped, - ContentStatus.Available.value, ContentStatus.Mapped.value, - ContentStatus.FakeAvailable, ContentStatus.FakeAvailable.value]: - input_status[content['coll_id']]['processed_files'] += 1 - input_status[content['coll_id']]['bytes'] += content['bytes'] - else: - input_status[content['coll_id']]['processing_files'] += 1 - - for content in outputs: - if content['coll_id'] not in output_status: - output_status[content['coll_id']] = {'total_files': 0, 'processed_files': 0, 'processing_files': 0, 'bytes': 0} - output_status[content['coll_id']]['total_files'] += 1 - if content['status'] in [ContentStatus.Available, ContentStatus.Available.value, - ContentStatus.FakeAvailable, ContentStatus.FakeAvailable.value]: - output_status[content['coll_id']]['processed_files'] += 1 - output_status[content['coll_id']]['bytes'] += content['bytes'] + if event: + tf_status = [TransformStatus.New, TransformStatus.Ready, TransformStatus.Extend] + tf = self.get_transform(transform_id=event._transform_id, status=tf_status, locking=True) + if not tf: + self.logger.error("Cannot find transform for event: %s" % str(event)) else: - output_status[content['coll_id']]['processing_files'] += 1 - - if content['status'].name not in output_statistics: - output_statistics[content['status'].name] = 0 - output_statistics[content['status'].name] += 1 - - if content['status'] != content['substatus']: - all_updates_flushed = False - - for content in logs: - if content['coll_id'] not in log_status: - log_status[content['coll_id']] = {'total_files': 0, 'processed_files': 0, 'processing_files': 0, 'bytes': 0} - log_status[content['coll_id']]['total_files'] += 1 - if content['status'] in [ContentStatus.Available, ContentStatus.Available.value, - ContentStatus.FakeAvailable, ContentStatus.FakeAvailable.value]: - log_status[content['coll_id']]['processed_files'] += 1 - log_status[content['coll_id']]['bytes'] += content['bytes'] - else: - log_status[content['coll_id']]['processing_files'] += 1 - - for coll in input_collections: - if coll.coll_id in input_status: - coll.collection['total_files'] = input_status[coll.coll_id]['total_files'] - coll.collection['processed_files'] = 
input_status[coll.coll_id]['processed_files'] - coll.collection['processing_files'] = input_status[coll.coll_id]['processing_files'] - - for coll in output_collections: - if coll.coll_id in output_status: - coll.collection['total_files'] = output_status[coll.coll_id]['total_files'] - coll.collection['processed_files'] = output_status[coll.coll_id]['processed_files'] - coll.collection['processing_files'] = output_status[coll.coll_id]['processing_files'] - coll.collection['bytes'] = output_status[coll.coll_id]['bytes'] - - for coll in log_collections: - if coll.coll_id in log_status: - coll.collection['total_files'] = log_status[coll.coll_id]['total_files'] - coll.collection['processed_files'] = log_status[coll.coll_id]['processed_files'] - coll.collection['processing_files'] = log_status[coll.coll_id]['processing_files'] - coll.collection['bytes'] = log_status[coll.coll_id]['bytes'] - - return all_updates_flushed, output_statistics - - def get_message_for_update_processing(self, processing, processing_status): - msg_content = {'command': 'update_processing', - 'parameters': {'status': processing_status}} - msg = {'msg_type': MessageType.IDDSCommunication, - 'status': MessageStatus.New, - 'destination': MessageDestination.Carrier, - 'source': MessageSource.Transformer, - 'request_id': processing['request_id'], - 'workload_id': processing['workload_id'], - 'transform_id': processing['transform_id'], - 'processing_id': processing['processing_id'], - 'num_contents': 1, - 'msg_content': msg_content} - return msg - - def reactive_contents(self, input_output_maps): - updated_contents = [] - for map_id in input_output_maps: - inputs = input_output_maps[map_id]['inputs'] if 'inputs' in input_output_maps[map_id] else [] - outputs = input_output_maps[map_id]['outputs'] if 'outputs' in input_output_maps[map_id] else [] - inputs_dependency = input_output_maps[map_id]['inputs_dependency'] if 'inputs_dependency' in input_output_maps[map_id] else [] - - all_outputs_available = True - for content in outputs: - if not content['status'] in [ContentStatus.Available]: - all_outputs_available = False - break - - if not all_outputs_available: - for content in inputs + outputs: - update_content = {'content_id': content['content_id'], - 'status': ContentStatus.New, - 'substatus': ContentStatus.New} - updated_contents.append(update_content) - for content in inputs_dependency: - if content['status'] not in [ContentStatus.Available]: - update_content = {'content_id': content['content_id'], - 'status': ContentStatus.New, - 'substatus': ContentStatus.New} - updated_contents.append(update_content) - return updated_contents - - def process_running_transform_real(self, transform): + log_pre = self.get_log_prefix(tf) + self.logger.info(log_pre + "process_new_transform") + ret = self.handle_new_transform(tf) + self.logger.info(log_pre + "process_new_transform result: %s" % str(ret)) + + new_pr_ids, update_pr_ids = self.update_transform(ret) + for pr_id in new_pr_ids: + self.logger.info(log_pre + "NewProcessingEvent(processing_id: %s)" % pr_id) + event = NewProcessingEvent(publisher_id=self.id, processing_id=pr_id, content=event._content) + self.event_bus.send(event) + for pr_id in update_pr_ids: + self.logger.info(log_pre + "UpdateProcessingEvent(processing_id: %s)" % pr_id) + event = UpdateProcessingEvent(publisher_id=self.id, processing_id=pr_id, content=event._content) + self.event_bus.send(event) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + self.number_workers -= 1 + + 
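One practical consequence of the error handling in handle_new_transform() above: each failure multiplies the transform's poll period by poll_period_increase_rate (default 2) and caps it at max_new_poll_period / max_update_poll_period (default 6 hours), and the transform is only marked Failed once the retry count reaches max_new_retries. A small numeric sketch of that escalation, using the defaults visible in this patch and a hypothetical helper name:

# Hypothetical helper, not part of the patch: mirrors the escalation applied in
# handle_new_transform() when an exception is caught.
def next_poll_period(current_seconds, increase_rate=2.0, max_seconds=6 * 3600):
    return min(int(current_seconds * increase_rate), max_seconds)


period = 1800                        # Transformer default poll_period in seconds
for attempt in range(1, 7):
    period = next_poll_period(period)
    print(attempt, period)           # 3600, 7200, 14400, 21600, 21600, 21600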
def handle_update_transform_real(self, transform, event): """ process running transforms """ - self.logger.info("process_running_transform: transform_id: %s" % transform['transform_id']) - - msgs, update_msgs = [], [] - - # transform_substatus = None - t_processing_status = None - is_operation = False - if transform['status'] in [TransformStatus.ToCancel, TransformStatus.ToSuspend, - TransformStatus.ToResume, TransformStatus.ToExpire, - TransformStatus.ToFinish, TransformStatus.ToForceFinish]: - is_operation = True - if transform['status'] == TransformStatus.ToCancel: - t_processing_status = ProcessingStatus.ToCancel - # transform_substatus = TransformStatus.Cancelling - if transform['status'] == TransformStatus.ToSuspend: - t_processing_status = ProcessingStatus.ToSuspend - # transform_substatus = TransformStatus.Suspending - if transform['status'] == TransformStatus.ToResume: - t_processing_status = ProcessingStatus.ToResume - # transform_substatus = TransformStatus.Resuming - if transform['status'] == TransformStatus.ToExpire: - t_processing_status = ProcessingStatus.ToExpire - # transform_substatus = TransformStatus.Expiring - if transform['status'] == TransformStatus.ToFinish: - t_processing_status = ProcessingStatus.ToFinish - # transform_substatus = TransformStatus.Transforming - if transform['status'] == TransformStatus.ToForceFinish: - t_processing_status = ProcessingStatus.ToForceFinish - # transform_substatus = TransformStatus.Transforming + log_pre = self.get_log_prefix(transform) + + self.logger.info(log_pre + "handle_update_transform: transform_id: %s" % transform['transform_id']) + + is_terminated = False + to_abort = False + if (event and event._content and 'cmd_type' in event._content and event._content['cmd_type'] + and event._content['cmd_type'] in [CommandType.AbortRequest, CommandType.ExpireRequest]): # noqa W503 + to_abort = True + self.logger.info(log_pre + "to_abort %s" % to_abort) work = transform['transform_metadata']['work'] work.set_work_id(transform['transform_id']) work.set_agent_attributes(self.agent_attributes, transform) - # link collections - input_collections = work.get_input_collections() - output_collections = work.get_output_collections() - log_collections = work.get_log_collections() - - for coll in input_collections + output_collections + log_collections: - coll_model = core_catalog.get_collection(coll_id=coll.coll_id) - coll.collection = coll_model - - input_coll_ids = self.get_collection_ids(input_collections) - output_coll_ids = self.get_collection_ids(output_collections) - log_coll_ids = self.get_collection_ids(log_collections) - - registered_input_output_maps = core_transforms.get_transform_input_output_maps(transform['transform_id'], - input_coll_ids=input_coll_ids, - output_coll_ids=output_coll_ids, - log_coll_ids=log_coll_ids) - work_name_to_coll_map = core_transforms.get_work_name_to_coll_map(request_id=transform['request_id']) work.set_work_name_to_coll_map(work_name_to_coll_map) # link processings - new_processing_model, processing_model, update_processing_model = None, None, {} + new_processing_model, processing_model = None, None processing = work.get_processing(input_output_maps=[], without_creating=True) - self.logger.debug("work get_processing: %s" % processing) + self.logger.debug(log_pre + "work get_processing: %s" % processing) if processing and processing.processing_id: processing_model = core_processings.get_processing(processing_id=processing.processing_id) work.sync_processing(processing, processing_model) - processing_metadata 
= processing_model['processing_metadata'] - if 'errors' in processing_metadata: - work.set_terminated_msg(processing_metadata['errors']) + proc = processing_model['processing_metadata']['processing'] + work.sync_work_data(status=processing_model['status'], substatus=processing_model['substatus'], + work=proc.work, output_data=processing_model['output_metadata']) + # processing_metadata = processing_model['processing_metadata'] + if processing_model['errors']: + work.set_terminated_msg(processing_model['errors']) # work.set_processing_output_metadata(processing, processing_model['output_metadata']) work.set_output_data(processing.output_data) transform['workload_id'] = processing_model['workload_id'] - if t_processing_status is not None: - msg = self.get_message_for_update_processing(processing_model, t_processing_status) - msgs.append(msg) - - # check contents - new_input_output_maps = work.get_new_input_output_maps(registered_input_output_maps) - - new_input_contents, new_output_contents, new_log_contents, new_input_dependency_contents = self.get_new_contents(transform, new_input_output_maps) - new_contents = [] - if new_input_contents: - new_contents = new_contents + new_input_contents - if new_output_contents: - new_contents = new_contents + new_output_contents - if new_log_contents: - new_contents = new_contents + new_log_contents - if new_input_dependency_contents: - new_contents = new_contents + new_input_dependency_contents - - # create processing - if not processing: - processing = work.get_processing(new_input_output_maps, without_creating=False) - self.logger.debug("work get_processing with creating: %s" % processing) - if processing and not processing.processing_id: - new_processing_model = {} - new_processing_model['transform_id'] = transform['transform_id'] - new_processing_model['request_id'] = transform['request_id'] - new_processing_model['workload_id'] = transform['workload_id'] - new_processing_model['status'] = ProcessingStatus.New - # new_processing_model['expired_at'] = work.get_expired_at(None) - new_processing_model['expired_at'] = transform['expired_at'] - - # if 'processing_metadata' not in processing: - # processing['processing_metadata'] = {} - # if 'processing_metadata' not in new_processing_model: - # new_processing_model['processing_metadata'] = {} - # new_processing_model['processing_metadata'] = processing.processing_metadata + else: + if not processing: + processing = work.get_processing(input_output_maps=[], without_creating=False) + self.logger.debug(log_pre + "work get_processing with creating: %s" % processing) + new_processing_model = self.generate_processing_model(transform) proc_work = copy.deepcopy(work) proc_work.clean_work() processing.work = proc_work new_processing_model['processing_metadata'] = {'processing': processing} - if t_processing_status is not None: - new_processing_model['status'] = t_processing_status - # new_processing_model['substatus'] = t_processing_status - - # check updated contents - updated_contents, updated_input_contents_full, updated_output_contents_full = [], [], [] - to_release_input_contents = [] - if work.should_release_inputs(processing, self.poll_operation_time_period): - self.logger.info("get_updated_contents for transform %s" % transform['transform_id']) - updated_contents, updated_input_contents_full, updated_output_contents_full = self.get_updated_contents(transform, registered_input_output_maps) - # if work.use_dependency_to_release_jobs() and (updated_output_contents_full or work.has_to_release_inputs()): - if 
work.use_dependency_to_release_jobs(): - pass - self.logger.info("trigger_release_inputs: %s" % transform['transform_id']) - to_release_input_contents = self.trigger_release_inputs(updated_output_contents_full, work, registered_input_output_maps) - if not to_release_input_contents: - to_release_input_contents = self.poll_inputs_dependency(transform, registered_input_output_maps) - - self.logger.info("generate_message: %s" % transform['transform_id']) - if new_input_contents: - msg = self.generate_message(transform, files=new_input_contents, msg_type='file', relation_type='input') - msgs.append(msg) - if new_output_contents: - msg = self.generate_message(transform, files=new_output_contents, msg_type='file', relation_type='output') - msgs.append(msg) - if updated_input_contents_full: - msg = self.generate_message(transform, files=updated_input_contents_full, msg_type='file', relation_type='input') - msgs.append(msg) - if updated_output_contents_full: - msg = self.generate_message(transform, files=updated_output_contents_full, msg_type='file', relation_type='output') - msgs.append(msg) - - # transform['locking'] = TransformLocking.Idle - # status_statistics = work.get_status_statistics(registered_input_output_maps) - self.logger.info("syn_collection_status: %s" % transform['transform_id']) - all_updates_flushed, output_statistics = self.syn_collection_status(input_collections, output_collections, log_collections, registered_input_output_maps) - - self.logger.info("syn_work_status: %s, transform status: %s" % (transform['transform_id'], transform['status'])) - work.syn_work_status(registered_input_output_maps, all_updates_flushed, output_statistics, to_release_input_contents) - if work.is_terminated(): - self.logger.info("Transform(%s) work is terminated, trigger to release all final status files" % (transform['transform_id'])) - if work.use_dependency_to_release_jobs(): - pass - self.logger.info("trigger_release_inputs: %s" % transform['transform_id']) - to_release_input_contents1 = self.trigger_release_inputs(updated_output_contents_full, work, registered_input_output_maps, final=True) - to_release_input_contents = to_release_input_contents + to_release_input_contents1 - - to_resume_transform = False - reactivated_contents = [] - if transform['status'] in [TransformStatus.ToCancel]: - transform['status'] = TransformStatus.Cancelling - work.tocancel = True - elif transform['status'] in [TransformStatus.ToSuspend]: - transform['status'] = TransformStatus.Suspending - work.tosuspend = True - elif transform['status'] in [TransformStatus.ToResume]: - transform['status'] = TransformStatus.Resuming - transform['retries'] = 0 - work.toresume = True - to_resume_transform = True - reactivated_contents = self.reactive_contents(registered_input_output_maps) - # reactive collections - for coll in input_collections: - coll.status = CollectionStatus.Open - for coll in output_collections: - coll.status = CollectionStatus.Open - for coll in log_collections: - coll.status = CollectionStatus.Open - elif transform['status'] in [TransformStatus.ToExpire]: - transform['status'] = TransformStatus.Expiring - work.toexpire = True - elif transform['status'] in [TransformStatus.ToFinish]: - transform['status'] = TransformStatus.Transforming - work.tofinish = True - elif transform['status'] in [TransformStatus.ToForceFinish]: - transform['status'] = TransformStatus.Transforming - work.toforcefinish = True - elif work.is_finished(): - transform['status'] = TransformStatus.Finished - msg = 
self.generate_message(transform, work=work, msg_type='work') - msgs.append(msg) - for coll in input_collections: - coll.status = CollectionStatus.Closed - msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection', relation_type='input') - msgs.append(msg) - for coll in output_collections: - coll.status = CollectionStatus.Closed - msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection', relation_type='output') - msgs.append(msg) - for coll in log_collections: - coll.status = CollectionStatus.Closed - msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection', relation_type='log') - msgs.append(msg) - elif work.is_subfinished(): - transform['status'] = TransformStatus.SubFinished - msg = self.generate_message(transform, work=work, msg_type='work') - msgs.append(msg) - for coll in input_collections: - coll.status = CollectionStatus.SubClosed - msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection', relation_type='input') - msgs.append(msg) - for coll in output_collections: - coll.status = CollectionStatus.SubClosed - msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection', relation_type='output') - msgs.append(msg) - for coll in log_collections: - coll.status = CollectionStatus.SubClosed - msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection', relation_type='log') - msgs.append(msg) - elif work.is_failed(): - transform['status'] = TransformStatus.Failed - msg = self.generate_message(transform, work=work, msg_type='work') - msgs.append(msg) - for coll in input_collections: - coll.status = CollectionStatus.Failed - msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection', relation_type='input') - msgs.append(msg) - for coll in output_collections: - coll.status = CollectionStatus.Failed - msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection', relation_type='output') - msgs.append(msg) - for coll in log_collections: - coll.status = CollectionStatus.Failed - msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection', relation_type='log') - msgs.append(msg) - elif work.is_expired(): - transform['status'] = TransformStatus.Expired - msg = self.generate_message(transform, work=work, msg_type='work') - msgs.append(msg) - for coll in input_collections: - coll.status = CollectionStatus.SubClosed - msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection', relation_type='input') - msgs.append(msg) - for coll in output_collections: - coll.status = CollectionStatus.SubClosed - msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection', relation_type='output') - msgs.append(msg) - for coll in log_collections: - coll.status = CollectionStatus.SubClosed - msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection', relation_type='log') - msgs.append(msg) - elif work.is_cancelled(): - transform['status'] = TransformStatus.Cancelled - msg = self.generate_message(transform, work=work, msg_type='work') - msgs.append(msg) - for coll in input_collections: - coll.status = CollectionStatus.Cancelled - msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection', relation_type='input') - msgs.append(msg) - for coll in output_collections: - coll.status = CollectionStatus.Cancelled - msg = 
self.generate_message(transform, work=work, collection=coll, msg_type='collection', relation_type='output') - msgs.append(msg) - for coll in log_collections: - coll.status = CollectionStatus.Cancelled - msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection', relation_type='log') - msgs.append(msg) - elif work.is_suspended(): - transform['status'] = TransformStatus.Suspended - msg = self.generate_message(transform, work=work, msg_type='work') - msgs.append(msg) - for coll in input_collections: - coll.status = CollectionStatus.Suspended - msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection', relation_type='input') - msgs.append(msg) - for coll in output_collections: - coll.status = CollectionStatus.Suspended - msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection', relation_type='output') - msgs.append(msg) - for coll in log_collections: - coll.status = CollectionStatus.Suspended - msg = self.generate_message(transform, work=work, collection=coll, msg_type='collection', relation_type='log') - msgs.append(msg) - else: - transform['status'] = TransformStatus.Transforming - if not is_operation: - next_poll_at = datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_time_period) - else: - if to_resume_transform: - next_poll_at = datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_operation_time_period * 5) + self.logger.info(log_pre + "syn_work_status: %s, transform status: %s" % (transform['transform_id'], transform['status'])) + if work.is_terminated(): + is_terminated = True + self.logger.info(log_pre + "Transform(%s) work is terminated: work status: %s" % (transform['transform_id'], work.get_status())) + if work.is_finished(): + transform['status'] = TransformStatus.Finished else: - next_poll_at = datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_operation_time_period) - - # reset retries to 0 when it succeed - transform['retries'] = 0 + if to_abort: + transform['status'] = TransformStatus.Cancelled + elif work.is_subfinished(): + transform['status'] = TransformStatus.SubFinished + elif work.is_failed(): + transform['status'] = TransformStatus.Failed + else: + transform['status'] = TransformStatus.Failed transform_parameters = {'status': transform['status'], 'locking': TransformLocking.Idle, 'workload_id': transform['workload_id'], - 'next_poll_at': next_poll_at, - 'retries': transform['retries'], 'transform_metadata': transform['transform_metadata']} - # if transform_substatus: - # transform_parameters['substatus'] = transform_substatus - - if new_contents or updated_contents or to_release_input_contents: - work.has_new_updates() + transform_parameters = self.load_poll_period(transform, transform_parameters) + + if new_processing_model is not None: + if 'new_poll_period' in transform_parameters: + new_processing_model['new_poll_period'] = transform_parameters['new_poll_period'] + if 'update_poll_period' in transform_parameters: + new_processing_model['update_poll_period'] = transform_parameters['update_poll_period'] + if 'max_new_retries' in transform_parameters: + new_processing_model['max_new_retries'] = transform_parameters['max_new_retries'] + if 'max_update_retries' in transform_parameters: + new_processing_model['max_update_retries'] = transform_parameters['max_update_retries'] - # print(input_collections) ret = {'transform': transform, 'transform_parameters': transform_parameters, - # 'update_input_collections': 
copy.deepcopy(input_collections) if input_collections else input_collections, - # 'update_output_collections': copy.deepcopy(output_collections) if output_collections else output_collections, - # 'update_log_collections': copy.deepcopy(log_collections) if log_collections else log_collections, - 'update_input_collections': input_collections, - 'update_output_collections': output_collections, - 'update_log_collections': log_collections, - 'new_contents': new_contents, - 'update_contents': updated_contents + to_release_input_contents + reactivated_contents, - 'messages': msgs, - 'update_messages': update_msgs, - 'new_processing': new_processing_model, - 'update_processing': update_processing_model} - return ret + 'new_processing': new_processing_model} + return ret, is_terminated - def process_running_transform_message(self, transform, messages): + def handle_update_transform(self, transform, event): """ - process running transform message + Process running transform """ try: - self.logger.info("process_running_transform_message: transform_id: %s, messages: %s" % (transform['transform_id'], str(messages) if messages else messages)) - msg = messages[0] - message = messages[0]['msg_content'] - if message['command'] == 'update_transform': - parameters = message['parameters'] - parameters['locking'] = TransformLocking.Idle - ret = {'transform': transform, - 'transform_parameters': parameters, - 'update_messages': [{'msg_id': msg['msg_id'], 'status': MessageStatus.Delivered}] - } - else: - self.logger.error("Unknown message: %s" % str(msg)) - ret = {'transform': transform, - 'transform_parameters': {'locking': TransformLocking.Idle}, - 'update_messages': [{'msg_id': msg['msg_id'], 'status': MessageStatus.Failed}] - } + log_pre = self.get_log_prefix(transform) + + self.logger.info(log_pre + "handle_update_transform: %s" % transform) + ret, is_terminated = self.handle_update_transform_real(transform, event) + self.logger.info(log_pre + "handle_update_transform result: %s" % str(ret)) + return ret, is_terminated except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) - if transform['retries'] > self.retries: - tf_status = TransformStatus.Failed + + retries = transform['update_retries'] + 1 + if not transform['max_update_retries'] or retries < transform['max_update_retries']: + tf_status = transform['status'] else: - tf_status = TransformStatus.Transforming + tf_status = TransformStatus.Failed + error = {'submit_err': {'msg': truncate_string('%s' % (ex), length=200)}} - wait_times = max(4, transform['retries']) + # increase poll period + update_poll_period = int(transform['update_poll_period'].total_seconds() * self.poll_period_increase_rate) + if update_poll_period > self.max_update_poll_period: + update_poll_period = self.max_update_poll_period - ret = {'transform': transform, - 'transform_parameters': {'status': tf_status, - 'next_poll_at': datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_time_period * wait_times), - 'locking': TransformLocking.Idle, - 'retries': transform['retries'] + 1, - 'errors': {'msg': '%s: %s' % (ex, traceback.format_exc())}}} - return ret + transform_parameters = {'status': tf_status, + 'update_retries': retries, + 'update_poll_period': update_poll_period, + 'errors': transform['errors'] if transform['errors'] else {}, + 'locking': TransformLocking.Idle} + transform_parameters['errors'].update(error) + + ret = {'transform': transform, 'transform_parameters': transform_parameters} + self.logger.warn(log_pre + 
"handle_update_transform exception result: %s" % str(ret)) + return ret, False + + def process_update_transform(self, event): + self.number_workers += 1 + try: + if event: + tf_status = [TransformStatus.Transforming, + TransformStatus.ToCancel, TransformStatus.Cancelling, + TransformStatus.ToSuspend, TransformStatus.Suspending, + TransformStatus.ToExpire, TransformStatus.Expiring, + TransformStatus.ToResume, TransformStatus.Resuming, + TransformStatus.ToFinish, TransformStatus.ToForceFinish] + tf = self.get_transform(transform_id=event._transform_id, status=tf_status, locking=True) + if not tf: + self.logger.error("Cannot find transform for event: %s" % str(event)) + else: + log_pre = self.get_log_prefix(tf) + + ret, is_terminated = self.handle_update_transform(tf, event) + new_pr_ids, update_pr_ids = self.update_transform(ret) + + if is_terminated or (event._content and 'event' in event._content and event._content['event'] == 'submitted'): + self.logger.info(log_pre + "UpdateRequestEvent(request_id: %s)" % tf['request_id']) + event = UpdateRequestEvent(publisher_id=self.id, request_id=tf['request_id'], content=event._content) + self.event_bus.send(event) + for pr_id in new_pr_ids: + self.logger.info(log_pre + "NewProcessingEvent(processing_id: %s)" % pr_id) + event = NewProcessingEvent(publisher_id=self.id, processing_id=pr_id, content=event._content) + self.event_bus.send(event) + for pr_id in update_pr_ids: + self.logger.info(log_pre + "NewProcessingEvent(processing_id: %s)" % pr_id) + event = UpdateProcessingEvent(publisher_id=self.id, processing_id=pr_id, content=event._content) + self.event_bus.send(event) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + self.number_workers -= 1 - def process_running_transform(self, transform): + def handle_abort_transform(self, transform): """ - Process running transform + process abort transform """ try: - msgs = self.get_transform_message(transform_id=transform['transform_id'], bulk_size=1) - if msgs: - self.logger.info("Main thread processing running transform with message: %s" % transform) - ret = self.process_running_transform_message(transform, msgs) + work = transform['transform_metadata']['work'] + work.set_agent_attributes(self.agent_attributes, transform) + + # save old status for retry + oldstatus = transform['status'] + + processing = work.get_processing(input_output_maps=[], without_creating=True) + if processing and processing.processing_id: + tf_status = TransformStatus.Cancelling else: - self.logger.info("Main thread processing running transform: %s" % transform) - ret = self.process_running_transform_real(transform) + tf_status = TransformStatus.Cancelled + + transform_parameters = {'status': tf_status, + 'oldstatus': oldstatus, + 'locking': TransformLocking.Idle, + 'transform_metadata': transform['transform_metadata']} + ret = {'transform': transform, 'transform_parameters': transform_parameters} + return ret except Exception as ex: self.logger.error(ex) self.logger.error(traceback.format_exc()) - if transform['retries'] > self.retries: - tf_status = TransformStatus.Failed - else: - tf_status = TransformStatus.Transforming + error = {'abort_err': {'msg': truncate_string('%s' % (ex), length=200)}} + transform_parameters = {'status': tf_status, + 'locking': TransformLocking.Idle, + 'errors': transform['errors'] if transform['errors'] else {}} + transform_parameters['errors'].update(error) + ret = {'transform': transform, 'transform_parameters': transform_parameters} + return ret + return 
None - wait_times = max(4, transform['retries']) + def process_abort_transform(self, event): + self.number_workers += 1 + try: + if event: + self.logger.info("process_abort_transform: event: %s" % event) + tf = self.get_transform(transform_id=event._transform_id, locking=True) + if not tf: + self.logger.error("Cannot find transform for event: %s" % str(event)) + else: + log_pre = self.get_log_prefix(tf) + self.logger.info(log_pre + "process_abort_transform") + + if tf['status'] in [TransformStatus.Finished, TransformStatus.SubFinished, + TransformStatus.Failed, TransformStatus.Cancelled, + TransformStatus.Suspended, TransformStatus.Expired]: + ret = {'transform': tf, + 'transform_parameters': {'locking': TransformLocking.Idle, + 'errors': {'extra_msg': "Transform is already terminated. Cannot be aborted"}}} + if tf['errors'] and 'msg' in tf['errors']: + ret['transform_parameters']['errors']['msg'] = tf['errors']['msg'] + + self.logger.info(log_pre + "process_abort_transform result: %s" % str(ret)) + + self.update_transform(ret) + else: + ret = self.handle_abort_transform(tf) + self.logger.info(log_pre + "process_abort_transform result: %s" % str(ret)) + if ret: + self.update_transform(ret) + + work = tf['transform_metadata']['work'] + work.set_work_id(tf['transform_id']) + work.set_agent_attributes(self.agent_attributes, tf) + + processing = work.get_processing(input_output_maps=[], without_creating=True) + if processing and processing.processing_id: + self.logger.info(log_pre + "AbortProcessingEvent(processing_id: %s)" % processing.processing_id) + event = AbortProcessingEvent(publisher_id=self.id, processing_id=processing.processing_id, content=event._content) + self.event_bus.send(event) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + self.number_workers -= 1 + + def handle_resume_transform(self, transform): + """ + process resume transform + """ + try: + work = transform['transform_metadata']['work'] + work.set_agent_attributes(self.agent_attributes, transform) + + tf_status = transform['oldstatus'] transform_parameters = {'status': tf_status, - 'next_poll_at': datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_time_period * wait_times), - 'retries': transform['retries'] + 1, 'locking': TransformLocking.Idle} - ret = {'transform': transform, 'transform_parameters': transform_parameters} - return ret - def process_running_transforms(self): - ret = [] - while not self.running_task_queue.empty(): - try: - transform = self.running_task_queue.get() - if transform: - self.running_processing_size += 1 - self.logger.info("Main thread processing running transform: %s" % transform) - ret_transform = self.process_running_transform(transform) - self.logger.debug("Main thread processing running transform finished: %s" % transform) - self.running_processing_size -= 1 - if ret_transform: - self.running_output_queue.put(ret_transform) - # ret.append(ret_transform) - except Exception as ex: - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - return ret + ret = {'transform': transform, + 'transform_parameters': transform_parameters} + return ret + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + error = {'resume_err': {'msg': truncate_string('%s' % (ex), length=200)}} + transform_parameters = {'status': tf_status, + 'locking': TransformLocking.Idle, + 'errors': transform['errors'] if transform['errors'] else {}} + transform_parameters['errors'].update(error) + ret = {'transform': 
transform, 'transform_parameters': transform_parameters} + return ret + return None - def finish_running_transforms(self): - while not self.running_output_queue.empty(): - try: - ret = self.running_output_queue.get() - self.logger.debug("Main thread finishing running transform: %s" % ret['transform']) - self.logger.info("Main thread finishing running transform(%s): %s" % (ret['transform']['transform_id'], - ret['transform_parameters'])) - if ret: - retry = True - retry_num = 0 - while retry: - retry = False - retry_num += 1 - try: - # self.logger.debug("wen: %s" % str(ret['output_contents'])) - core_transforms.add_transform_outputs(transform=ret['transform'], - transform_parameters=ret['transform_parameters'], - input_collections=ret.get('input_collections', None), - output_collections=ret.get('output_collections', None), - log_collections=ret.get('log_collections', None), - new_contents=ret.get('new_contents', None), - update_input_collections=ret.get('update_input_collections', None), - update_output_collections=ret.get('update_output_collections', None), - update_log_collections=ret.get('update_log_collections', None), - update_contents=ret.get('update_contents', None), - messages=ret.get('messages', None), - update_messages=ret.get('update_messages', None), - new_processing=ret.get('new_processing', None), - update_processing=ret.get('update_processing', None), - message_bulk_size=self.message_bulk_size) - - except exceptions.DatabaseException as ex: - if 'ORA-00060' in str(ex): - self.logger.warn("(cx_Oracle.DatabaseError) ORA-00060: deadlock detected while waiting for resource") - if retry_num < 5: - retry = True - time.sleep(60 * retry_num * 2) - else: - raise ex - else: - # self.logger.error(ex) - # self.logger.error(traceback.format_exc()) - raise ex - except Exception as ex: - self.logger.error(ex) - self.logger.error(traceback.format_exc()) - try: - transform_parameters = {'status': TransformStatus.Transforming, - 'next_poll_at': datetime.datetime.utcnow() + datetime.timedelta(seconds=self.poll_time_period), - 'retries': ret['transform']['retries'] + 1, - 'locking': TransformLocking.Idle} - core_transforms.add_transform_outputs(transform=ret['transform'], - transform_parameters=transform_parameters) - except Exception as ex: - self.logger.error(ex) - self.logger.error(traceback.format_exc()) + def process_resume_transform(self, event): + self.number_workers += 1 + try: + if event: + self.logger.info("process_resume_transform: event: %s" % event) + tf = self.get_transform(transform_id=event._transform_id, locking=True) + if not tf: + self.logger.error("Cannot find transform for event: %s" % str(event)) + else: + log_pre = self.get_log_prefix(tf) + + if tf['status'] in [TransformStatus.Finished]: + ret = {'transform': tf, + 'transform_parameters': {'locking': TransformLocking.Idle, + 'errors': {'extra_msg': "Transform is already finished. 
Cannot be resumed"}}} + if tf['errors'] and 'msg' in tf['errors']: + ret['transform_parameters']['errors']['msg'] = tf['errors']['msg'] + + self.logger.info(log_pre + "process_resume_transform result: %s" % str(ret)) + self.update_transform(ret) + else: + ret = self.handle_resume_transform(tf) + self.logger.info(log_pre + "process_resume_transform result: %s" % str(ret)) + if ret: + self.update_transform(ret) + + work = tf['transform_metadata']['work'] + work.set_agent_attributes(self.agent_attributes, tf) + + processing = work.get_processing(input_output_maps=[], without_creating=True) + if processing and processing.processing_id: + self.logger.info(log_pre + "ResumeProcessingEvent(processing_id: %s)" % processing.processing_id) + event = ResumeProcessingEvent(publisher_id=self.id, + processing_id=processing.processing_id, + content=event._content) + self.event_bus.send(event) + else: + self.logger.info(log_pre + "UpdateTransformEvent(transform_id: %s)" % tf['transform_id']) + event = UpdateTransformEvent(publisher_id=self.id, + transform_id=tf['transform_id'], + content=event._content) + self.event_bus.send(event) + except Exception as ex: + self.logger.error(ex) + self.logger.error(traceback.format_exc()) + self.number_workers -= 1 def clean_locks(self): self.logger.info("clean locking") core_transforms.clean_locking() + def init_event_function_map(self): + self.event_func_map = { + EventType.NewTransform: { + 'pre_check': self.is_ok_to_run_more_transforms, + 'exec_func': self.process_new_transform + }, + EventType.UpdateTransform: { + 'pre_check': self.is_ok_to_run_more_transforms, + 'exec_func': self.process_update_transform + }, + EventType.AbortTransform: { + 'pre_check': self.is_ok_to_run_more_transforms, + 'exec_func': self.process_abort_transform + }, + EventType.ResumeTransform: { + 'pre_check': self.is_ok_to_run_more_transforms, + 'exec_func': self.process_resume_transform + } + } + def run(self): """ Main run function. 
@@ -1242,24 +744,12 @@ def run(self): self.add_default_tasks() - task = self.create_task(task_func=self.get_new_transforms, task_output_queue=self.new_task_queue, task_args=tuple(), task_kwargs={}, delay_time=60, priority=1) - self.add_task(task) - for _ in range(self.num_threads): - # task = self.create_task(task_func=self.process_new_transforms, task_output_queue=self.new_output_queue, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) - task = self.create_task(task_func=self.process_new_transforms, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) - self.add_task(task) - task = self.create_task(task_func=self.finish_new_transforms, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=2, priority=1) - self.add_task(task) + self.init_event_function_map() - task = self.create_task(task_func=self.get_running_transforms, task_output_queue=self.running_task_queue, task_args=tuple(), task_kwargs={}, delay_time=60, priority=1) + task = self.create_task(task_func=self.get_new_transforms, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=60, priority=1) self.add_task(task) - for _ in range(self.num_threads): - # task = self.create_task(task_func=self.process_running_transforms, task_output_queue=self.running_output_queue, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) - task = self.create_task(task_func=self.process_running_transforms, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) - self.add_task(task) - task = self.create_task(task_func=self.finish_running_transforms, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=1, priority=1) + task = self.create_task(task_func=self.get_running_transforms, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=60, priority=1) self.add_task(task) - task = self.create_task(task_func=self.clean_locks, task_output_queue=None, task_args=tuple(), task_kwargs={}, delay_time=1800, priority=1) self.add_task(task) diff --git a/main/lib/idds/core/catalog.py b/main/lib/idds/core/catalog.py index 1d588644..eb2b9ea8 100644 --- a/main/lib/idds/core/catalog.py +++ b/main/lib/idds/core/catalog.py @@ -78,6 +78,18 @@ def get_collections(scope=None, name=None, request_id=None, workload_id=None, tr return collections +@read_session +def get_collections_by_request_ids(request_ids, session=None): + """" + Get collections by a list of request ids. + + :param request_ids: list of request ids. + + :return collections: list of collections. + """ + return orm_collections.get_collections_by_request_ids(request_ids) + + @transactional_session def add_collection(request_id, workload_id, scope, name, coll_type=CollectionType.Dataset, transform_id=None, relation_type=CollectionRelationType.Input, bytes=0, status=CollectionStatus.New, @@ -294,7 +306,7 @@ def get_contents(coll_scope=None, coll_name=None, request_id=None, workload_id=N :param to_json: return json format. :param session: The database session in use. 
- :returns: dict of contents + :returns: list of contents """ collections = get_collections(scope=coll_scope, name=coll_name, request_id=request_id, workload_id=workload_id, transform_id=transform_id, @@ -308,6 +320,43 @@ def get_contents(coll_scope=None, coll_name=None, request_id=None, workload_id=N return rets +@read_session +def get_contents_by_request_transform(request_id=None, workload_id=None, transform_id=None, status=None, status_updated=False, session=None): + """ + Get contents with request id, workload id and transform id. + + :param request_id: the request id. + :param workload_id: The workload_id of the request. + :param transform_id: The transform id related to this collection. + :param session: The database session in use. + + :returns: list of contents + """ + ret = orm_contents.get_contents_by_request_transform(request_id=request_id, transform_id=transform_id, + workload_id=workload_id, status=status, + status_updated=status_updated, session=session) + return ret + + +@read_session +def get_contents_by_content_ids(content_ids, request_id=None, session=None): + """ + Get contents or raise a NoObject exception. + + :param request_id: request id. + :param content_ids: list of content id. + :param workload_id: workload id. + + :param session: The database session in use. + + :raises NoObject: If no content is found. + + :returns: list of contents. + """ + ret = orm_contents.get_contents_by_content_ids(content_ids=content_ids, request_id=request_id, session=session) + return ret + + @read_session def get_contents_by_coll_id_status(coll_id, status=None, to_json=False, session=None): """ diff --git a/main/lib/idds/core/commands.py b/main/lib/idds/core/commands.py new file mode 100644 index 00000000..dc7f8311 --- /dev/null +++ b/main/lib/idds/core/commands.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2019 + + +""" +operations related to Commands. +""" + +from idds.common.constants import CommandLocation, CommandLocking, CommandStatus +from idds.orm.base.session import read_session, transactional_session +from idds.orm import commands as orm_commands + + +@transactional_session +def add_command(request_id, cmd_type, cmd_content, + status=CommandStatus.New, workload_id=None, + transform_id=None, + username=None, retries=0, processing_id=0, + source=CommandLocation.Rest, + destination=CommandLocation.Clerk, session=None): + """ + Add a command to be submitted asynchronously to a command broker. + + :param cmd_type: The type of the cmd as a number, e.g., finished_stagein. + :param status: The status about the command + :param source: The source where the command is from. + :param cmd_content: The command cmd_content as JSON. + :param session: The database session. + """ + return orm_commands.add_command(cmd_type=cmd_type, status=status, source=source, + request_id=request_id, workload_id=workload_id, + transform_id=transform_id, username=username, retries=retries, + destination=destination, processing_id=processing_id, + cmd_content=cmd_content, session=session) + + +@read_session +def retrieve_commands(bulk_size=None, cmd_type=None, status=None, destination=None, + source=None, request_id=None, workload_id=None, transform_id=None, + processing_id=None, session=None): + """ + Retrieve up to $bulk commands. 
+ + :param bulk: Number of commands as an integer. + :param cmd_type: Return only specified cmd_type. + :param status: The status about the command + :param source: The source where the command is from. + :param session: The database session. + + :returns commands: List of dictionaries + """ + return orm_commands.retrieve_commands(bulk_size=bulk_size, cmd_type=cmd_type, + status=status, source=source, destination=destination, + request_id=request_id, workload_id=workload_id, + transform_id=transform_id, processing_id=processing_id, + session=session) + + +@transactional_session +def delete_commands(commands, session=None): + """ + Delete all commands with the given IDs. + + :param commands: The commands to delete as a list of dictionaries. + """ + return orm_commands.delete_commands(commands=commands, session=session) + + +@transactional_session +def update_commands(commands, session=None): + """ + Update all commands status with the given IDs. + + :param commands: The commands to be updated as a list of dictionaries. + """ + return orm_commands.update_commands(commands=commands, session=session) + + +@transactional_session +def get_commands_by_status(status, locking=False, period=None, session=None): + """ + Get commands + + :param status: Command status. + :param locking: Whether only retrieves unlocked items. + + :param session: The database session in use. + + :returns: list of commands. + """ + cmds = orm_commands.get_commands_by_status(status=status, locking=locking, period=period, session=session) + if locking: + parameters = [] + for cmd in cmds: + param = {'cmd_id': cmd['cmd_id'], + 'locking': CommandLocking.Locking} + parameters.append(param) + orm_commands.update_commands(parameters) + return cmds diff --git a/main/lib/idds/core/messages.py b/main/lib/idds/core/messages.py index af6b8a3c..1ca8f002 100644 --- a/main/lib/idds/core/messages.py +++ b/main/lib/idds/core/messages.py @@ -46,7 +46,7 @@ def add_messages(messages, bulk_size=1000, session=None): @read_session def retrieve_messages(bulk_size=None, msg_type=None, status=None, destination=None, source=None, request_id=None, workload_id=None, transform_id=None, - processing_id=None, session=None): + processing_id=None, retries=None, delay=None, session=None): """ Retrieve up to $bulk messages. @@ -62,6 +62,7 @@ def retrieve_messages(bulk_size=None, msg_type=None, status=None, destination=No status=status, source=source, destination=destination, request_id=request_id, workload_id=workload_id, transform_id=transform_id, processing_id=processing_id, + retries=retries, delay=delay, session=session) diff --git a/main/lib/idds/core/processings.py b/main/lib/idds/core/processings.py index 4b39c738..f193be80 100644 --- a/main/lib/idds/core/processings.py +++ b/main/lib/idds/core/processings.py @@ -6,13 +6,14 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 - 2020 +# - Wen Guan, , 2019 - 2022 """ operations related to Processings. 
""" +import datetime from idds.orm.base.session import read_session, transactional_session from idds.common.constants import ProcessingLocking, ProcessingStatus, GranularityType @@ -21,13 +22,14 @@ contents as orm_contents, messages as orm_messages, transforms as orm_transforms) -from idds.core import messages as core_messages @transactional_session def add_processing(request_id, workload_id, transform_id, status, submitter=None, substatus=ProcessingStatus.New, granularity=None, granularity_type=GranularityType.File, + new_poll_period=1, update_poll_period=10, + new_retries=0, update_retries=0, max_new_retries=3, max_update_retries=0, expired_at=None, processing_metadata=None, session=None): """ Add a processing. @@ -50,6 +52,11 @@ def add_processing(request_id, workload_id, transform_id, status, submitter=None return orm_processings.add_processing(request_id=request_id, workload_id=workload_id, transform_id=transform_id, status=status, substatus=substatus, submitter=submitter, granularity=granularity, granularity_type=granularity_type, + new_poll_period=new_poll_period, + update_poll_period=update_poll_period, + new_retries=new_retries, update_retries=update_retries, + max_new_retries=max_new_retries, + max_update_retries=max_update_retries, expired_at=expired_at, processing_metadata=processing_metadata, session=session) @@ -104,42 +111,20 @@ def get_processings_by_transform_id(transform_id=None, to_json=False, session=No @transactional_session -def get_processings_with_messaging(locking=False, bulk_size=None, session=None): - msgs = core_messages.retrieve_processing_messages(processing_id=None, bulk_size=bulk_size, session=session) - if msgs: - pr_ids = [msg['processing_id'] for msg in msgs] - if locking: - pr2s = orm_processings.get_processings_by_status(status=None, processing_ids=pr_ids, - locking=locking, locking_for_update=True, - bulk_size=None, session=session) - if pr2s: - prs = [] - for pr_id in pr_ids: - if len(prs) >= bulk_size: - break - for pr in pr2s: - if pr['processing_id'] == pr_id: - prs.append(pr) - break - else: - prs = [] - - parameters = {'locking': ProcessingLocking.Locking} - for pr in prs: - orm_processings.update_processing(processing_id=pr['processing_id'], parameters=parameters, session=session) - return prs - else: - prs = orm_processings.get_processings_by_status(status=None, processing_ids=pr_ids, locking=locking, - locking_for_update=locking, - bulk_size=bulk_size, session=session) - return prs - else: - return [] +def get_processing_by_id_status(processing_id, status=None, locking=False, session=None): + pr = orm_processings.get_processing_by_id_status(processing_id=processing_id, status=status, locking=locking, session=session) + if pr is not None and locking: + parameters = {} + parameters['locking'] = ProcessingLocking.Locking + parameters['updated_at'] = datetime.datetime.utcnow() + orm_processings.update_processing(processing_id=pr['processing_id'], parameters=parameters, session=session) + return pr @transactional_session def get_processings_by_status(status, time_period=None, locking=False, bulk_size=None, to_json=False, by_substatus=False, - with_messaging=False, for_poller=False, session=None): + not_lock=False, next_poll_at=None, for_poller=False, only_return_id=False, + locking_for_update=False, new_poll=False, update_poll=False, session=None): """ Get processing or raise a NoObject exception. @@ -153,24 +138,22 @@ def get_processings_by_status(status, time_period=None, locking=False, bulk_size :returns: Processings. 
""" - if with_messaging: - prs = get_processings_with_messaging(locking=locking, bulk_size=bulk_size, session=session) - if prs: - return prs - if locking: - if bulk_size: + if not only_return_id and bulk_size: # order by cannot work together with locking. So first select 2 * bulk_size without locking with order by. # then select with locking. proc_ids = orm_processings.get_processings_by_status(status=status, period=time_period, locking=locking, bulk_size=bulk_size * 2, to_json=False, locking_for_update=False, by_substatus=by_substatus, only_return_id=True, - for_poller=for_poller, session=session) + for_poller=for_poller, new_poll=new_poll, + update_poll=update_poll, session=session) if proc_ids: processing2s = orm_processings.get_processings_by_status(status=status, period=time_period, locking=locking, processing_ids=proc_ids, - bulk_size=None, to_json=to_json, locking_for_update=True, - by_substatus=by_substatus, + bulk_size=None, to_json=to_json, + locking_for_update=locking_for_update, + by_substatus=by_substatus, only_return_id=only_return_id, + new_poll=new_poll, update_poll=update_poll, for_poller=for_poller, session=session) if processing2s: # reqs = req2s[:bulk_size] @@ -190,15 +173,29 @@ def get_processings_by_status(status, time_period=None, locking=False, bulk_size processings = [] else: processings = orm_processings.get_processings_by_status(status=status, period=time_period, locking=locking, - bulk_size=bulk_size, to_json=to_json, locking_for_update=locking, + bulk_size=bulk_size, to_json=to_json, + locking_for_update=locking_for_update, + new_poll=new_poll, update_poll=update_poll, + only_return_id=only_return_id, by_substatus=by_substatus, for_poller=for_poller, session=session) - parameters = {'locking': ProcessingLocking.Locking} - for processing in processings: - orm_processings.update_processing(processing['processing_id'], parameters=parameters, session=session) + parameters = {} + if not not_lock: + parameters['locking'] = ProcessingLocking.Locking + if next_poll_at: + parameters['next_poll_at'] = next_poll_at + parameters['updated_at'] = datetime.datetime.utcnow() + if parameters: + for processing in processings: + if type(processing) in [dict]: + orm_processings.update_processing(processing['processing_id'], parameters=parameters, session=session) + else: + orm_processings.update_processing(processing, parameters=parameters, session=session) else: processings = orm_processings.get_processings_by_status(status=status, period=time_period, locking=locking, bulk_size=bulk_size, to_json=to_json, + new_poll=new_poll, update_poll=update_poll, + only_return_id=only_return_id, by_substatus=by_substatus, for_poller=for_poller, session=session) return processings @@ -288,23 +285,30 @@ def update_processing_with_collection_contents(updated_processing, new_processin @transactional_session -def update_processing_contents(processing_update, content_updates, update_messages=None, new_contents=None, session=None): +def update_processing_contents(update_processing, update_contents, update_messages=None, new_contents=None, + update_collections=None, messages=None, message_bulk_size=2000, session=None): """ Update processing with contents. - :param processing_update: dict with processing id and parameters. - :param content_updates: list of content files. + :param update_processing: dict with processing id and parameters. + :param update_contents: list of content files. 
""" - if content_updates: - orm_contents.update_contents(content_updates, session=session) + if update_collections: + orm_collections.update_collections(update_collections, session=session) + if update_contents: + orm_contents.update_contents(update_contents, session=session) if new_contents: orm_contents.add_contents(new_contents, session=session) - if processing_update: - orm_processings.update_processing(processing_id=processing_update['processing_id'], - parameters=processing_update['parameters'], + if update_processing: + orm_processings.update_processing(processing_id=update_processing['processing_id'], + parameters=update_processing['parameters'], session=session) if update_messages: - orm_messages.update_messages(update_messages, session=session) + orm_messages.update_messages(update_messages, bulk_size=message_bulk_size, session=session) + if messages: + if not type(messages) in [list, tuple]: + messages = [messages] + orm_messages.add_messages(messages, bulk_size=message_bulk_size, session=session) @transactional_session diff --git a/main/lib/idds/core/requests.py b/main/lib/idds/core/requests.py index 199926f1..cd78b2b1 100644 --- a/main/lib/idds/core/requests.py +++ b/main/lib/idds/core/requests.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 - 2020 +# - Wen Guan, , 2019 - 2022 """ @@ -14,9 +14,11 @@ """ import copy +import datetime from idds.common.constants import (RequestStatus, RequestLocking, WorkStatus, - CollectionType, CollectionStatus, CollectionRelationType) + CollectionType, CollectionStatus, CollectionRelationType, + MessageStatus) from idds.orm.base.session import read_session, transactional_session from idds.orm import requests as orm_requests from idds.orm import transforms as orm_transforms @@ -30,6 +32,8 @@ def create_request(scope=None, name=None, requester=None, request_type=None, username=None, userdn=None, transform_tag=None, status=RequestStatus.New, locking=RequestLocking.Idle, priority=0, lifetime=None, workload_id=None, request_metadata=None, + new_poll_period=1, update_poll_period=10, + new_retries=0, update_retries=0, max_new_retries=3, max_update_retries=0, processing_metadata=None): """ Add a request. @@ -57,6 +61,9 @@ def create_request(scope=None, name=None, requester=None, request_type=None, 'username': username, 'userdn': userdn, 'transform_tag': transform_tag, 'status': status, 'locking': locking, 'priority': priority, 'lifetime': lifetime, 'workload_id': workload_id, + 'new_poll_period': new_poll_period, 'update_poll_period': update_poll_period, + 'new_retries': new_retries, 'update_retries': update_retries, + 'max_new_retries': max_new_retries, 'max_update_retries': max_update_retries, 'request_metadata': request_metadata, 'processing_metadata': processing_metadata} return orm_requests.create_request(**kwargs) @@ -66,6 +73,8 @@ def add_request(scope=None, name=None, requester=None, request_type=None, username=None, userdn=None, transform_tag=None, status=RequestStatus.New, locking=RequestLocking.Idle, priority=0, lifetime=None, workload_id=None, request_metadata=None, + new_poll_period=1, update_poll_period=10, + new_retries=0, update_retries=0, max_new_retries=3, max_update_retries=0, processing_metadata=None, session=None): """ Add a request. @@ -85,7 +94,7 @@ def add_request(scope=None, name=None, requester=None, request_type=None, :returns: request id. 
""" - if workload_id is None and request_metadata and 'workload_id' in request_metadata: + if workload_id is None and request_metadata and 'workload_id' in request_metadata and request_metadata['workload_id']: workload_id = int(request_metadata['workload_id']) # request_metadata = convert_request_metadata_to_workflow(scope, name, workload_id, request_type, request_metadata) @@ -93,6 +102,9 @@ def add_request(scope=None, name=None, requester=None, request_type=None, 'username': username, 'userdn': userdn, 'transform_tag': transform_tag, 'status': status, 'locking': locking, 'priority': priority, 'lifetime': lifetime, 'workload_id': workload_id, + 'new_poll_period': new_poll_period, 'update_poll_period': update_poll_period, + 'new_retries': new_retries, 'update_retries': update_retries, + 'max_new_retries': max_new_retries, 'max_update_retries': max_update_retries, 'request_metadata': request_metadata, 'processing_metadata': processing_metadata, 'session': session} return orm_requests.add_request(**kwargs) @@ -113,6 +125,17 @@ def get_request_ids_by_workload_id(workload_id, session=None): return orm_requests.get_request_ids_by_workload_id(workload_id, session=session) +@transactional_session +def get_request_by_id_status(request_id, status=None, locking=False, session=None): + req = orm_requests.get_request_by_id_status(request_id=request_id, status=status, locking=locking, session=session) + if req is not None and locking: + parameters = {} + parameters['locking'] = RequestLocking.Locking + parameters['updated_at'] = datetime.datetime.utcnow() + orm_requests.update_request(request_id=req['request_id'], parameters=parameters, session=session) + return req + + @read_session def get_requests(request_id=None, workload_id=None, with_detail=False, with_request=False, with_transform=False, with_processing=False, @@ -232,6 +255,7 @@ def update_request_with_transforms(request_id, parameters, :param new_transforms: list of transforms :param update_transforms: list of transforms """ + new_tf_ids, update_tf_ids = [], [] if new_transforms: for tf in new_transforms: # tf_id = orm_transforms.add_transform(**tf, session=session) @@ -266,16 +290,17 @@ def update_request_with_transforms(request_id, parameters, orm_transforms.update_transform(transform_id=tf_id, parameters={'transform_metadata': tf['transform_metadata']}, session=session) - + new_tf_ids.append(tf_id) if update_transforms: for tr_id in update_transforms: orm_transforms.update_transform(transform_id=tr_id, parameters=update_transforms[tr_id], session=session) + update_tf_ids.append(tf_id) if new_messages: orm_messages.add_messages(new_messages, session=session) if update_messages: orm_messages.update_messages(update_messages, session=session) - return orm_requests.update_request(request_id, parameters, session=session) + return orm_requests.update_request(request_id, parameters, session=session), new_tf_ids, update_tf_ids @transactional_session @@ -296,41 +321,22 @@ def update_request_with_workprogresses(request_id, parameters, new_workprogresse @transactional_session -def get_requests_with_messaging(locking=False, bulk_size=None, session=None): +def get_operation_request_msgs(locking=False, bulk_size=None, session=None): msgs = core_messages.retrieve_request_messages(request_id=None, bulk_size=bulk_size, session=session) if msgs: - req_ids = [msg['request_id'] for msg in msgs] - if locking: - req2s = orm_requests.get_requests_by_status_type(status=None, request_ids=req_ids, - locking=locking, locking_for_update=True, - bulk_size=None, 
session=session) - if req2s: - reqs = [] - for req_id in req_ids: - if len(reqs) >= bulk_size: - break - for req in req2s: - if req['request_id'] == req_id: - reqs.append(req) - break - else: - reqs = [] - - parameters = {'locking': RequestLocking.Locking} - for req in reqs: - orm_requests.update_request(request_id=req['request_id'], parameters=parameters, session=session) - return reqs - else: - reqs = orm_requests.get_requests_by_status_type(status=None, request_ids=req_ids, locking=locking, - locking_for_update=locking, - bulk_size=bulk_size, session=session) - return reqs - else: - return [] + # req_ids = [msg['request_id'] for msg in msgs] + to_updates = [] + for msg in msgs: + to_updates.append({'msg_id': msg['msg_id'], + 'status': MessageStatus.Delivered}) + core_messages.update_messages(to_updates) + return msgs @transactional_session -def get_requests_by_status_type(status, request_type=None, time_period=None, locking=False, bulk_size=None, to_json=False, by_substatus=False, with_messaging=False, session=None): +def get_requests_by_status_type(status, request_type=None, time_period=None, locking=False, bulk_size=None, to_json=False, + by_substatus=False, not_lock=False, next_poll_at=None, new_poll=False, update_poll=False, + only_return_id=False, session=None): """ Get requests by status and type @@ -343,21 +349,19 @@ def get_requests_by_status_type(status, request_type=None, time_period=None, loc :returns: list of Request. """ - if with_messaging: - reqs = get_requests_with_messaging(locking=locking, bulk_size=bulk_size, session=session) - if reqs: - return reqs - if locking: - if bulk_size: + if not only_return_id and bulk_size: # order by cannot work together with locking. So first select 2 * bulk_size without locking with order by. # then select with locking. 
req_ids = orm_requests.get_requests_by_status_type(status, request_type, time_period, locking=locking, bulk_size=bulk_size * 2, locking_for_update=False, to_json=False, by_substatus=by_substatus, + new_poll=new_poll, update_poll=update_poll, only_return_id=True, session=session) if req_ids: req2s = orm_requests.get_requests_by_status_type(status, request_type, time_period, request_ids=req_ids, - locking=locking, locking_for_update=True, bulk_size=None, to_json=to_json, + locking=locking, locking_for_update=True, bulk_size=None, + to_json=to_json, + new_poll=new_poll, update_poll=update_poll, by_substatus=by_substatus, session=session) if req2s: # reqs = req2s[:bulk_size] @@ -378,13 +382,24 @@ def get_requests_by_status_type(status, request_type=None, time_period=None, loc else: reqs = orm_requests.get_requests_by_status_type(status, request_type, time_period, locking=locking, locking_for_update=locking, bulk_size=bulk_size, + new_poll=new_poll, update_poll=update_poll, only_return_id=only_return_id, to_json=to_json, by_substatus=by_substatus, session=session) - parameters = {'locking': RequestLocking.Locking} - for req in reqs: - orm_requests.update_request(request_id=req['request_id'], parameters=parameters, session=session) + parameters = {} + if not not_lock: + parameters['locking'] = RequestLocking.Locking + if next_poll_at: + parameters['next_poll_at'] = next_poll_at + parameters['updated_at'] = datetime.datetime.utcnow() + if parameters: + for req in reqs: + if type(req) in [dict]: + orm_requests.update_request(request_id=req['request_id'], parameters=parameters, session=session) + else: + orm_requests.update_request(request_id=req, parameters=parameters, session=session) else: reqs = orm_requests.get_requests_by_status_type(status, request_type, time_period, locking=locking, bulk_size=bulk_size, + new_poll=new_poll, update_poll=update_poll, only_return_id=only_return_id, to_json=to_json, by_substatus=by_substatus, session=session) return reqs diff --git a/main/lib/idds/core/transforms.py b/main/lib/idds/core/transforms.py index aacba801..e6c8133a 100644 --- a/main/lib/idds/core/transforms.py +++ b/main/lib/idds/core/transforms.py @@ -6,13 +6,14 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 - 2020 +# - Wen Guan, , 2019 - 2022 """ operations related to Transform. """ +import datetime import logging # from idds.common import exceptions @@ -25,13 +26,14 @@ contents as orm_contents, messages as orm_messages, processings as orm_processings) -from idds.core import messages as core_messages @transactional_session def add_transform(request_id, workload_id, transform_type, transform_tag=None, priority=0, status=TransformStatus.New, substatus=TransformStatus.New, locking=TransformLocking.Idle, - retries=0, expired_at=None, transform_metadata=None, workprogress_id=None, session=None): + new_poll_period=1, update_poll_period=10, retries=0, expired_at=None, transform_metadata=None, + new_retries=0, update_retries=0, max_new_retries=3, max_update_retries=0, + workprogress_id=None, session=None): """ Add a transform. 
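A hedged sketch of how an agent loop might call the reworked get_requests_by_status_type: new_poll=True restricts the result to rows whose new_poll_period has elapsed since updated_at, and because not_lock defaults to False every returned request is marked Locking in the same transaction so concurrent daemons skip it. The status list, bulk size and handler are illustrative.

    # Hedged polling-loop sketch (status list, bulk size and handler are assumptions).
    from idds.common.constants import RequestStatus
    from idds.core.requests import get_requests_by_status_type

    def poll_new_requests():
        reqs = get_requests_by_status_type(status=[RequestStatus.New],
                                           locking=True, bulk_size=16, new_poll=True)
        for req in reqs:
            handle_new_request(req)  # hypothetical per-request handler
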
@@ -55,6 +57,11 @@ def add_transform(request_id, workload_id, transform_type, transform_tag=None, p transform_type=transform_type, transform_tag=transform_tag, priority=priority, status=status, substatus=substatus, locking=locking, retries=retries, + new_poll_period=new_poll_period, + update_poll_period=update_poll_period, + new_retries=new_retries, update_retries=update_retries, + max_new_retries=max_new_retries, + max_update_retries=max_update_retries, expired_at=expired_at, transform_metadata=transform_metadata, workprogress_id=workprogress_id, session=session) return transform_id @@ -76,6 +83,17 @@ def get_transform(transform_id, to_json=False, session=None): return orm_transforms.get_transform(transform_id=transform_id, to_json=to_json, session=session) +@transactional_session +def get_transform_by_id_status(transform_id, status=None, locking=False, session=None): + tf = orm_transforms.get_transform_by_id_status(transform_id=transform_id, status=status, locking=locking, session=session) + if tf is not None and locking: + parameters = {} + parameters['locking'] = TransformLocking.Locking + parameters['updated_at'] = datetime.datetime.utcnow() + orm_transforms.update_transform(transform_id=tf['transform_id'], parameters=parameters, session=session) + return tf + + @read_session def get_transforms_with_input_collection(transform_type, transform_tag, coll_scope, coll_name, to_json=False, session=None): """ @@ -132,41 +150,9 @@ def get_transforms(request_id=None, workload_id=None, transform_id=None, to_json @transactional_session -def get_transforms_with_messaging(locking=False, bulk_size=None, session=None): - msgs = core_messages.retrieve_transform_messages(transform_id=None, bulk_size=bulk_size, session=session) - if msgs: - tf_ids = [msg['transform_id'] for msg in msgs] - if locking: - tf2s = orm_transforms.get_transforms_by_status(status=None, transform_ids=tf_ids, - locking=locking, locking_for_update=True, - bulk_size=None, session=session) - if tf2s: - transforms = [] - for tf_id in tf_ids: - if len(transforms) >= bulk_size: - break - for tf in tf2s: - if tf['transform_id'] == tf_id: - transforms.append(tf) - break - else: - transforms = [] - - parameters = {'locking': TransformLocking.Locking} - for tf in transforms: - orm_transforms.update_transform(transform_id=tf['transform_id'], parameters=parameters, session=session) - return transforms - else: - transforms = orm_transforms.get_transforms_by_status(status=None, transform_ids=tf_ids, locking=locking, - locking_for_update=locking, - bulk_size=bulk_size, session=session) - return transforms - else: - return [] - - -@transactional_session -def get_transforms_by_status(status, period=None, locking=False, bulk_size=None, to_json=False, by_substatus=False, with_messaging=False, session=None): +def get_transforms_by_status(status, period=None, locking=False, bulk_size=None, to_json=False, by_substatus=False, + new_poll=False, update_poll=False, only_return_id=False, + not_lock=False, next_poll_at=None, session=None): """ Get transforms or raise a NoObject exception. @@ -179,23 +165,20 @@ def get_transforms_by_status(status, period=None, locking=False, bulk_size=None, :returns: list of transform. """ - if with_messaging: - transforms = get_transforms_with_messaging(locking=locking, bulk_size=bulk_size, session=session) - if transforms: - return transforms - if locking: - if bulk_size: + if not only_return_id and bulk_size: # order by cannot work together with locking. So first select 2 * bulk_size without locking with order by. 
# then select with locking. tf_ids = orm_transforms.get_transforms_by_status(status=status, period=period, locking=locking, bulk_size=bulk_size * 2, locking_for_update=False, to_json=False, only_return_id=True, + new_poll=new_poll, update_poll=update_poll, by_substatus=by_substatus, session=session) if tf_ids: transform2s = orm_transforms.get_transforms_by_status(status=status, period=period, locking=locking, bulk_size=None, locking_for_update=True, to_json=to_json, transform_ids=tf_ids, + new_poll=new_poll, update_poll=update_poll, by_substatus=by_substatus, session=session) if transform2s: # reqs = req2s[:bulk_size] @@ -217,14 +200,27 @@ def get_transforms_by_status(status, period=None, locking=False, bulk_size=None, transforms = orm_transforms.get_transforms_by_status(status=status, period=period, locking=locking, locking_for_update=locking, bulk_size=bulk_size, to_json=to_json, + new_poll=new_poll, update_poll=update_poll, + only_return_id=only_return_id, by_substatus=by_substatus, session=session) - parameters = {'locking': TransformLocking.Locking} - for transform in transforms: - orm_transforms.update_transform(transform_id=transform['transform_id'], parameters=parameters, session=session) + parameters = {} + if not not_lock: + parameters['locking'] = TransformLocking.Locking + if next_poll_at: + parameters['next_poll_at'] = next_poll_at + parameters['updated_at'] = datetime.datetime.utcnow() + if parameters: + for transform in transforms: + if type(transform) in [dict]: + orm_transforms.update_transform(transform_id=transform['transform_id'], parameters=parameters, session=session) + else: + orm_transforms.update_transform(transform_id=transform, parameters=parameters, session=session) else: transforms = orm_transforms.get_transforms_by_status(status=status, period=period, locking=locking, bulk_size=bulk_size, to_json=to_json, + new_poll=new_poll, update_poll=update_poll, + only_return_id=only_return_id, by_substatus=by_substatus, session=session) return transforms @@ -271,6 +267,8 @@ def add_transform_outputs(transform, transform_parameters, input_collections=Non """ work = transform['transform_metadata']['work'] + new_pr_ids, update_pr_ids = [], [] + if input_collections: for coll in input_collections: collection = coll['collection'] @@ -312,9 +310,11 @@ def add_transform_outputs(transform, transform_parameters, input_collections=Non if new_processing: # print(new_processing) processing_id = orm_processings.add_processing(**new_processing, session=session) + new_pr_ids.append(processing_id) if update_processing: for proc_id in update_processing: orm_processings.update_processing(processing_id=proc_id, parameters=update_processing[proc_id], session=session) + update_pr_ids.append(proc_id) if messages: if not type(messages) in [list, tuple]: @@ -343,6 +343,7 @@ def add_transform_outputs(transform, transform_parameters, input_collections=Non orm_transforms.update_transform(transform_id=transform['transform_id'], parameters=transform_parameters, session=session) + return new_pr_ids, update_pr_ids @transactional_session @@ -386,7 +387,7 @@ def get_transform_input_output_maps(transform_id, input_coll_ids, output_coll_id :param transform_id: transform id. 
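Both get_requests_by_status_type and get_transforms_by_status above rely on the workaround spelled out in their comments: ORDER BY does not combine with row locks, so an oversized batch of candidate ids is selected first (ordered, unlocked) and only those ids are re-selected with FOR UPDATE SKIP LOCKED. A minimal standalone SQLAlchemy sketch of that pattern; the Item model and the session are illustrative, not part of this patch.

    # Standalone sketch of the two-phase select-then-lock idiom (Item is illustrative).
    import datetime
    from sqlalchemy import Column, DateTime, Integer, String, select
    from sqlalchemy.orm import declarative_base

    Base = declarative_base()

    class Item(Base):
        __tablename__ = 'items'
        id = Column(Integer, primary_key=True)
        status = Column(String(16))
        updated_at = Column(DateTime, default=datetime.datetime.utcnow)

    def pick_and_lock(session, wanted_status, bulk_size):
        # Phase 1: ordered candidate ids, no row locks, oversized on purpose.
        id_stmt = (select(Item.id)
                   .where(Item.status == wanted_status)
                   .order_by(Item.updated_at)
                   .limit(bulk_size * 2))
        candidate_ids = [row[0] for row in session.execute(id_stmt)]

        # Phase 2: lock only those ids, skipping rows already grabbed by another worker.
        lock_stmt = (select(Item)
                     .where(Item.id.in_(candidate_ids))
                     .with_for_update(skip_locked=True))
        return session.execute(lock_stmt).scalars().all()[:bulk_size]
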
""" - contents = orm_contents.get_contents_by_transform(transform_id=transform_id, session=session) + contents = orm_contents.get_contents_by_request_transform(transform_id=transform_id, session=session) ret = {} for content in contents: map_id = content['map_id'] @@ -618,7 +619,11 @@ def get_work_name_to_coll_map(request_id): for coll in colls: if coll['transform_id'] == transform_id: if coll['relation_type'] == CollectionRelationType.Input: - work_name_to_coll_map[work_name]['inputs'].append({'coll_id': coll['coll_id'], 'scope': coll['scope'], 'name': coll['name']}) + work_name_to_coll_map[work_name]['inputs'].append({'coll_id': coll['coll_id'], 'transform_id': coll['transform_id'], + 'workload_id': coll['workload_id'], + 'scope': coll['scope'], 'name': coll['name']}) elif coll['relation_type'] == CollectionRelationType.Output: - work_name_to_coll_map[work_name]['outputs'].append({'coll_id': coll['coll_id'], 'scope': coll['scope'], 'name': coll['name']}) + work_name_to_coll_map[work_name]['outputs'].append({'coll_id': coll['coll_id'], 'transform_id': coll['transform_id'], + 'workload_id': coll['workload_id'], + 'scope': coll['scope'], 'name': coll['name']}) return work_name_to_coll_map diff --git a/main/lib/idds/orm/base/models.py b/main/lib/idds/orm/base/models.py index f35e557b..fe951b41 100644 --- a/main/lib/idds/orm/base/models.py +++ b/main/lib/idds/orm/base/models.py @@ -16,7 +16,7 @@ import datetime from enum import Enum -from sqlalchemy import BigInteger, Boolean, Column, DateTime, Integer, String, event, DDL +from sqlalchemy import BigInteger, Boolean, Column, DateTime, Integer, String, event, DDL, Interval from sqlalchemy.ext.compiler import compiles # from sqlalchemy.ext.hybrid import hybrid_property from sqlalchemy.orm import object_mapper @@ -30,7 +30,9 @@ CollectionRelationType, ContentType, ContentRelationType, ContentStatus, ContentLocking, GranularityType, MessageType, MessageStatus, MessageLocking, - MessageSource, MessageDestination) + MessageSource, MessageDestination, + CommandType, CommandStatus, CommandLocking, + CommandLocation) from idds.common.utils import date_to_str from idds.orm.base.enum import EnumSymbol from idds.orm.base.types import JSON, JSONString, EnumWithValue @@ -142,12 +144,19 @@ class Request(BASE, ModelBase): priority = Column(Integer()) status = Column(EnumWithValue(RequestStatus)) substatus = Column(EnumWithValue(RequestStatus), default=0) + oldstatus = Column(EnumWithValue(RequestStatus), default=0) locking = Column(EnumWithValue(RequestLocking)) created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow) updated_at = Column("updated_at", DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow) next_poll_at = Column("next_poll_at", DateTime, default=datetime.datetime.utcnow) accessed_at = Column("accessed_at", DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow) expired_at = Column("expired_at", DateTime) + new_retries = Column(Integer(), default=0) + update_retries = Column(Integer(), default=0) + max_new_retries = Column(Integer(), default=3) + max_update_retries = Column(Integer(), default=0) + new_poll_period = Column(Interval(), default=datetime.timedelta(seconds=1)) + update_poll_period = Column(Interval(), default=datetime.timedelta(seconds=10)) errors = Column(JSONString(1024)) _request_metadata = Column('request_metadata', JSON()) _processing_metadata = Column('processing_metadata', JSON()) @@ -258,6 +267,7 @@ class Transform(BASE, ModelBase): safe2get_output_from_input 
= Column(Integer()) status = Column(EnumWithValue(TransformStatus)) substatus = Column(EnumWithValue(TransformStatus), default=0) + oldstatus = Column(EnumWithValue(TransformStatus), default=0) locking = Column(EnumWithValue(TransformLocking)) retries = Column(Integer(), default=0) created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow) @@ -266,6 +276,13 @@ class Transform(BASE, ModelBase): started_at = Column("started_at", DateTime) finished_at = Column("finished_at", DateTime) expired_at = Column("expired_at", DateTime) + new_retries = Column(Integer(), default=0) + update_retries = Column(Integer(), default=0) + max_new_retries = Column(Integer(), default=3) + max_update_retries = Column(Integer(), default=0) + new_poll_period = Column(Interval(), default=datetime.timedelta(seconds=1)) + update_poll_period = Column(Interval(), default=datetime.timedelta(seconds=10)) + errors = Column(JSONString(1024)) _transform_metadata = Column('transform_metadata', JSON()) _running_metadata = Column('running_metadata', JSON()) @@ -348,6 +365,7 @@ class Processing(BASE, ModelBase): workload_id = Column(Integer()) status = Column(EnumWithValue(ProcessingStatus)) substatus = Column(EnumWithValue(ProcessingStatus), default=0) + oldstatus = Column(EnumWithValue(ProcessingStatus), default=0) locking = Column(EnumWithValue(ProcessingLocking)) submitter = Column(String(20)) submitted_id = Column(Integer()) @@ -360,6 +378,13 @@ class Processing(BASE, ModelBase): submitted_at = Column("submitted_at", DateTime) finished_at = Column("finished_at", DateTime) expired_at = Column("expired_at", DateTime) + new_retries = Column(Integer(), default=0) + update_retries = Column(Integer(), default=0) + max_new_retries = Column(Integer(), default=3) + max_update_retries = Column(Integer(), default=0) + new_poll_period = Column(Interval(), default=datetime.timedelta(seconds=1)) + update_poll_period = Column(Interval(), default=datetime.timedelta(seconds=10)) + errors = Column(JSONString(1024)) _processing_metadata = Column('processing_metadata', JSON()) _running_metadata = Column('running_metadata', JSON()) output_metadata = Column(JSON()) @@ -543,6 +568,7 @@ class Message(BASE, ModelBase): transform_id = Column(Integer()) processing_id = Column(Integer()) num_contents = Column(Integer()) + retries = Column(Integer(), default=0) created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow) updated_at = Column("updated_at", DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow) msg_content = Column(JSON()) @@ -553,17 +579,46 @@ class Message(BASE, ModelBase): Index('MESSAGES_TYPE_ST_PR_IDX', 'msg_type', 'status', 'destination', 'processing_id')) +class Command(BASE, ModelBase): + """Represents the operations commands""" + __tablename__ = 'commands' + cmd_id = Column(BigInteger().with_variant(Integer, "sqlite"), + Sequence('COMMAND_ID_SEQ', schema=DEFAULT_SCHEMA_NAME), + primary_key=True) + request_id = Column(BigInteger().with_variant(Integer, "sqlite")) + workload_id = Column(Integer()) + transform_id = Column(Integer()) + processing_id = Column(Integer()) + cmd_type = Column(EnumWithValue(CommandType)) + status = Column(EnumWithValue(CommandStatus)) + substatus = Column(Integer()) + locking = Column(EnumWithValue(CommandLocking)) + username = Column(String(20)) + retries = Column(Integer(), default=0) + source = Column(EnumWithValue(CommandLocation)) + destination = Column(EnumWithValue(CommandLocation)) + created_at = Column("created_at", DateTime, 
default=datetime.datetime.utcnow) + updated_at = Column("updated_at", DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow) + cmd_content = Column(JSON()) + errors = Column(JSONString(1024)) + + _table_args = (PrimaryKeyConstraint('cmd_id', name='COMMANDS_PK'), + Index('COMMANDS_TYPE_ST_IDX', 'cmd_type', 'status', 'destination', 'request_id'), + Index('COMMANDS_TYPE_ST_TF_IDX', 'cmd_type', 'status', 'destination', 'transform_id'), + Index('COMMANDS_TYPE_ST_PR_IDX', 'cmd_type', 'status', 'destination', 'processing_id')) + + def register_models(engine): """ Creates database tables for all models with the given engine """ # models = (Request, Workprogress, Transform, Workprogress2transform, Processing, Collection, Content, Health, Message) - models = (Request, Transform, Processing, Collection, Content, Health, Message) + models = (Request, Transform, Processing, Collection, Content, Health, Message, Command) for model in models: - if not engine.has_table(model.__tablename__, model.metadata.schema): - model.metadata.create_all(engine) # pylint: disable=maybe-no-member + # if not engine.has_table(model.__tablename__, model.metadata.schema): + model.metadata.create_all(engine) # pylint: disable=maybe-no-member def unregister_models(engine): @@ -572,7 +627,7 @@ def unregister_models(engine): """ # models = (Request, Workprogress, Transform, Workprogress2transform, Processing, Collection, Content, Health, Message) - models = (Request, Transform, Processing, Collection, Content, Health, Message) + models = (Request, Transform, Processing, Collection, Content, Health, Message, Command) for model in models: model.metadata.drop_all(engine) # pylint: disable=maybe-no-member diff --git a/main/lib/idds/orm/base/types.py b/main/lib/idds/orm/base/types.py index 55dd3afb..831c62ba 100644 --- a/main/lib/idds/orm/base/types.py +++ b/main/lib/idds/orm/base/types.py @@ -36,6 +36,8 @@ class GUID(TypeDecorator): """ impl = CHAR + cache_ok = True + def generate_uuid(self): return str(uuid.uuid4()).replace('-', '').lower() @@ -84,6 +86,8 @@ class JSON(TypeDecorator): impl = types.JSON + cache_ok = True + def load_dialect_impl(self, dialect): if dialect.name == 'postgresql': return dialect.type_descriptor(JSONB()) @@ -126,6 +130,8 @@ class JSONString(TypeDecorator): impl = types.JSON + cache_ok = True + def __init__(self, length=1024, *args, **kwargs): super(JSONString, self).__init__(*args, **kwargs) self._length = length @@ -170,6 +176,7 @@ class EnumWithValue(TypeDecorator): The default would have stored the enum's *name* (ie the string). """ impl = Integer + cache_ok = True def __init__(self, enumtype, *args, **kwargs): super(EnumWithValue, self).__init__(*args, **kwargs) diff --git a/main/lib/idds/orm/collections.py b/main/lib/idds/orm/collections.py index 45c06624..9d17dec5 100644 --- a/main/lib/idds/orm/collections.py +++ b/main/lib/idds/orm/collections.py @@ -316,6 +316,38 @@ def get_collections(scope=None, name=None, request_id=None, workload_id=None, tr raise error +@read_session +def get_collections_by_request_ids(request_ids, session=None): + """" + Get collections by a list of request ids. + + :param request_ids: list of request ids. + + :return collections: list of collections. 
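Setting cache_ok on the custom TypeDecorator subclasses above opts them into SQLAlchemy 1.4's statement cache; without it SQLAlchemy warns and skips caching for statements that use these types. A minimal standalone sketch of the attribute on an illustrative type, not the actual iDDS GUID class.

    # Minimal TypeDecorator with cache_ok (illustrative type, not the iDDS one).
    from sqlalchemy.types import TypeDecorator, CHAR

    class LowercaseHex(TypeDecorator):
        impl = CHAR
        cache_ok = True  # safe to cache: behaviour does not depend on per-instance state

        def process_bind_param(self, value, dialect):
            return None if value is None else str(value).replace('-', '').lower()

        def process_result_value(self, value, dialect):
            return value
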
+ """ + try: + if request_ids and type(request_ids) not in (list, tuple): + request_ids = [request_ids] + + query = session.query(models.Collection.coll_id, + models.Collection.request_id, + models.Collection.transform_id, + models.Collection.workload_id) + if request_ids: + query = query.filter(models.Collection.request_id.in_(request_ids)) + + tmp = query.all() + rets = [] + if tmp: + for t in tmp: + # rets.append(t.to_dict()) + t2 = dict(zip(t.keys(), t)) + rets.append(t2) + return rets + except Exception as error: + raise error + + @transactional_session def update_collection(coll_id, parameters, session=None): """ diff --git a/main/lib/idds/orm/commands.py b/main/lib/idds/orm/commands.py new file mode 100644 index 00000000..27e3fb5e --- /dev/null +++ b/main/lib/idds/orm/commands.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2019 + + +""" +operations related to Commands. +""" + +import re +import datetime + +import sqlalchemy +from sqlalchemy import or_ +from sqlalchemy.exc import DatabaseError, IntegrityError + +from idds.common import exceptions +from idds.common.constants import CommandLocation, CommandLocking +from idds.orm.base import models +from idds.orm.base.session import read_session, transactional_session + + +@transactional_session +def add_command(cmd_type, status, request_id, workload_id, transform_id, + username=None, retries=0, processing_id=None, + source=CommandLocation.Rest, destination=CommandLocation.Clerk, + cmd_content=None, session=None): + """ + Add a command to be submitted asynchronously to a command broker. + + :param cmd_type: The type of the cmd as a number, e.g., finished_stagein. + :param status: The status about the command + :param source: The source where the command is from. + :param request_id: The request id. + :param workload_id: The workload id. + :param transform_id: The transform id. + :param cmd_content: The command cmd_content as JSON. + :param session: The database session. 
+ """ + + try: + cmd = models.Command(request_id=request_id, workload_id=workload_id, + transform_id=transform_id, cmd_type=cmd_type, + status=status, substatus=0, locking=0, + source=source, destination=destination, + username=username, retries=retries, + processing_id=processing_id, + cmd_content=cmd_content) + + cmd.save(session=session) + cmd_id = cmd.cmd_id + return cmd_id + except TypeError as e: + raise exceptions.DatabaseException('Invalid JSON for cmd_content: %s' % str(e)) + except DatabaseError as e: + if re.match('.*ORA-12899.*', e.args[0]) \ + or re.match('.*1406.*', e.args[0]): + raise exceptions.DatabaseException('Could not persist command, cmd_content too large: %s' % str(e)) + else: + raise exceptions.DatabaseException('Could not persist command: %s' % str(e)) + + +@transactional_session +def update_commands(commands, bulk_size=1000, session=None): + try: + session.bulk_update_mappings(models.Command, commands) + except TypeError as e: + raise exceptions.DatabaseException('Invalid JSON for cmd_content: %s' % str(e)) + except DatabaseError as e: + if re.match('.*ORA-12899.*', e.args[0]) \ + or re.match('.*1406.*', e.args[0]): + raise exceptions.DatabaseException('Could not persist command, cmd_content too large: %s' % str(e)) + else: + raise exceptions.DatabaseException('Could not persist command: %s' % str(e)) + + +@read_session +def retrieve_command(cmd_type=None, status=None, source=None, + destination=None, request_id=None, workload_id=None, + transform_id=None, processing_id=None, bulk_size=None, session=None): + """ + Retrieve up to $bulk command. + + :param bulk: Number of command as an integer. + :param cmd_type: Return only specified cmd_type. + :param status: The status about the command + :param source: The source where the command is from. + :param session: The database session. + + :returns command: List of dictionaries + """ + command = [] + try: + query = session.query(models.Command) + if request_id is not None: + query = query.with_hint(models.Command, "INDEX(COMMANDS COMMANDS_TYPE_ST_IDX)", 'oracle') + elif transform_id: + query = query.with_hint(models.Command, "INDEX(COMMANDS COMMANDS_TYPE_ST_TF_IDX)", 'oracle') + elif processing_id is not None: + query = query.with_hint(models.Command, "INDEX(COMMANDS COMMANDS_TYPE_ST_PR_IDX)", 'oracle') + else: + query = query.with_hint(models.Command, "INDEX(COMMANDS COMMANDS_TYPE_ST_IDX)", 'oracle') + + if cmd_type is not None: + query = query.filter_by(cmd_type=cmd_type) + if status is not None: + query = query.filter_by(status=status) + if source is not None: + query = query.filter_by(source=source) + if destination is not None: + query = query.filter_by(destination=destination) + if request_id is not None: + query = query.filter_by(request_id=request_id) + if workload_id is not None: + query = query.filter_by(workload_id=workload_id) + if transform_id is not None: + query = query.filter_by(transform_id=transform_id) + if processing_id is not None: + query = query.filter_by(processing_id=processing_id) + + if bulk_size: + query = query.order_by(models.Command.created_at).limit(bulk_size) + # query = query.with_for_update(nowait=True) + + tmp = query.all() + if tmp: + for t in tmp: + command.append(t.to_dict()) + return command + except IntegrityError as e: + raise exceptions.DatabaseException(e.args) + + +@transactional_session +def delete_command(command, session=None): + """ + Delete all command with the given IDs. + + :param command: The command to delete as a list of dictionaries. 
+ """ + command_condition = [] + for command in command: + command_condition.append(models.Command.cmd_id == command['cmd_id']) + + try: + if command_condition: + session.query(models.Command).\ + with_hint(models.Command, "index(command COMMANDS_PK)", 'oracle').\ + filter(or_(*command_condition)).\ + delete(synchronize_session=False) + except IntegrityError as e: + raise exceptions.DatabaseException(e.args) + + +@transactional_session +def get_commands_by_status(status, locking=False, period=None, bulk_size=None, session=None): + """ + Get commands + + :param status: Command status. + :param locking: Whether only retrieves unlocked items. + + :param session: The database session in use. + + :returns: list of commands. + """ + try: + if status: + if not isinstance(status, (list, tuple)): + status = [status] + if len(status) == 1: + status = [status[0], status[0]] + + query = session.query(models.Command) + if status: + query = query.filter(models.Command.status.in_(status)) + + if period: + query = query.filter(models.Command.updated_at <= datetime.datetime.utcnow() - datetime.timedelta(seconds=period)) + + if locking: + query = query.filter(models.Command.locking == CommandLocking.Idle) + # query = query.with_for_update(skip_locked=True) + # query = query.order_by(asc(models.Command.updated_at)) + + if bulk_size: + query = query.limit(bulk_size) + + tmp = query.all() + rets = [] + if tmp: + rets = [t.to_dict() for t in tmp] + return rets + except sqlalchemy.orm.exc.NoResultFound as error: + raise exceptions.NoObject('No commands attached with status (%s): %s' % + (status, error)) + except Exception as error: + raise error diff --git a/main/lib/idds/orm/contents.py b/main/lib/idds/orm/contents.py index ac464506..cc9a46ec 100644 --- a/main/lib/idds/orm/contents.py +++ b/main/lib/idds/orm/contents.py @@ -331,12 +331,13 @@ def get_contents(scope=None, name=None, coll_id=None, status=None, to_json=False @read_session -def get_contents_by_transform(transform_id, to_json=False, session=None): +def get_contents_by_request_transform(request_id=None, transform_id=None, workload_id=None, status=None, status_updated=False, session=None): """ Get content or raise a NoObject exception. + :param request_id: request id. :param transform_id: transform id. - :param to_json: return json format. + :param workload_id: workload id. :param session: The database session in use. 
@@ -346,19 +347,29 @@ def get_contents_by_transform(transform_id, to_json=False, session=None): """ try: + if status is not None: + if not isinstance(status, (tuple, list)): + status = [status] + query = session.query(models.Content) - query = query.with_hint(models.Content, "INDEX(CONTENTS CONTENT_ID_UQ)", 'oracle') - query = query.filter(models.Content.transform_id == transform_id) - query = query.order_by(asc(models.Content.map_id)) + query = query.with_hint(models.Content, "INDEX(CONTENTS CONTENTS_REQ_TF_COLL_IDX)", 'oracle') + if request_id: + query = query.filter(models.Content.request_id == request_id) + if transform_id: + query = query.filter(models.Content.transform_id == transform_id) + if workload_id: + query = query.filter(models.Content.workload_id == workload_id) + if status is not None: + query = query.filter(models.Content.substatus.in_(status)) + if status_updated: + query = query.filter(models.Content.status != models.Content.substatus) + query = query.order_by(asc(models.Content.request_id), asc(models.Content.transform_id), asc(models.Content.map_id)) tmp = query.all() rets = [] if tmp: for t in tmp: - if to_json: - rets.append(t.to_dict_json()) - else: - rets.append(t.to_dict()) + rets.append(t.to_dict()) return rets except sqlalchemy.orm.exc.NoResultFound as error: raise exceptions.NoObject('No record can be found with (transform_id=%s): %s' % @@ -367,6 +378,64 @@ def get_contents_by_transform(transform_id, to_json=False, session=None): raise error +@read_session +def get_contents_by_content_ids(content_ids, request_id=None, bulk_size=1000, session=None): + """ + Get content or raise a NoObject exception. + + :param request_id: request id. + :param content_ids: list of content id. + :param workload_id: workload id. + + :param session: The database session in use. + + :raises NoObject: If no content is founded. + + :returns: list of contents. + """ + try: + if content_ids: + if not isinstance(content_ids, (list, tuple)): + content_ids = [content_ids] + + chunks = [content_ids[i:i + bulk_size] for i in range(0, len(content_ids), bulk_size)] + ret = [] + for chunk in chunks: + ret_chunk = get_contents_by_content_ids_real(chunk, request_id=request_id) + ret = ret + ret_chunk + return ret + except Exception as error: + raise error + + +@read_session +def get_contents_by_content_ids_real(content_ids, request_id=None, session=None): + """ + Get content or raise a NoObject exception. + + :param request_id: request id. + :param content_ids: list of content id. + :param workload_id: workload id. + + :param session: The database session in use. + + :raises NoObject: If no content is founded. + + :returns: list of contents. + """ + try: + query = session.query(models.Content) + query = query.with_hint(models.Content, "INDEX(CONTENTS CONTENTS_REQ_TF_COLL_IDX)", 'oracle') + if request_id: + query = query.filter(models.Content.request_id == request_id) + query = query.filter(models.Content.content_id.in_(content_ids)) + ret = query.all() + rets = [t.to_dict() for t in ret] + return rets + except Exception as error: + raise error + + @read_session def get_input_contents(request_id, coll_id, name=None, to_json=False, session=None): """ diff --git a/main/lib/idds/orm/messages.py b/main/lib/idds/orm/messages.py index 15272ea0..e316f3f3 100644 --- a/main/lib/idds/orm/messages.py +++ b/main/lib/idds/orm/messages.py @@ -13,6 +13,7 @@ operations related to Messages. 
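get_contents_by_content_ids above splits large id lists into slices of 1000 before querying, because Oracle rejects IN-lists longer than 1000 entries (ORA-01795). The same idea, reduced to a standalone sketch with a placeholder query helper.

    # Standalone sketch of the chunked IN-list idea (fetch_chunk is a placeholder).
    def chunked(ids, size=1000):
        for i in range(0, len(ids), size):
            yield ids[i:i + size]

    def fetch_contents(content_ids):
        results = []
        for chunk in chunked(list(content_ids)):
            results.extend(fetch_chunk(chunk))  # placeholder for the real per-chunk query
        return results
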
""" +import datetime import re import copy @@ -116,7 +117,8 @@ def update_messages(messages, bulk_size=1000, session=None): @read_session def retrieve_messages(bulk_size=1000, msg_type=None, status=None, source=None, destination=None, request_id=None, workload_id=None, - transform_id=None, processing_id=None, session=None): + transform_id=None, processing_id=None, + retries=None, delay=None, session=None): """ Retrieve up to $bulk messages. @@ -156,6 +158,10 @@ def retrieve_messages(bulk_size=1000, msg_type=None, status=None, source=None, query = query.filter_by(transform_id=transform_id) if processing_id is not None: query = query.filter_by(processing_id=processing_id) + if retries: + query = query.filter_by(retries=retries) + if delay: + query = query.filter(models.Message.updated_at < datetime.datetime.utcnow() - datetime.timedelta(seconds=delay)) if bulk_size: query = query.order_by(models.Message.created_at).limit(bulk_size) diff --git a/main/lib/idds/orm/processings.py b/main/lib/idds/orm/processings.py index d7130544..772801d4 100644 --- a/main/lib/idds/orm/processings.py +++ b/main/lib/idds/orm/processings.py @@ -27,6 +27,8 @@ def create_processing(request_id, workload_id, transform_id, status=ProcessingStatus.New, locking=ProcessingLocking.Idle, submitter=None, granularity=None, granularity_type=GranularityType.File, expired_at=None, processing_metadata=None, + new_poll_period=1, update_poll_period=10, + new_retries=0, update_retries=0, max_new_retries=3, max_update_retries=0, substatus=ProcessingStatus.New, output_metadata=None): """ Create a processing. @@ -48,14 +50,25 @@ def create_processing(request_id, workload_id, transform_id, status=ProcessingSt status=status, substatus=substatus, locking=locking, submitter=submitter, granularity=granularity, granularity_type=granularity_type, expired_at=expired_at, processing_metadata=processing_metadata, + new_retries=new_retries, update_retries=update_retries, + max_new_retries=max_new_retries, max_update_retries=max_update_retries, output_metadata=output_metadata) + + if new_poll_period: + new_poll_period = datetime.timedelta(seconds=new_poll_period) + new_processing.new_poll_period = new_poll_period + if update_poll_period: + update_poll_period = datetime.timedelta(seconds=update_poll_period) + new_processing.update_poll_period = update_poll_period return new_processing @transactional_session def add_processing(request_id, workload_id, transform_id, status=ProcessingStatus.New, locking=ProcessingLocking.Idle, submitter=None, substatus=ProcessingStatus.New, - granularity=None, granularity_type=GranularityType.File, expired_at=None, processing_metadata=None, + granularity=None, granularity_type=GranularityType.File, expired_at=None, + processing_metadata=None, new_poll_period=1, update_poll_period=10, + new_retries=0, update_retries=0, max_new_retries=3, max_update_retries=0, output_metadata=None, session=None): """ Add a processing. 
@@ -79,7 +92,11 @@ def add_processing(request_id, workload_id, transform_id, status=ProcessingStatu try: new_processing = create_processing(request_id=request_id, workload_id=workload_id, transform_id=transform_id, status=status, substatus=substatus, locking=locking, submitter=submitter, - granularity=granularity, granularity_type=granularity_type, expired_at=expired_at, + granularity=granularity, granularity_type=granularity_type, + expired_at=expired_at, new_poll_period=new_poll_period, + update_poll_period=update_poll_period, + new_retries=new_retries, update_retries=update_retries, + max_new_retries=max_new_retries, max_update_retries=max_update_retries, processing_metadata=processing_metadata, output_metadata=output_metadata) new_processing.save(session=session) proc_id = new_processing.processing_id @@ -123,6 +140,46 @@ def get_processing(processing_id, to_json=False, session=None): raise error +@read_session +def get_processing_by_id_status(processing_id, status=None, locking=False, session=None): + """ + Get a processing or raise a NoObject exception. + + :param processing_id: The id of the processing. + :param status: request status. + :param locking: the locking status. + + :param session: The database session in use. + + :raises NoObject: If no request is founded. + + :returns: Processing. + """ + + try: + query = session.query(models.Processing).with_hint(models.Processing, "INDEX(PROCESSINGS PROCESSINGS_PK)", 'oracle')\ + .filter(models.Processing.processing_id == processing_id) + + if status: + if not isinstance(status, (list, tuple)): + status = [status] + if len(status) == 1: + status = [status[0], status[0]] + query = query.filter(models.Processing.status.in_(status)) + + if locking: + query = query.filter(models.Processing.locking == ProcessingLocking.Idle) + query = query.with_for_update(skip_locked=True) + + ret = query.first() + if not ret: + return None + else: + return ret.to_dict() + except sqlalchemy.orm.exc.NoResultFound as error: + raise exceptions.NoObject('processing processing_id: %s cannot be found: %s' % (processing_id, error)) + + @read_session def get_processings(request_id=None, workload_id=None, transform_id=None, to_json=False, session=None): """ @@ -203,7 +260,7 @@ def get_processings_by_transform_id(transform_id=None, to_json=False, session=No @transactional_session def get_processings_by_status(status, period=None, processing_ids=[], locking=False, locking_for_update=False, bulk_size=None, submitter=None, to_json=False, by_substatus=False, only_return_id=False, - for_poller=False, session=None): + new_poll=False, update_poll=False, for_poller=False, session=None): """ Get processing or raise a NoObject exception. 
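add_processing (via create_processing above) now records per-processing poll periods and retry limits; integer seconds are converted to timedelta before being stored in the new Interval columns. A hedged registration sketch; all ids and the metadata payload are illustrative.

    # Hedged sketch: registering a processing with the new poll/retry bookkeeping.
    from idds.orm.processings import add_processing

    processing_id = add_processing(request_id=1234, workload_id=5678, transform_id=910,
                                   new_poll_period=10,        # seconds; stored as an Interval
                                   update_poll_period=600,    # seconds between status re-polls
                                   max_new_retries=3, max_update_retries=0,
                                   processing_metadata={'task_param': {}})
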
@@ -238,7 +295,10 @@ def get_processings_by_status(status, period=None, processing_ids=[], locking=Fa query = query.filter(models.Processing.substatus.in_(status)) else: query = query.filter(models.Processing.status.in_(status)) - query = query.filter(models.Processing.next_poll_at <= datetime.datetime.utcnow()) + if new_poll: + query = query.filter(models.Processing.updated_at + models.Processing.new_poll_period <= datetime.datetime.utcnow()) + if update_poll: + query = query.filter(models.Processing.updated_at + models.Processing.update_poll_period <= datetime.datetime.utcnow()) if processing_ids: query = query.filter(models.Processing.processing_id.in_(processing_ids)) @@ -249,9 +309,9 @@ def get_processings_by_status(status, period=None, processing_ids=[], locking=Fa if submitter: query = query.filter(models.Processing.submitter == submitter) - if for_poller: - query = query.order_by(asc(models.Processing.poller_updated_at)) - elif locking_for_update: + # if for_poller: + # query = query.order_by(asc(models.Processing.poller_updated_at)) + if locking_for_update: query = query.with_for_update(skip_locked=True) else: query = query.order_by(asc(models.Processing.updated_at)) @@ -291,6 +351,10 @@ def update_processing(processing_id, parameters, session=None): """ try: + if 'new_poll_period' in parameters and type(parameters['new_poll_period']) not in [datetime.timedelta]: + parameters['new_poll_period'] = datetime.timedelta(seconds=parameters['new_poll_period']) + if 'update_poll_period' in parameters and type(parameters['update_poll_period']) not in [datetime.timedelta]: + parameters['update_poll_period'] = datetime.timedelta(seconds=parameters['update_poll_period']) parameters['updated_at'] = datetime.datetime.utcnow() if 'status' in parameters and parameters['status'] in [ProcessingStatus.Finished, ProcessingStatus.Failed, diff --git a/main/lib/idds/orm/requests.py b/main/lib/idds/orm/requests.py index 9c801ed1..8b092a8a 100644 --- a/main/lib/idds/orm/requests.py +++ b/main/lib/idds/orm/requests.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 - 2020 +# - Wen Guan, , 2019 - 2022 """ @@ -31,6 +31,8 @@ def create_request(scope=None, name=None, requester=None, request_type=None, username=None, userdn=None, transform_tag=None, status=RequestStatus.New, locking=RequestLocking.Idle, priority=0, lifetime=None, workload_id=None, request_metadata=None, + new_poll_period=1, update_poll_period=10, + new_retries=0, update_retries=0, max_new_retries=3, max_update_retries=0, processing_metadata=None): """ Create a request. 
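update_processing (and the matching request and transform update helpers further down) now accepts the poll periods either as plain seconds or as a timedelta and normalises them before writing. A small hedged example; the processing id is illustrative.

    # Hedged sketch: both spellings end up as an Interval in the database.
    import datetime
    from idds.orm.processings import update_processing

    update_processing(4321, {'update_poll_period': 300})                             # seconds
    update_processing(4321, {'update_poll_period': datetime.timedelta(minutes=5)})   # equivalent
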
@@ -79,7 +81,15 @@ def create_request(scope=None, name=None, requester=None, request_type=None, transform_tag=transform_tag, status=status, locking=locking, priority=priority, workload_id=workload_id, expired_at=expired_at, + new_retries=new_retries, update_retries=update_retries, + max_new_retries=max_new_retries, max_update_retries=max_update_retries, request_metadata=request_metadata, processing_metadata=processing_metadata) + if new_poll_period: + new_poll_period = datetime.timedelta(seconds=new_poll_period) + new_request.new_poll_period = new_poll_period + if update_poll_period: + update_poll_period = datetime.timedelta(seconds=update_poll_period) + new_request.update_poll_period = update_poll_period return new_request @@ -88,6 +98,8 @@ def add_request(scope=None, name=None, requester=None, request_type=None, username=None, userdn=None, transform_tag=None, status=RequestStatus.New, locking=RequestLocking.Idle, priority=0, lifetime=None, workload_id=None, request_metadata=None, + new_poll_period=1, update_poll_period=10, + new_retries=0, update_retries=0, max_new_retries=3, max_update_retries=0, processing_metadata=None, session=None): """ Add a request. @@ -116,6 +128,10 @@ def add_request(scope=None, name=None, requester=None, request_type=None, username=username, userdn=userdn, transform_tag=transform_tag, status=status, locking=locking, priority=priority, workload_id=workload_id, lifetime=lifetime, + new_poll_period=new_poll_period, + update_poll_period=update_poll_period, + new_retries=new_retries, update_retries=update_retries, + max_new_retries=max_new_retries, max_update_retries=max_update_retries, request_metadata=request_metadata, processing_metadata=processing_metadata) new_request.save(session=session) request_id = new_request.request_id @@ -205,6 +221,46 @@ def get_request(request_id, to_json=False, session=None): raise exceptions.NoObject('request request_id: %s cannot be found: %s' % (request_id, error)) +@read_session +def get_request_by_id_status(request_id, status=None, locking=False, session=None): + """ + Get a request or raise a NoObject exception. + + :param request_id: The id of the request. + :param status: request status. + :param locking: the locking status. + + :param session: The database session in use. + + :raises NoObject: If no request is founded. + + :returns: Request. 
+ """ + + try: + query = session.query(models.Request).with_hint(models.Request, "INDEX(REQUESTS REQUESTS_PK)", 'oracle')\ + .filter(models.Request.request_id == request_id) + + if status: + if not isinstance(status, (list, tuple)): + status = [status] + if len(status) == 1: + status = [status[0], status[0]] + query = query.filter(models.Request.status.in_(status)) + + if locking: + query = query.filter(models.Request.locking == RequestLocking.Idle) + query = query.with_for_update(skip_locked=True) + + ret = query.first() + if not ret: + return None + else: + return ret.to_dict() + except sqlalchemy.orm.exc.NoResultFound as error: + raise exceptions.NoObject('request request_id: %s cannot be found: %s' % (request_id, error)) + + @read_session def get_requests(request_id=None, workload_id=None, with_detail=False, with_metadata=False, with_request=False, with_transform=False, with_processing=False, to_json=False, session=None): @@ -646,7 +702,7 @@ def get_requests_by_requester(scope, name, requester, to_json=False, session=Non @transactional_session def get_requests_by_status_type(status, request_type=None, time_period=None, request_ids=[], locking=False, locking_for_update=False, bulk_size=None, to_json=False, by_substatus=False, - only_return_id=False, session=None): + new_poll=False, update_poll=False, only_return_id=False, session=None): """ Get requests. @@ -680,12 +736,13 @@ def get_requests_by_status_type(status, request_type=None, time_period=None, req query = query.filter(models.Request.substatus.in_(status)) else: query = query.filter(models.Request.status.in_(status)) - query = query.filter(models.Request.next_poll_at <= datetime.datetime.utcnow()) + if new_poll: + query = query.filter(models.Request.updated_at + models.Request.new_poll_period <= datetime.datetime.utcnow()) + if update_poll: + query = query.filter(models.Request.updated_at + models.Request.update_poll_period <= datetime.datetime.utcnow()) if request_type is not None: query = query.filter(models.Request.request_type == request_type) - # if time_period is not None: - # query = query.filter(models.Request.updated_at < datetime.datetime.utcnow() - datetime.timedelta(seconds=time_period)) if request_ids: query = query.filter(models.Request.request_id.in_(request_ids)) if locking: @@ -696,6 +753,7 @@ def get_requests_by_status_type(status, request_type=None, time_period=None, req else: query = query.order_by(asc(models.Request.updated_at))\ .order_by(desc(models.Request.priority)) + if bulk_size: query = query.limit(bulk_size) @@ -731,6 +789,11 @@ def update_request(request_id, parameters, session=None): try: parameters['updated_at'] = datetime.datetime.utcnow() + if 'new_poll_period' in parameters and type(parameters['new_poll_period']) not in [datetime.timedelta]: + parameters['new_poll_period'] = datetime.timedelta(seconds=parameters['new_poll_period']) + if 'update_poll_period' in parameters and type(parameters['update_poll_period']) not in [datetime.timedelta]: + parameters['update_poll_period'] = datetime.timedelta(seconds=parameters['update_poll_period']) + if 'request_metadata' in parameters and 'workflow' in parameters['request_metadata']: workflow = parameters['request_metadata']['workflow'] diff --git a/main/lib/idds/orm/transforms.py b/main/lib/idds/orm/transforms.py index e5154b37..f2e259aa 100644 --- a/main/lib/idds/orm/transforms.py +++ b/main/lib/idds/orm/transforms.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 - 2020 +# - Wen Guan, , 2019 - 2022 """ 
@@ -29,6 +29,8 @@ def create_transform(request_id, workload_id, transform_type, transform_tag=None, priority=0, status=TransformStatus.New, substatus=TransformStatus.New, locking=TransformLocking.Idle, + new_poll_period=1, update_poll_period=10, + new_retries=0, update_retries=0, max_new_retries=3, max_update_retries=0, retries=0, expired_at=None, transform_metadata=None): """ Create a transform. @@ -50,14 +52,24 @@ def create_transform(request_id, workload_id, transform_type, transform_tag=None transform_tag=transform_tag, priority=priority, status=status, substatus=substatus, locking=locking, retries=retries, expired_at=expired_at, + new_retries=new_retries, update_retries=update_retries, + max_new_retries=max_new_retries, max_update_retries=max_update_retries, transform_metadata=transform_metadata) + if new_poll_period: + new_poll_period = datetime.timedelta(seconds=new_poll_period) + new_transform.new_poll_period = new_poll_period + if update_poll_period: + update_poll_period = datetime.timedelta(seconds=update_poll_period) + new_transform.update_poll_period = update_poll_period return new_transform @transactional_session def add_transform(request_id, workload_id, transform_type, transform_tag=None, priority=0, status=TransformStatus.New, substatus=TransformStatus.New, locking=TransformLocking.Idle, - retries=0, expired_at=None, transform_metadata=None, workprogress_id=None, session=None): + new_poll_period=1, update_poll_period=10, retries=0, expired_at=None, + new_retries=0, update_retries=0, max_new_retries=3, max_update_retries=0, + transform_metadata=None, workprogress_id=None, session=None): """ Add a transform. @@ -82,6 +94,10 @@ def add_transform(request_id, workload_id, transform_type, transform_tag=None, p transform_tag=transform_tag, priority=priority, status=status, substatus=substatus, locking=locking, retries=retries, expired_at=expired_at, + new_poll_period=new_poll_period, + update_poll_period=update_poll_period, + new_retries=new_retries, update_retries=update_retries, + max_new_retries=max_new_retries, max_update_retries=max_update_retries, transform_metadata=transform_metadata) new_transform.save(session=session) transform_id = new_transform.transform_id @@ -166,6 +182,46 @@ def get_transform(transform_id, to_json=False, session=None): raise error +@read_session +def get_transform_by_id_status(transform_id, status=None, locking=False, session=None): + """ + Get a transform or raise a NoObject exception. + + :param transform_id: The id of the transform. + :param status: request status. + :param locking: the locking status. + + :param session: The database session in use. + + :raises NoObject: If no request is founded. + + :returns: Transform. 
+ """ + + try: + query = session.query(models.Transform).with_hint(models.Transform, "INDEX(TRANSFORMS TRANSFORMS_PK)", 'oracle')\ + .filter(models.Transform.transform_id == transform_id) + + if status: + if not isinstance(status, (list, tuple)): + status = [status] + if len(status) == 1: + status = [status[0], status[0]] + query = query.filter(models.Transform.status.in_(status)) + + if locking: + query = query.filter(models.Transform.locking == TransformLocking.Idle) + query = query.with_for_update(skip_locked=True) + + ret = query.first() + if not ret: + return None + else: + return ret.to_dict() + except sqlalchemy.orm.exc.NoResultFound as error: + raise exceptions.NoObject('transform transform_id: %s cannot be found: %s' % (transform_id, error)) + + @read_session def get_transforms_with_input_collection(transform_type, transform_tag, coll_scope, coll_name, to_json=False, session=None): """ @@ -290,7 +346,8 @@ def get_transforms(request_id=None, workload_id=None, transform_id=None, @transactional_session def get_transforms_by_status(status, period=None, transform_ids=[], locking=False, locking_for_update=False, - bulk_size=None, to_json=False, by_substatus=False, only_return_id=False, session=None): + bulk_size=None, to_json=False, by_substatus=False, only_return_id=False, + new_poll=False, update_poll=False, session=None): """ Get transforms or raise a NoObject exception. @@ -322,7 +379,10 @@ def get_transforms_by_status(status, period=None, transform_ids=[], locking=Fals query = query.filter(models.Transform.substatus.in_(status)) else: query = query.filter(models.Transform.status.in_(status)) - query = query.filter(models.Transform.next_poll_at <= datetime.datetime.utcnow()) + if new_poll: + query = query.filter(models.Transform.updated_at + models.Transform.new_poll_period <= datetime.datetime.utcnow()) + if update_poll: + query = query.filter(models.Transform.updated_at + models.Transform.update_poll_period <= datetime.datetime.utcnow()) if transform_ids: query = query.filter(models.Transform.transform_id.in_(transform_ids)) @@ -373,6 +433,12 @@ def update_transform(transform_id, parameters, session=None): """ try: parameters['updated_at'] = datetime.datetime.utcnow() + + if 'new_poll_period' in parameters and type(parameters['new_poll_period']) not in [datetime.timedelta]: + parameters['new_poll_period'] = datetime.timedelta(seconds=parameters['new_poll_period']) + if 'update_poll_period' in parameters and type(parameters['update_poll_period']) not in [datetime.timedelta]: + parameters['update_poll_period'] = datetime.timedelta(seconds=parameters['update_poll_period']) + if 'status' in parameters and parameters['status'] in [TransformStatus.Finished, TransformStatus.Finished.value, TransformStatus.Failed, TransformStatus.Failed.value]: parameters['finished_at'] = datetime.datetime.utcnow() diff --git a/main/lib/idds/rest/v1/app.py b/main/lib/idds/rest/v1/app.py index 1329a74f..a53bfc7a 100644 --- a/main/lib/idds/rest/v1/app.py +++ b/main/lib/idds/rest/v1/app.py @@ -83,10 +83,10 @@ def generate_failed_auth_response(exc_msg=None): def before_request_auth(): - print("envs") - print(flask.request.environ) - print("headers") - print(flask.request.headers) + # print("envs") + # print(flask.request.environ) + # print("headers") + # print(flask.request.headers) auth_type = flask.request.headers.get('X-IDDS-Auth-Type', default='x509_proxy') vo = flask.request.headers.get('X-IDDS-Auth-VO', default=None) if auth_type in ['x509_proxy']: diff --git a/main/lib/idds/rest/v1/hyperparameteropt.py 
b/main/lib/idds/rest/v1/hyperparameteropt.py index cadb93d3..6ca23e10 100644 --- a/main/lib/idds/rest/v1/hyperparameteropt.py +++ b/main/lib/idds/rest/v1/hyperparameteropt.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2019 +# - Wen Guan, , 2019 - 2022 import json from traceback import format_exc @@ -63,7 +63,9 @@ def put(self, workload_id, request_id, id, loss): content_id = content['content_id'] point = content['path'] param, origin_loss = json.loads(point) - params = {'path': json.dumps((param, loss)), 'substatus': ContentStatus.Available} + params = {'path': json.dumps((param, loss)), + 'status': ContentStatus.Available, + 'substatus': ContentStatus.Available} catalog.update_content(content_id, params) except exceptions.NoObject as error: return self.generate_http_response(HTTP_STATUS_CODE.NotFound, exc_cls=error.__class__.__name__, exc_msg=error) diff --git a/main/lib/idds/rest/v1/messages.py b/main/lib/idds/rest/v1/messages.py index 6685fc87..26639d58 100644 --- a/main/lib/idds/rest/v1/messages.py +++ b/main/lib/idds/rest/v1/messages.py @@ -15,8 +15,10 @@ from idds.common import exceptions from idds.common.constants import (HTTP_STATUS_CODE, MessageType, MessageStatus, - MessageSource, MessageDestination) + MessageSource, MessageDestination, + CommandType, RequestStatus) from idds.common.utils import json_loads +from idds.core.commands import add_command from idds.core.requests import get_requests from idds.core.messages import add_message, retrieve_messages from idds.rest.v1.controller import IDDSController @@ -114,15 +116,24 @@ def post(self, request_id, workload_id): msg = self.get_request().data and json_loads(self.get_request().data) # command = msg['command'] # parameters = msg['parameters'] - add_message(msg_type=MessageType.IDDSCommunication, - status=MessageStatus.New, - destination=MessageDestination.Clerk, - source=MessageSource.Rest, - request_id=request_id, - workload_id=workload_id, - transform_id=None, - num_contents=1, - msg_content=msg) + if 'command' in msg and msg['command'] in ['update_request', 'update_processing']: + status = msg['parameters']['status'] + if status in [RequestStatus.ToCancel, RequestStatus.ToSuspend]: + add_command(request_id=request_id, cmd_type=CommandType.AbortRequest, + cmd_content=None) + elif status in [RequestStatus.ToResume]: + add_command(request_id=request_id, cmd_type=CommandType.ResumeRequest, + cmd_content=None) + else: + add_message(msg_type=MessageType.IDDSCommunication, + status=MessageStatus.New, + destination=MessageDestination.Clerk, + source=MessageSource.Rest, + request_id=request_id, + workload_id=workload_id, + transform_id=None, + num_contents=1, + msg_content=msg) except exceptions.DuplicatedObject as error: return self.generate_http_response(HTTP_STATUS_CODE.Conflict, exc_cls=error.__class__.__name__, exc_msg=error) diff --git a/main/lib/idds/rest/v1/requests.py b/main/lib/idds/rest/v1/requests.py index 810ca398..cf5f81ef 100644 --- a/main/lib/idds/rest/v1/requests.py +++ b/main/lib/idds/rest/v1/requests.py @@ -14,13 +14,16 @@ from flask import Blueprint from idds.common import exceptions +from idds.common.authentication import authenticate_is_super_user from idds.common.constants import HTTP_STATUS_CODE from idds.common.constants import RequestStatus from idds.common.constants import (MessageType, MessageStatus, - MessageSource, MessageDestination) + MessageSource, MessageDestination, + CommandType) from idds.common.utils import json_loads from idds.core.requests import 
add_request, get_requests
 from idds.core.messages import add_message
+from idds.core.commands import add_command
 from idds.rest.v1.controller import IDDSController
 from idds.rest.v1.utils import convert_old_req_2_workflow_req
@@ -119,7 +122,7 @@ def put(self, request_id):
             username = self.get_username()
             reqs = get_requests(request_id=request_id, with_request=True)
             for req in reqs:
-                if req['username'] and req['username'] != username:
+                if req['username'] and req['username'] != username and not authenticate_is_super_user(username):
                     raise exceptions.AuthenticationNoPermission("User %s has no permission to update request %s" % (username, req['request_id']))
         except exceptions.AuthenticationNoPermission as error:
             return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=error.__class__.__name__, exc_msg=error)
@@ -213,6 +216,129 @@ def post_test(self):
         pprint.pprint(self.get_request().url_rule)


+class RequestAbort(IDDSController):
+    """ Abort Request. """
+
+    def put(self, request_id, workload_id=None, task_id=None):
+        """ Abort the request.
+        HTTP Success:
+            200 OK
+        HTTP Error:
+            400 Bad request
+            404 Not Found
+            500 Internal Error
+        """
+        if request_id == 'null':
+            request_id = None
+        if workload_id == 'null':
+            workload_id = None
+        if task_id == 'null':
+            task_id = None
+
+        try:
+            username = self.get_username()
+            if task_id:
+                reqs = get_requests(request_id=request_id, workload_id=workload_id, with_processing=True)
+            else:
+                reqs = get_requests(request_id=request_id, workload_id=workload_id, with_request=True)
+
+            if not reqs:
+                return self.generate_http_response(HTTP_STATUS_CODE.OK, data={'status': -1, 'message': 'No match requests'})
+            matched_transform_id = None
+            if task_id:
+                for req in reqs:
+                    if str(req['processing_workload_id']) == str(task_id):
+                        matched_transform_id = req['transform_id']
+                if not matched_transform_id:
+                    return self.generate_http_response(HTTP_STATUS_CODE.OK, data={'status': -1, 'message': 'No match tasks'})
+
+            for req in reqs:
+                if req['username'] and req['username'] != username and not authenticate_is_super_user(username):
+                    msg = "User %s has no permission to update request %s" % (username, req['request_id'])
+                    # raise exceptions.AuthenticationNoPermission(msg)
+                    return self.generate_http_response(HTTP_STATUS_CODE.OK, data={'status': -1, 'message': msg})
+        except exceptions.AuthenticationNoPermission as error:
+            return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=error.__class__.__name__, exc_msg=error)
+        except Exception as error:
+            print(error)
+            print(format_exc())
+            return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__, exc_msg=error)
+
+        try:
+            cmd_content = None
+            if task_id and matched_transform_id:
+                cmd_content = {'task_id': task_id,
+                               'transform_id': matched_transform_id}
+
+            add_command(request_id=request_id, cmd_type=CommandType.AbortRequest,
+                        workload_id=workload_id, cmd_content=cmd_content,
+                        username=username)
+
+        except exceptions.NoObject as error:
+            return self.generate_http_response(HTTP_STATUS_CODE.NotFound, exc_cls=error.__class__.__name__, exc_msg=error)
+        except exceptions.IDDSException as error:
+            return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=error.__class__.__name__, exc_msg=error)
+        except Exception as error:
+            print(error)
+            print(format_exc())
+            return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__, exc_msg=error)
+
+        return
self.generate_http_response(HTTP_STATUS_CODE.OK, data={'status': 0, 'message': 'Command registered successfully'}) + + +class RequestRetry(IDDSController): + """ Retry Request. """ + + def put(self, request_id, workload_id=None): + """ Retry the request. + HTTP Success: + 200 OK + HTTP Error: + 400 Bad request + 404 Not Found + 500 Internal Error + """ + + if request_id == 'null': + request_id = None + if workload_id == 'null': + workload_id = None + + try: + username = self.get_username() + reqs = get_requests(request_id=request_id, workload_id=workload_id, with_request=True) + if not reqs: + return self.generate_http_response(HTTP_STATUS_CODE.OK, data={'status': -1, 'message': 'No match requests'}) + + for req in reqs: + if req['username'] and req['username'] != username and not authenticate_is_super_user(username): + msg = "User %s has no permission to update request %s" % (username, req['request_id']) + # raise exceptions.AuthenticationNoPermission(msg) + return self.generate_http_response(HTTP_STATUS_CODE.OK, data={'status': -1, 'message': msg}) + except exceptions.AuthenticationNoPermission as error: + return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=error.__class__.__name__, exc_msg=error) + except Exception as error: + print(error) + print(format_exc()) + return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__, exc_msg=error) + + try: + add_command(request_id=request_id, cmd_type=CommandType.ResumeRequest, + workload_id=workload_id, cmd_content=None, + username=username) + + except exceptions.NoObject as error: + return self.generate_http_response(HTTP_STATUS_CODE.NotFound, exc_cls=error.__class__.__name__, exc_msg=error) + except exceptions.IDDSException as error: + return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=error.__class__.__name__, exc_msg=error) + except Exception as error: + print(error) + print(format_exc()) + return self.generate_http_response(HTTP_STATUS_CODE.InternalError, exc_cls=exceptions.CoreException.__name__, exc_msg=error) + + return self.generate_http_response(HTTP_STATUS_CODE.OK, data={'status': 0, 'message': 'Command registered successfully'}) + + """---------------------- Web service url maps ----------------------""" @@ -228,4 +354,12 @@ def get_blueprint(): bp.add_url_rule('/request////', view_func=request_view, methods=['get', ]) bp.add_url_rule('/request/////', view_func=request_view, methods=['get', ]) bp.add_url_rule('/request//////', view_func=request_view, methods=['get', ]) + + request_abort = RequestAbort.as_view('request_abort') + bp.add_url_rule('/request/abort//', view_func=request_abort, methods=['put', ]) + bp.add_url_rule('/request/abort///task_id', view_func=request_abort, methods=['put', ]) + + request_retry = RequestRetry.as_view('request_retry') + bp.add_url_rule('/request/retry//', view_func=request_retry, methods=['put', ]) + return bp diff --git a/main/lib/idds/tests/core_tests.py b/main/lib/idds/tests/core_tests.py index 5cbb787d..c4f8125d 100644 --- a/main/lib/idds/tests/core_tests.py +++ b/main/lib/idds/tests/core_tests.py @@ -110,19 +110,33 @@ def show_works(req): # reqs = get_requests(request_id=299111, with_request=True, with_detail=False, with_metadata=True) # reqs = get_requests(request_id=299235, with_request=True, with_detail=False, with_metadata=True) # reqs = get_requests(request_id=965, with_request=True, with_detail=False, with_metadata=True) -reqs = get_requests(request_id=1687, with_request=True, with_detail=False, 
with_metadata=True) +# reqs = get_requests(request_id=350695, with_request=True, with_detail=False, with_metadata=True) + +""" +reqs = get_requests(request_id=350723, with_request=True, with_detail=False, with_metadata=True) for req in reqs: # print(req['request_id']) + print(req) # print(rets) - print(json_dumps(req, sort_keys=True, indent=4)) + # print(json_dumps(req, sort_keys=True, indent=4)) # show_works(req) pass workflow = req['request_metadata']['workflow'] + print(workflow.runs.keys()) + # print(workflow.runs["1"]) + # print(json_dumps(workflow.runs["1"], sort_keys=True, indent=4)) + + print(workflow.runs["1"].works.keys()) + # print(workflow.runs["1"].works["7bdcf871"]) + # print(json_dumps(workflow.runs["1"].works["7bdcf871"], indent=4)) + print(workflow.runs["1"].works["7bdcf871"].runs.keys()) + print(json_dumps(workflow.runs["1"].works["7bdcf871"].runs["2"], indent=4)) if hasattr(workflow, 'get_relation_map'): # print(json_dumps(workflow.get_relation_map(), sort_keys=True, indent=4)) pass sys.exit(0) +""" """ # reqs = get_requests() @@ -138,17 +152,22 @@ def show_works(req): """ -""" -# tfs = get_transforms(request_id=241) -tfs = get_transforms(transform_id=176320) + +tfs = get_transforms(request_id=350723) +# tfs = get_transforms(transform_id=350723) for tf in tfs: # print(tf) # print(tf['transform_metadata']['work'].to_dict()) - print(json_dumps(tf, sort_keys=True, indent=4)) + # print(tf) + # print(json_dumps(tf, sort_keys=True, indent=4)) + print(tf['request_id'], tf['workload_id']) + print(tf['transform_metadata']['work_name']) + print(tf['transform_metadata']['work'].num_run) + print(tf['transform_metadata']['work'].task_name) pass sys.exit(0) -""" + """ msgs = retrieve_messages(workload_id=25972557) @@ -169,8 +188,8 @@ def show_works(req): sys.exit(0) """ -# prs = get_processings(request_id=219) -prs = get_processings(transform_id=176320) +prs = get_processings(request_id=350723) +# prs = get_processings(transform_id=350723) i = 0 for pr in prs: # if pr['request_id'] == 91: diff --git a/main/lib/idds/tests/test_migrate_requests.py b/main/lib/idds/tests/test_migrate_requests.py index 9a134b7c..3d0c939a 100644 --- a/main/lib/idds/tests/test_migrate_requests.py +++ b/main/lib/idds/tests/test_migrate_requests.py @@ -17,6 +17,10 @@ from idds.client.clientmanager import ClientManager from idds.common.utils import json_dumps # noqa F401 from idds.rest.v1.utils import convert_old_req_2_workflow_req +from idds.common.utils import setup_logging + + +setup_logging("idds.log") def migrate(): @@ -31,25 +35,40 @@ def migrate(): # doma google doma_google_host = 'https://34.133.138.229:443/idds' # noqa F841 + cm1 = ClientManager(host=atlas_host) cm1 = ClientManager(host=doma_host) # reqs = cm1.get_requests(request_id=290) # old_request_id = 298163 - old_request_id = 1685 + # old_request_id = 350723 + old_request_id = 359383 + # old_request_id = 349 + old_request_id = 2400 + # for old_request_id in [152]: # for old_request_id in [60]: # noqa E115 # for old_request_id in [200]: # noqa E115 for old_request_id in [old_request_id]: # noqa E115 # doma 183 reqs = cm1.get_requests(request_id=old_request_id, with_metadata=True) + cm2 = ClientManager(host=dev_host) cm2 = ClientManager(host=doma_host) # print(reqs) print("num requests: %s" % len(reqs)) for req in reqs[:1]: # print(req) + # workflow = req['request_metadata']['workflow'] + # print(json_dumps(workflow, sort_keys=True, indent=4)) + req = convert_old_req_2_workflow_req(req) workflow = req['request_metadata']['workflow'] 
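            # A sketch of the migration flow in this loop: the old-format request is
            # converted into a workflow-based request, the workflow object is pulled
            # out of the request metadata, its runtime state is cleared with
            # clean_works(), and it is then resubmitted to the target iDDS instance
            # via cm2.submit(workflow) further below.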
workflow.clean_works() + + # for old idds version + t_works = workflow.template.works + if not t_works and hasattr(workflow, 'works_template'): + workflow.template.works = workflow.works_template + # print(json_dumps(workflow)) # print(json_dumps(workflow, sort_keys=True, indent=4)) req_id = cm2.submit(workflow) diff --git a/main/lib/idds/tests/test_workflow_condition_v2.py b/main/lib/idds/tests/test_workflow_condition_v2.py index e799152e..8b9989cc 100644 --- a/main/lib/idds/tests/test_workflow_condition_v2.py +++ b/main/lib/idds/tests/test_workflow_condition_v2.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2021 +# - Wen Guan, , 2021 - 2022 """ @@ -1279,6 +1279,49 @@ def test_workflow_subloopworkflow3(self): assert(works == []) assert(workflow.is_terminated() is True) + def test_custom_condition(self): + work1 = Work(executable='/bin/hostname', arguments=None, sandbox=None, work_id=1) + work1.add_custom_condition(key="to_continue", value=True) + assert(work1.get_custom_condition_status() is False) + # output_data will be set based on the outputs of jobs. + work1.output_data = {'to_continue': True} + assert(work1.get_custom_condition_status() is True) + + def test_workflow_subloopworkflow4(self): + work1 = Work(executable='/bin/hostname', arguments=None, sandbox=None, work_id=1) + work2 = Work(executable='/bin/hostname', arguments=None, sandbox=None, work_id=2) + + workflow1 = Workflow() + workflow1.add_work(work1, initial=False) + workflow1.add_work(work2, initial=False) + + work2.add_custom_condition(key="to_continue", value=True) + cond = Condition(cond=work2.get_custom_condition_status) + workflow1.add_loop_condition(cond) + + work3 = Work(executable='/bin/hostname', arguments=None, sandbox=None, work_id=3) + cond1 = Condition(cond=work3.is_finished, true_work=workflow1) + + workflow = Workflow() + workflow.add_work(work3, initial=False) + workflow.add_work(workflow1, initial=False) + workflow.add_condition(cond1) + + works = workflow.get_new_works() + works.sort(key=lambda x: x.work_id) + assert(works == [work3]) + # assert(workflow.num_run == 1) + + for work in works: + # if work.work_id == 3: + work.transforming = True + work.status = WorkStatus.Failed + + works = workflow.get_new_works() + works.sort(key=lambda x: x.work_id) + assert(works == []) + assert(workflow.is_terminated() is True) + def test_workflow_subloopworkflow_reload(self): work1 = Work(executable='/bin/hostname', arguments=None, sandbox=None, work_id=1) work2 = Work(executable='/bin/hostname', arguments=None, sandbox=None, work_id=2) diff --git a/main/tools/env/environment.yml b/main/tools/env/environment.yml index 2a964de1..ee34fd3f 100644 --- a/main/tools/env/environment.yml +++ b/main/tools/env/environment.yml @@ -23,7 +23,10 @@ dependencies: - recommonmark # use Markdown with Sphinx - sphinx-rtd-theme # sphinx readthedoc theme - nevergrad # nevergrad hyper parameter optimization - - psycopg2-binary + - psycopg2-binary + - pyjwt + - cryptography + - redis - idds-common==0.11.5 - idds-workflow==0.11.5 - - idds-client==0.11.5 \ No newline at end of file + - idds-client==0.11.5 diff --git a/main/tools/env/install_idds.sh b/main/tools/env/install_idds.sh index ad0ad4c7..eab6057b 100644 --- a/main/tools/env/install_idds.sh +++ b/main/tools/env/install_idds.sh @@ -8,4 +8,4 @@ # Authors: # - Wen Guan, , 2019 -python setup.py install --old-and-unmanageable +python setup.py install --old-and-unmanageable --force diff --git a/main/tools/env/install_idds_full.sh 
b/main/tools/env/install_idds_full.sh index 6887f6df..05b38c64 100644 --- a/main/tools/env/install_idds_full.sh +++ b/main/tools/env/install_idds_full.sh @@ -96,3 +96,8 @@ sphinx-apidoc -f -o ./source/codes/doma/ ../doma/lib/idds yum install fetch-crl.noarch yum install lcg-CA + + +yum install redis +systemctl start redis +systemctl enable redis diff --git a/main/tools/env/setup_dev.sh b/main/tools/env/setup_dev.sh index 1c2fe9e7..bdc87d14 100644 --- a/main/tools/env/setup_dev.sh +++ b/main/tools/env/setup_dev.sh @@ -25,7 +25,8 @@ conda activate $CondaDir #export PYTHONPATH=${IDDS_HOME}/lib:$PYTHONPATH export RUCIO_HOME=$RootDir -export RUCIO_ACCOUNT=ddmadmin +#export RUCIO_ACCOUNT=ddmadmin +export RUCIO_ACCOUNT=wguan export X509_USER_PROXY=/tmp/x509up_u23959 # export PYTHONPATH=$PYTHONPATH:/cvmfs/atlas.cern.ch/repo/ATLASLocalRootBase/x86_64/rucio-clients/current/lib/python3.6/site-packages/ diff --git a/main/tools/env/setup_panda.sh b/main/tools/env/setup_panda.sh index 21aa246f..0c201024 100644 --- a/main/tools/env/setup_panda.sh +++ b/main/tools/env/setup_panda.sh @@ -5,6 +5,9 @@ if [ "$#" -eq 1 ]; then instance=$1 fi +export X509_USER_PROXY=/afs/cern.ch/user/w/wguan/workdisk/iDDS/test/x509up +export RUCIO_ACCOUNT=pilot + export PANDA_BEHIND_REAL_LB=true # export PANDA_SYS=/opt/idds/ @@ -15,6 +18,15 @@ if [ "$instance" == "k8s" ]; then export PANDAMON_URL=https://panda-doma.cern.ch export PANDA_AUTH_VO=panda_dev + # export PANDA_CONFIG_ROOT=/afs/cern.ch/user/w/wguan/workdisk/iDDS/main/etc/panda/ + export PANDA_CONFIG_ROOT=~/.panda/ +elif [ "$instance" == "slac" ]; then + export PANDA_AUTH=oidc + export PANDA_URL_SSL=https://rubin-panda-server-dev.slac.stanford.edu:443/server/panda + export PANDA_URL=http://rubin-panda-server-dev.slac.stanford.edu:80/server/panda + export PANDAMON_URL=https://rubin-panda-bigmon-dev.slac.stanford.edu + export PANDA_AUTH_VO=Rubin + # export PANDA_CONFIG_ROOT=/afs/cern.ch/user/w/wguan/workdisk/iDDS/main/etc/panda/ export PANDA_CONFIG_ROOT=~/.panda/ else @@ -23,7 +35,10 @@ else export PANDA_URL=http://pandaserver-doma.cern.ch:25080/server/panda export PANDAMON_URL=https://panda-doma.cern.ch export PANDA_AUTH_VO=panda_dev + + export PANDACACHE_URL=$PANDA_URL_SSL + export PANDA_SYS=/afs/cern.ch/user/w/wguan/workdisk/iDDS/.conda/iDDS/ # export PANDA_CONFIG_ROOT=/afs/cern.ch/user/w/wguan/workdisk/iDDS/main/etc/panda/ export PANDA_CONFIG_ROOT=~/.panda/ fi diff --git a/main/tools/orm/create_database.py b/main/tools/orm/create_database.py index e6e75aba..3c6bdf8e 100644 --- a/main/tools/orm/create_database.py +++ b/main/tools/orm/create_database.py @@ -13,7 +13,7 @@ create the database. """ -from ess.orm.utils import build_database +from idds.orm.base.utils import build_database if __name__ == '__main__': diff --git a/main/tools/orm/destory_database.py b/main/tools/orm/destory_database.py index d90dfa37..05df2f80 100644 --- a/main/tools/orm/destory_database.py +++ b/main/tools/orm/destory_database.py @@ -14,7 +14,7 @@ """ -from ess.orm.utils import destory_everything +from idds.orm.base.utils import destory_everything if __name__ == '__main__': diff --git a/main/tools/orm/dump_db_schema.py b/main/tools/orm/dump_db_schema.py index 982e27f5..7b9cf6c7 100644 --- a/main/tools/orm/dump_db_schema.py +++ b/main/tools/orm/dump_db_schema.py @@ -13,7 +13,7 @@ dump the database schema. 
""" -from ess.orm.utils import dump_schema +from idds.orm.base.utils import dump_schema if __name__ == '__main__': diff --git a/monitor/data/conf.js b/monitor/data/conf.js index 78fee6d3..69dede90 100644 --- a/monitor/data/conf.js +++ b/monitor/data/conf.js @@ -1,9 +1,9 @@ var appConfig = { - 'iddsAPI_request': "https://lxplus8s18.cern.ch:443/idds/monitor_request/null/null", - 'iddsAPI_transform': "https://lxplus8s18.cern.ch:443/idds/monitor_transform/null/null", - 'iddsAPI_processing': "https://lxplus8s18.cern.ch:443/idds/monitor_processing/null/null", - 'iddsAPI_request_detail': "https://lxplus8s18.cern.ch:443/idds/monitor/null/null/true/false/false", - 'iddsAPI_transform_detail': "https://lxplus8s18.cern.ch:443/idds/monitor/null/null/false/true/false", - 'iddsAPI_processing_detail': "https://lxplus8s18.cern.ch:443/idds/monitor/null/null/false/false/true" + 'iddsAPI_request': "https://lxplus8s05.cern.ch:443/idds/monitor_request/null/null", + 'iddsAPI_transform': "https://lxplus8s05.cern.ch:443/idds/monitor_transform/null/null", + 'iddsAPI_processing': "https://lxplus8s05.cern.ch:443/idds/monitor_processing/null/null", + 'iddsAPI_request_detail': "https://lxplus8s05.cern.ch:443/idds/monitor/null/null/true/false/false", + 'iddsAPI_transform_detail': "https://lxplus8s05.cern.ch:443/idds/monitor/null/null/false/true/false", + 'iddsAPI_processing_detail': "https://lxplus8s05.cern.ch:443/idds/monitor/null/null/false/false/true" } diff --git a/requirements.yaml b/requirements.yaml index 57a376c0..38b56e69 100644 --- a/requirements.yaml +++ b/requirements.yaml @@ -23,4 +23,7 @@ dependencies: - recommonmark # use Markdown with Sphinx - sphinx-rtd-theme # sphinx readthedoc theme - nevergrad # nevergrad hyper parameter optimization - - psycopg2-binary + - psycopg2-binary + - pyjwt + - cryptography + - redis diff --git a/workflow/lib/idds/workflow/base.py b/workflow/lib/idds/workflow/base.py index 24fd2198..16ea905c 100644 --- a/workflow/lib/idds/workflow/base.py +++ b/workflow/lib/idds/workflow/base.py @@ -38,6 +38,7 @@ def get_item(self, key, default): class Base(DictClass): def __init__(self): self.metadata = IDDSMetadata() + self.origin_metadata = None pass def add_metadata_item(self, key, value): diff --git a/workflow/lib/idds/workflow/datawork.py b/workflow/lib/idds/workflow/datawork.py new file mode 100644 index 00000000..0b7a955d --- /dev/null +++ b/workflow/lib/idds/workflow/datawork.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2022 + +from .work import Work + + +class DataWork(Work): + def __init__(self, executable=None, arguments=None, parameters=None, setup=None, work_type=None, + work_tag=None, exec_type='local', sandbox=None, request_id=None, work_id=None, work_name=None, + primary_input_collection=None, other_input_collections=None, input_collections=None, + primary_output_collection=None, other_output_collections=None, output_collections=None, + log_collections=None, release_inputs_after_submitting=False, username=None, + agent_attributes=None, is_template=False, + logger=None): + super(DataWork, self).__init__(executable=executable, arguments=arguments, + parameters=parameters, setup=setup, work_type=work_type, + exec_type=exec_type, sandbox=sandbox, work_id=work_id, + primary_input_collection=primary_input_collection, + other_input_collections=other_input_collections, + primary_output_collection=primary_output_collection, + other_output_collections=other_output_collections, + input_collections=input_collections, + output_collections=output_collections, + log_collections=log_collections, + agent_attributes=agent_attributes, + logger=logger) diff --git a/workflow/lib/idds/workflow/processingwork.py b/workflow/lib/idds/workflow/processingwork.py new file mode 100644 index 00000000..da90c91e --- /dev/null +++ b/workflow/lib/idds/workflow/processingwork.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2022 + +from idds.common.constants import TransformType + +from .work import Work + + +class ProcessingWork(Work): + def __init__(self, executable=None, arguments=None, parameters=None, setup=None, work_type=None, + work_tag=None, exec_type='local', sandbox=None, request_id=None, work_id=None, work_name=None, + primary_input_collection=None, other_input_collections=None, input_collections=None, + primary_output_collection=None, other_output_collections=None, output_collections=None, + log_collections=None, release_inputs_after_submitting=False, username=None, + agent_attributes=None, is_template=False, + logger=None): + super(ProcessingWork, self).__init__(executable=executable, arguments=arguments, + parameters=parameters, setup=setup, work_type=TransformType.Processing, + exec_type=exec_type, sandbox=sandbox, work_id=work_id, + primary_input_collection=primary_input_collection, + other_input_collections=other_input_collections, + primary_output_collection=primary_output_collection, + other_output_collections=other_output_collections, + input_collections=input_collections, + output_collections=output_collections, + log_collections=log_collections, + agent_attributes=agent_attributes, + logger=logger) diff --git a/workflow/lib/idds/workflow/utils.py b/workflow/lib/idds/workflow/utils.py new file mode 100644 index 00000000..dcc033dd --- /dev/null +++ b/workflow/lib/idds/workflow/utils.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2021 + + +def show_relation_map(relation_map, level=0): + # a workflow with a list of works. + if level == 0: + prefix = "" + else: + prefix = " " * level * 4 + + for item in relation_map: + if type(item) in [dict]: + # it's a Work + print("%s%s" % (prefix, item['work']['workload_id'])) + if 'next_works' in item: + # print("%s%s next_works:" % (prefix, item['work']['workload_id'])) + next_works = item['next_works'] + # it's a list. + show_relation_map(next_works, level=level + 1) + elif type(item) in [list]: + # it's a subworkflow with a list of works. + print("%ssubworkflow:" % (prefix)) + show_relation_map(next_works, level=level + 1) diff --git a/workflow/lib/idds/workflow/version.py b/workflow/lib/idds/workflow/version.py index 7c2e8610..a09efc7f 100644 --- a/workflow/lib/idds/workflow/version.py +++ b/workflow/lib/idds/workflow/version.py @@ -9,4 +9,4 @@ # - Wen Guan, , 2019 - 2021 -release_version = "0.11.5" +release_version = "0.7.7" diff --git a/workflow/lib/idds/workflow/work.py b/workflow/lib/idds/workflow/work.py index b4d8965a..aa266f37 100644 --- a/workflow/lib/idds/workflow/work.py +++ b/workflow/lib/idds/workflow/work.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2020 +# - Wen Guan, , 2020 - 2021 import copy import datetime @@ -19,6 +19,7 @@ from idds.common import exceptions from idds.common.constants import (WorkStatus, ProcessingStatus, CollectionStatus, CollectionType) +from idds.common.constants import get_work_status_from_transform_processing_status from idds.common.utils import setup_logging from idds.common.utils import str_to_date # from idds.common.utils import json_dumps @@ -63,6 +64,11 @@ def __init__(self, scope=None, name=None, coll_type=CollectionType.Dataset, coll self.status = CollectionStatus.New self.substatus = CollectionStatus.New + self.total_files = 0 + self.processed_files = 0 + self.processing_files = 0 + self.bytes = 0 + @property def internal_id(self): return self.get_metadata_item('internal_id') @@ -81,31 +87,40 @@ def coll_id(self, value): @property def status(self): - return self.get_metadata_item('status', CollectionStatus.New) + st = self.get_metadata_item('status', CollectionStatus.New) + if type(st) in [int]: + st = CollectionStatus(st) + return st @status.setter def status(self, value): - self.add_metadata_item('status', value) + self.add_metadata_item('status', value.value if value else value) if self.collection: self.collection['status'] = value @property def coll_type(self): - return self.get_metadata_item('coll_type', CollectionType.Dataset) + st = self.get_metadata_item('coll_type', CollectionType.Dataset) + if type(st) in [int]: + st = CollectionType(st) + return st @coll_type.setter def coll_type(self, value): - self.add_metadata_item('coll_type', value) + self.add_metadata_item('coll_type', value.value if value else value) if self.collection: self.collection['coll_type'] = value @property def substatus(self): - return self.get_metadata_item('substatus', CollectionStatus.New) + st = self.get_metadata_item('substatus', CollectionStatus.New) + if type(st) in [int]: + st = CollectionStatus(st) + return st @substatus.setter def substatus(self, value): - self.add_metadata_item('substatus', value) + self.add_metadata_item('substatus', value.value if value else value) if self.collection: self.collection['substatus'] = value @@ -125,6 +140,14 @@ def collection(self, 
value): self.status = self._collection['status'] self.substatus = self._collection['substatus'] + self.total_files = self._collection['total_files'] + self.processed_files = self._collection['processed_files'] + self.processing_files = self._collection['processing_files'] + self.bytes = self._collection['bytes'] + + def to_origin_dict(self): + return {'scope': self.scope, 'name': self.name} + class Processing(Base): @@ -140,6 +163,7 @@ def __init__(self, processing_metadata={}): self.processing = None self.internal_id = str(uuid.uuid4())[:8] + self.task_name = None self.processing_id = None self.workload_id = None self.status = ProcessingStatus.New @@ -155,6 +179,8 @@ def __init__(self, processing_metadata={}): self.operation_time = datetime.datetime.utcnow() self.submitted_at = None + self.username = None + self.external_id = None self.errors = None @@ -191,21 +217,27 @@ def get_workload_id(self): @property def status(self): - return self.get_metadata_item('status', ProcessingStatus.New) + st = self.get_metadata_item('status', ProcessingStatus.New) + if type(st) in [int]: + st = ProcessingStatus(st) + return st @status.setter def status(self, value): - self.add_metadata_item('status', value) + self.add_metadata_item('status', value.value if value else value) if self.processing: self.processing['status'] = value @property def substatus(self): - return self.get_metadata_item('substatus', ProcessingStatus.New) + st = self.get_metadata_item('substatus', ProcessingStatus.New) + if type(st) in [int]: + st = ProcessingStatus(st) + return st @substatus.setter def substatus(self, value): - self.add_metadata_item('substatus', value) + self.add_metadata_item('substatus', value.value if value else value) if self.processing: self.processing['substatus'] = value @@ -348,6 +380,22 @@ def external_id(self): def external_id(self, value): self.add_metadata_item('external_id', value) + @property + def old_external_id(self): + return self.get_metadata_item('old_external_id', []) + + @old_external_id.setter + def old_external_id(self, value): + self.add_metadata_item('old_external_id', value) + + @property + def task_name(self): + return self.get_metadata_item('task_name', None) + + @task_name.setter + def task_name(self, value): + self.add_metadata_item('task_name', value) + @property def processing(self): return self._processing @@ -379,9 +427,10 @@ def has_new_updates(self): class Work(Base): def __init__(self, executable=None, arguments=None, parameters=None, setup=None, work_type=None, - work_tag=None, exec_type='local', sandbox=None, work_id=None, work_name=None, - primary_input_collection=None, other_input_collections=None, - output_collections=None, log_collections=None, release_inputs_after_submitting=False, + work_tag=None, exec_type='local', sandbox=None, request_id=None, work_id=None, work_name=None, + primary_input_collection=None, other_input_collections=None, input_collections=None, + primary_output_collection=None, other_output_collections=None, output_collections=None, + log_collections=None, release_inputs_after_submitting=False, username=None, agent_attributes=None, is_template=False, logger=None): """ @@ -400,6 +449,14 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, :param output_collections: List of the output collections. # :param workflow: The workflow the current work belongs to. 
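        :param input_collections: List form of the input collections; mutually exclusive
            with (primary_input_collection, other_input_collections). The first entry is
            used as the primary input collection, the rest as the other input collections.
        :param primary_output_collection: The primary output collection.
        :param other_output_collections: List of the other output collections; mutually
            exclusive with output_collections in the same way as the inputs.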
""" + self._collections = {} + self._primary_input_collection = None + self._primary_output_collection = None + self._other_input_collections = [] + self._other_output_collections = [] + + self._processings = {} + super(Work, self).__init__() self.internal_id = str(uuid.uuid4())[:8] @@ -418,10 +475,12 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, self.arguments = arguments self.parameters = parameters + self.username = username self.work_type = work_type self.work_tag = work_tag self.exec_type = exec_type self.sandbox = sandbox + self.request_id = request_id self.work_id = work_id self.work_name = work_name if not self.work_name: @@ -430,20 +489,45 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, self.transforming = False self.workdir = None - self.collections = {} - self.primary_input_collection = None - self.other_input_collections = [] - self.output_collections = [] self.log_collections = [] + if input_collections and (primary_input_collection or other_input_collections): + raise Exception("input_collections and (primary_input_collection, other_input_collections) cannot be used at the same time.") + if output_collections and (primary_output_collection or other_output_collections): + raise Exception("output_collections and (primary_output_collection, other_output_collections) cannot be used at the same time.") + + if input_collections and type(input_collections) not in [list, tuple]: + input_collections = [input_collections] + if output_collections and type(output_collections) not in [list, tuple]: + output_collections = [output_collections] + + if input_collections: + primary_input_collection = input_collections[0] + if len(input_collections) > 1: + other_input_collections = input_collections[1:] + if output_collections: + primary_output_collection = output_collections[0] + if len(output_collections) > 1: + other_output_collections = output_collections[1:] + # self.primary_input_collection = primary_input_collection self.set_primary_input_collection(primary_input_collection) + self.set_primary_output_collection(primary_output_collection) + # self.other_input_collections = other_input_collections if other_input_collections and type(other_input_collections) not in [list, tuple]: other_input_collections = [other_input_collections] self.add_other_input_collections(other_input_collections) - if output_collections and type(output_collections) not in [list, tuple]: - output_collections = [output_collections] - self.add_output_collections(output_collections) + if other_output_collections and type(other_output_collections) not in [list, tuple]: + other_output_collections = [other_output_collections] + self.add_other_output_collections(other_output_collections) + + # if input_collections and type(input_collections) not in [list, tuple]: + # input_collections = [input_collections] + # self.add_input_collections(input_collections) + # if output_collections and type(output_collections) not in [list, tuple]: + # output_collections = [output_collections] + # self.add_output_collections(output_collections) + if log_collections and type(log_collections) not in [list, tuple]: log_collections = [log_collections] self.add_log_collections(log_collections) @@ -466,7 +550,7 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, self.suspended_processings = [] self.old_processings = [] self.terminated_msg = "" - self.output_data = None + self.output_data = {} self.parameters_for_next_task = None 
self.status_statistics = {} @@ -489,7 +573,12 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, self.backup_to_release_inputs = {'0': [], '1': [], '2': []} - self.num_run = None + self.num_run = 0 + + self.or_custom_conditions = {} + self.and_custom_conditions = {} + + self.sliced_global_parameters = None """ self._running_data_names = [] @@ -504,6 +593,11 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, self._running_data_names.append(name) """ + def get_logger(self): + if self.logger is None: + self.logger = self.setup_logger() + return self.logger + def get_class_name(self): return self.__class__.__name__ @@ -540,6 +634,14 @@ def workload_id(self): def workload_id(self, value): self.add_metadata_item('workload_id', value) + @property + def external_id(self): + return self.get_metadata_item('external_id', None) + + @external_id.setter + def external_id(self, value): + self.add_metadata_item('external_id', value) + def get_workload_id(self): return self.workload_id @@ -567,6 +669,18 @@ def parameters(self): def parameters(self, value): self.add_metadata_item('parameters', value) + @property + def output_data(self): + return self.get_metadata_item('output_data', {}) + + @output_data.setter + def output_data(self, value): + self.add_metadata_item('output_data', value) + if value and type(value) in [dict]: + for key in value: + new_key = "user_" + str(key) + setattr(self, new_key, value[key]) + @property def work_id(self): return self.get_metadata_item('work_id', None) @@ -575,6 +689,14 @@ def work_id(self): def work_id(self, value): self.add_metadata_item('work_id', value) + @property + def parent_workload_id(self): + return self.get_metadata_item('parent_workload_id', None) + + @parent_workload_id.setter + def parent_workload_id(self, value): + self.add_metadata_item('parent_workload_id', value) + @property def transforming(self): return self.get_metadata_item('transforming', False) @@ -617,7 +739,10 @@ def started(self, value): @property def status(self): - return self.get_metadata_item('status', WorkStatus.New) + st = self.get_metadata_item('status', WorkStatus.New) + if type(st) in [int]: + st = WorkStatus(st) + return st @status.setter def status(self, value): @@ -628,15 +753,18 @@ def status(self, value): WorkStatus.Failed, WorkStatus.Running]: self.transforming = True - self.add_metadata_item('status', value) + self.add_metadata_item('status', value.value if value else value) @property def substatus(self): - return self.get_metadata_item('substatus', WorkStatus.New) + st = self.get_metadata_item('substatus', WorkStatus.New) + if type(st) in [int]: + st = WorkStatus(st) + return st @substatus.setter def substatus(self, value): - self.add_metadata_item('substatus', value) + self.add_metadata_item('substatus', value.value if value else value) @property def polling_retries(self): @@ -656,7 +784,7 @@ def errors(self, value): @property def next_works(self): - return self.get_metadata_item('next_works', 0) + return self.get_metadata_item('next_works', []) @next_works.setter def next_works(self, value): @@ -689,7 +817,9 @@ def processings(self, value): for k in self._processings: proc = self._processings[k] if type(proc) in [Processing]: - proc_metadata[k] = {'processing_id': proc.processing_id} + proc_metadata[k] = {'processing_id': proc.processing_id, + 'workload_id': proc.workload_id, + 'external_id': proc.external_id} self.add_metadata_item('processings', proc_metadata) def refresh_work(self): @@ -706,31 +836,41 @@ def 
refresh_work(self): for k in self._processings: proc = self._processings[k] if type(proc) in [Processing]: - proc_metadata[k] = {'processing_id': proc.processing_id} + proc_metadata[k] = {'processing_id': proc.processing_id, + 'workload_id': proc.workload_id, + 'external_id': proc.external_id} self.add_metadata_item('processings', proc_metadata) def load_work(self): coll_metadata = self.get_metadata_item('collections', {}) - for k in coll_metadata: - if k in self._collections: + for k in self._collections: + if k in coll_metadata: coll_id = coll_metadata[k]['coll_id'] self._collections[k].coll_id = coll_id - else: - self._collections[k] = Collection(scope=None, name=None) - coll_id = coll_metadata[k]['coll_id'] - self._collections[k].coll_id = coll_id - self._collections[k].internal_id = k proc_metadata = self.get_metadata_item('processings', {}) - for k in proc_metadata: - if k in self._processings: + for k in self._processings: + if k in proc_metadata: proc_id = proc_metadata[k]['processing_id'] self._processings[k].processing_id = proc_id - else: + if 'workload_id' in proc_metadata[k] and proc_metadata[k]['workload_id']: + self._processings[k].workload_id = proc_metadata[k]['workload_id'] + self.workload_id = proc_metadata[k]['workload_id'] + if 'external_id' in proc_metadata[k] and proc_metadata[k]['external_id']: + self._processings[k].external_id = proc_metadata[k]['external_id'] + self.external_id = proc_metadata[k]['external_id'] + for k in proc_metadata: + if k not in self._processings: self._processings[k] = Processing(processing_metadata={}) proc_id = proc_metadata[k]['processing_id'] self._processings[k].processing_id = proc_id self._processings[k].internal_id = k + if 'workload_id' in proc_metadata[k] and proc_metadata[k]['workload_id']: + self._processings[k].workload_id = proc_metadata[k]['workload_id'] + self.workload_id = proc_metadata[k]['workload_id'] + if 'external_id' in proc_metadata[k] and proc_metadata[k]['external_id']: + self._processings[k].external_id = proc_metadata[k]['external_id'] + self.external_id = proc_metadata[k]['external_id'] def load_metadata(self): self.load_work() @@ -837,6 +977,98 @@ def to_update_processings(self): def to_update_processings(self, value): self.add_metadata_item('to_update_processings', value) + @property + def num_run(self): + return self.get_metadata_item('num_run', 0) + + @num_run.setter + def num_run(self, value): + self.add_metadata_item('num_run', value) + if value is not None and value > 1: + # for k in self._collections: + for coll in self.output_collections: + if type(coll) in [Collection]: + if "___idds___" not in coll.name: + coll.name = coll.name + "." 
+ str(value) + + @property + def primary_input_collection(self): + if self._primary_input_collection: + return self.collections[self._primary_input_collection] + return None + + @primary_input_collection.setter + def primary_input_collection(self, value): + if type(value) in [str] and len(value) == 8: + # local value from old idds version + self._primary_input_collection = value + else: + self.set_primary_input_collection(value) + + @property + def primary_output_collection(self): + if self._primary_output_collection: + return self.collections[self._primary_output_collection] + return None + + @primary_output_collection.setter + def primary_output_collection(self, value): + if type(value) in [str] and len(value) == 8: + # local value from old idds version + self._primary_output_collection = value + else: + self.set_primary_output_collection(value) + + @property + def input_collections(self): + if self._primary_input_collection: + keys = [self._primary_input_collection] + self._other_input_collections + else: + keys = self._other_input_collections + return [self.collections[k] for k in keys] + + @input_collections.setter + def input_collections(self, value): + if value and type(value) not in [list, tuple]: + value = [value] + + if value: + primary_collection = value[0] + other_collections = [] + if len(value) > 1: + other_collections = value[1:] + + self.set_primary_input_collection(primary_collection) + + if other_collections and type(other_collections) not in [list, tuple]: + other_collections = [other_collections] + self.add_other_input_collections(other_collections) + + @property + def output_collections(self): + if self._primary_output_collection: + keys = [self._primary_output_collection] + self._other_output_collections + else: + keys = self._other_output_collections + return [self.collections[k] for k in keys] + + @output_collections.setter + def output_collections(self, value): + if value and type(value) not in [list, tuple]: + value = [value] + + if value: + primary_collection = value[0] + other_collections = [] + if len(value) > 1: + other_collections = value[1:] + + self.set_primary_output_collection(primary_collection) + + if other_collections and type(other_collections) not in [list, tuple]: + other_collections = [other_collections] + self.add_other_output_collections(other_collections) + def set_work_name(self, work_name): self.work_name = work_name @@ -846,11 +1078,127 @@ def get_work_name(self): def get_is_template(self): self.is_template + def sync_global_parameters(self, global_parameters, sliced_global_parameters=None): + if sliced_global_parameters: + self.sliced_global_parameters = sliced_global_parameters + + if global_parameters: + for key in global_parameters: + sliced_index = None + sliced_name = None + if self.sliced_global_parameters and key in self.sliced_global_parameters: + sliced_index = self.sliced_global_parameters[key]['index'] + sliced_name = self.sliced_global_parameters[key]['name'] + if type(global_parameters[key]) in [list, tuple] and sliced_index < len(global_parameters[key]): + pass + else: + sliced_index = None + if not sliced_name: + sliced_name = key + + if sliced_index is None: + setattr(self, sliced_name, global_parameters[key]) + else: + setattr(self, sliced_name, global_parameters[key][sliced_index]) + + def get_global_parameter_from_output_data(self, key): + self.logger.debug("get_global_parameter_from_output_data, key: %s, output_data: %s" % (key, str(self.output_data))) + gp_output_data = {} + if self.output_data and type(self.output_data) 
in [dict]: + for key in self.output_data: + new_key = "user_" + str(key) + gp_output_data[new_key] = self.output_data[key] + if key in gp_output_data: + return True, gp_output_data[key] + else: + return False, None + + def renew_parameters_from_attributes(self): + pass + + def add_custom_condition(self, key, value, op='and'): + # op in ['and', 'or'] + if op and op == 'or': + op = 'or' + else: + op = 'and' + if op == 'and': + self.and_custom_conditions[key] = value + else: + self.or_custom_conditions[key] = value + + def get_custom_condition_status_value_bool(self, key): + user_key = "user_" + key + if hasattr(self, user_key): + key = user_key + + if hasattr(self, key) and getattr(self, key): + value = getattr(self, key) + if type(value) in [str]: + value = value.lower() + if value == 'true': + return True + else: + return False + elif type(value) in [bool]: + return value + elif type(value) in [int]: + if value > 0: + return True + else: + return False + else: + return value + else: + return False + + def get_custom_condition_status_value(self, key): + if self.output_data and key in self.output_data: + return self.output_data[key] + + user_key = "user_" + key + if hasattr(self, user_key): + key = user_key + + if hasattr(self, key) and getattr(self, key): + return getattr(self, key) + else: + return None + + def get_custom_condition_status_real(self): + if self.or_custom_conditions: + for key in self.or_custom_conditions: + value = self.get_custom_condition_status_value(key) + if value == self.or_custom_conditions[key]: + return True + + if self.and_custom_conditions: + for key in self.and_custom_conditions: + value = self.get_custom_condition_status_value(key) + if not (value == self.and_custom_conditions[key]): + return False + return True + + return False + + def get_custom_condition_status(self): + # self.logger.debug("get_custom_condition_status, or_custom_conditions: %s" % str(self.or_custom_conditions)) + # self.logger.debug("get_custom_condition_status, and_custom_conditions: %s" % str(self.and_custom_conditions)) + # self.logger.debug("get_custom_condition_status, work: %s" % (json_dumps(self, sort_keys=True, indent=4))) + + status = self.get_custom_condition_status_real() + self.logger.debug("get_custom_condition_status, status: %s" % (status)) + return status + + def get_not_custom_condition_status(self): + return not self.get_custom_condition_status() + def setup_logger(self): """ Setup logger """ self.logger = logging.getLogger(self.get_class_name()) + return self.logger def add_errors(self, error): self.errors.append(error) @@ -860,7 +1208,7 @@ def get_errors(self): def set_work_id(self, work_id, transforming=True): """ - *** Function called by Marshaller agent. + *** Function called by Marshaller and clerk agent. *** It's the transform_id set by core_workprogresses """ self.work_id = work_id @@ -868,7 +1216,7 @@ def set_work_id(self, work_id, transforming=True): def get_work_id(self): """ - *** Function called by Marshaller agent. + *** Function called by Marshaller and clerk agent. 
""" return self.work_id @@ -895,7 +1243,7 @@ def clean_work(self): self.suspended_processings = [] self.old_processings = [] self.terminated_msg = "" - self.output_data = None + self.output_data = {} self.parameters_for_next_task = None def set_agent_attributes(self, attrs, req_attributes=None): @@ -957,12 +1305,6 @@ def __eq__(self, obj): def __hash__(self): return self.work_id - """ - def to_dict(self): - return {key: value for key, value - in self.__dict__.items() if not key.startswith('_')} - """ - def __str__(self): return str(self.to_dict()) @@ -980,6 +1322,11 @@ def get_work_tag(self): def set_parameters(self, parameters): self.parameters = parameters + for p in self.parameters: + if self.parameters[p] is not None and hasattr(self, p): + # fp = getattr(self, p) + # fp = self.parameters[p] # noqa F841 + setattr(self, p, self.parameters[p]) def get_parameters(self): return self.parameters @@ -990,6 +1337,9 @@ def set_arguments(self, arguments): def get_arguments(self): return self.arguments + def get_ancestry_works(self): + return [] + def has_to_release_inputs(self): if self.backup_to_release_inputs['0'] or self.backup_to_release_inputs['1'] or self.backup_to_release_inputs['2']: return True @@ -1010,7 +1360,7 @@ def is_started(self): return self.started def is_running(self): - if self.status in [WorkStatus.Running]: + if self.status in [WorkStatus.Running, WorkStatus.Transforming]: return True return False @@ -1072,7 +1422,9 @@ def is_suspended(self): return False def add_next_work(self, work): - self.next_works.append(work) + next_works = self.next_works + next_works.append(work) + self.next_works = next_works def parse_arguments(self): try: @@ -1166,15 +1518,38 @@ def add_collection_to_collections(self, coll): def set_primary_input_collection(self, coll): if coll: - collection = self.add_collection_to_collections(coll) - self.primary_input_collection = collection.internal_id + if type(coll) in [str] and len(coll) == 8: + # local value from old idds version + # load value submitted from old idds version + self._primary_input_collection = coll + else: + collection = self.add_collection_to_collections(coll) + self._primary_input_collection = collection.internal_id def get_primary_input_collection(self): """ *** Function called by Marshaller agent. """ - if self.primary_input_collection: - return self.collections[self.primary_input_collection] + if self._primary_input_collection: + return self.collections[self._primary_input_collection] + return None + + def set_primary_output_collection(self, coll): + if coll: + if type(coll) in [str] and len(coll) == 8: + # local value from old idds version + # load value submitted from old idds version + self._primary_output_collection = coll + else: + collection = self.add_collection_to_collections(coll) + self._primary_output_collection = collection.internal_id + + def get_primary_output_collection(self): + """ + *** Function called by Marshaller agent. 
+ """ + if self._primary_output_collection: + return self.collections[self._primary_output_collection] return None def add_other_input_collections(self, colls): @@ -1185,18 +1560,47 @@ def add_other_input_collections(self, colls): for coll in colls: collection = self.add_collection_to_collections(coll) - self.other_input_collections.append(collection.internal_id) + self._other_input_collections.append(collection.internal_id) def get_other_input_collections(self): - return [self.collections[k] for k in self.other_input_collections] + return [self.collections[k] for k in self._other_input_collections] + + def add_other_output_collections(self, colls): + if not colls: + return + if type(colls) not in [list, tuple]: + colls = [colls] + + for coll in colls: + collection = self.add_collection_to_collections(coll) + self._other_output_collections.append(collection.internal_id) + + def get_other_output_collections(self): + return [self.collections[k] for k in self._other_output_collections] + + def get_input_collections(self, poll_externel=False): + """ + *** Function called by Transformer agent. + """ + if self._primary_input_collection: + keys = [self._primary_input_collection] + self._other_input_collections + else: + keys = self._other_input_collections + return [self.collections[k] for k in keys] - def get_input_collections(self): + def get_output_collections(self): """ *** Function called by Transformer agent. """ - keys = [self.primary_input_collection] + self.other_input_collections + if self._primary_output_collection: + keys = [self._primary_output_collection] + self._other_output_collections + else: + keys = self._other_output_collections return [self.collections[k] for k in keys] + def get_collections(self): + return [self.collections[k] for k in self.collections.keys()] + def is_input_collections_closed(self): colls = self.get_input_collections() for coll in colls: @@ -1220,6 +1624,9 @@ def get_internal_collections(self, coll): return [] return [] + def poll_external_collection(self, coll): + return coll + def poll_internal_collection(self, coll): try: if coll.status in [CollectionStatus.Closed]: @@ -1263,7 +1670,7 @@ def get_internal_input_contents(self, coll): """ Get all input contents from iDDS collections. 
""" - coll = self.collections[self.primary_input_collection] + coll = self.collections[self._primary_input_collection] internal_colls = self.get_internal_collection(coll) internal_coll_ids = [coll.coll_id for coll in internal_colls] if internal_coll_ids: @@ -1287,12 +1694,18 @@ def add_output_collections(self, colls): if type(colls) not in [list, tuple]: colls = [colls] - for coll in colls: - collection = self.add_collection_to_collections(coll) - self.output_collections.append(collection.internal_id) + value = colls + if value: + primary_collection = value[0] + other_collections = [] + if len(value) > 1: + other_collections = value[1:] - def get_output_collections(self): - return [self.collections[k] for k in self.output_collections] + self.set_primary_output_collection(primary_collection) + + if other_collections and type(other_collections) not in [list, tuple]: + other_collections = [other_collections] + self.add_other_output_collections(other_collections) def get_output_contents(self): pass @@ -1313,6 +1726,15 @@ def get_log_collections(self): def set_has_new_inputs(self, yes=True): self.has_new_inputs = yes + def has_dependency(self): + return False + + def get_parent_work_names(self): + return [] + + def get_parent_workload_ids(self): + return [] + def get_new_input_output_maps(self, mapped_input_output_maps={}): """ *** Function called by Transformer agent. @@ -1346,7 +1768,7 @@ def get_new_input_output_maps(self, mapped_input_output_maps={}): for ip in new_inputs: self.num_mapped_inputs += 1 out_ip = copy.deepcopy(ip) - out_ip['coll_id'] = self.collections[self.output_collections[0]]['coll_id'] + out_ip['coll_id'] = self.collections[self._primary_output_collection]['coll_id'] new_input_output_maps[next_key] = {'inputs': [ip], 'outputs': [out_ip], 'inputs_dependency': [], @@ -1622,7 +2044,7 @@ def submit_processing(self, processing): """ raise exceptions.NotImplementedException - def abort_processing(self, processing): + def abort_processing_old(self, processing): """ *** Function called by Carrier agent. """ @@ -1644,7 +2066,7 @@ def suspend_processing(self, processing): proc = processing['processing_metadata']['processing'] proc.tosuspend = True - def resume_processing(self, processing): + def resume_processing_old(self, processing): """ *** Function called by Carrier agent. """ @@ -1734,14 +2156,28 @@ def syn_work_status(self, input_output_maps, all_updates_flushed=True, output_st self.started = True self.logger.debug("syn_work_status(%s): work.status: %s" % (str(self.get_processing_ids()), str(self.status))) - def sync_work_data(self, status, substatus, work): + def sync_work_data(self, status, substatus, work, workload_id=None, output_data=None): # self.status = work.status work.work_id = self.work_id work.transforming = self.transforming - self.metadata = work.metadata + + # clerk will update next_works while transformer doesn't. + # synchronizing work metadata from transformer to clerk needs to keep it at first. 
+ next_works = self.next_works + # self.metadata = work.metadata + self.next_works = next_works self.status_statistics = work.status_statistics - self.processings = work.processings + # self.processings = work.processings + if output_data: + self.output_data = output_data + else: + self.output_data = work.output_data + + self.status = get_work_status_from_transform_processing_status(status) + self.substatus = get_work_status_from_transform_processing_status(substatus) + if workload_id: + self.workload_id = workload_id """ self.status = WorkStatus(status.value) @@ -1763,6 +2199,14 @@ def sync_work_data(self, status, substatus, work): self.suspended_processings = work.suspended_processings """ + def abort_processing(self, processing, log_prefix=''): + msg = "abort processing is not implemented" + self.logger.error(log_prefix + msg) + + def resume_processing(self, processing, log_prefix=''): + msg = "resume processing is not implemented" + self.logger.error(log_prefix + msg) + def add_proxy(self, proxy): self.proxy = proxy diff --git a/workflow/lib/idds/workflow/workflow.py b/workflow/lib/idds/workflow/workflow.py index e37fab10..b7333046 100644 --- a/workflow/lib/idds/workflow/workflow.py +++ b/workflow/lib/idds/workflow/workflow.py @@ -6,23 +6,22 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2020 +# - Wen Guan, , 2020 - 2021 import copy import datetime import logging import inspect import random -import time import uuid from idds.common import exceptions -from idds.common.constants import IDDSEnum +from idds.common.constants import IDDSEnum, WorkStatus from idds.common.utils import json_dumps, setup_logging, get_proxy from idds.common.utils import str_to_date from .base import Base -from .work import Work +from .work import Work, Collection setup_logging(__name__) @@ -47,8 +46,9 @@ def __init__(self, operator=ConditionOperator.And, conditions=[], true_works=Non super(CompositeCondition, self).__init__() - self.internal_id = str(uuid.uuid1()) + self.internal_id = str(uuid.uuid4())[:8] self.template_id = self.internal_id + # self.template_id = str(uuid.uuid4())[:8] self.logger = logger if self.logger is None: @@ -77,29 +77,6 @@ def __init__(self, operator=ConditionOperator.And, conditions=[], true_works=Non self.true_works = true_works self.false_works = false_works - # real_conditions = [] - # for cond in conditions: - # # real_conditions.append({'condition': cond, 'current_work': cond.__self__}) - # real_conditions.append(cond) - # self.conditions = real_conditions - - # real_true_works, real_false_works = [], [] - # for true_work in true_works: - # # real_true_works.append({'work': true_work, 'triggered': False}) - # real_true_works.append(true_work) - # self.true_works = real_true_works - # for false_work in false_works: - # # real_false_works.append({'work': false_work, 'triggered': False}) - # real_false_works.append(false_work) - # self.false_works = real_false_works - - # self.set_class_metadata() - - # def set_class_metadata(self): - # self.add_metadata_item('class', {'class': self.__class__.__name__, - # 'module': self.__class__.__module__}) - # self.add_metadata_item('operator', self.operator.value) - def get_class_name(self): return self.__class__.__name__ @@ -109,6 +86,27 @@ def get_internal_id(self): def get_template_id(self): return self.template_id + def copy(self): + new_cond = copy.deepcopy(self) + return new_cond + + def __deepcopy__(self, memo): + logger = self.logger + self.logger = None + + cls = self.__class__ + result = 
cls.__new__(cls) + + memo[id(self)] = result + + # Deep copy all other attributes + for k, v in self.__dict__.items(): + setattr(result, k, copy.deepcopy(v, memo)) + + self.logger = logger + result.logger = logger + return result + @property def conditions(self): # return self.get_metadata_item('true_works', []) @@ -117,17 +115,6 @@ def conditions(self): @conditions.setter def conditions(self, value): self._conditions = value - new_value = [] - for cond in value: - if cond is None: - continue - if inspect.ismethod(cond): - new_cond = {'idds_method': cond.__name__, - 'idds_method_class_id': cond.__self__.get_template_id()} - else: - new_cond = cond - new_value.append(new_cond) - self.add_metadata_item('conditions', new_value) @property def true_works(self): @@ -142,12 +129,14 @@ def true_works(self, value): if work is None: continue if isinstance(work, Work): - if work.get_template_id() not in true_work_meta: - true_work_meta[work.get_template_id()] = {'triggered': False} + if work.get_internal_id() not in true_work_meta: + true_work_meta[work.get_internal_id()] = {'triggered': False} elif isinstance(work, CompositeCondition): - if work.get_template_id() not in true_work_meta: - true_work_meta[work.get_template_id()] = {'triggered': False, - 'metadata': work.metadata} + if work.get_internal_id() not in true_work_meta: + true_work_meta[work.get_internal_id()] = {'triggered': False} + elif isinstance(work, Workflow): + if work.get_internal_id() not in true_work_meta: + true_work_meta[work.get_internal_id()] = {'triggered': False} self.add_metadata_item('true_works', true_work_meta) @property @@ -163,12 +152,14 @@ def false_works(self, value): if work is None: continue if isinstance(work, Work): - if work.get_template_id() not in false_work_meta: - false_work_meta[work.get_template_id()] = {'triggered': False} + if work.get_internal_id() not in false_work_meta: + false_work_meta[work.get_internal_id()] = {'triggered': False} elif isinstance(work, CompositeCondition): - if work.get_template_id() not in false_work_meta: - false_work_meta[work.get_template_id()] = {'triggered': False, - 'metadata': work.metadata} + if work.get_internal_id() not in false_work_meta: + false_work_meta[work.get_internal_id()] = {'triggered': False} + elif isinstance(work, Workflow): + if work.get_internal_id() not in false_work_meta: + false_work_meta[work.get_internal_id()] = {'triggered': False} self.add_metadata_item('false_works', false_work_meta) def validate_conditions(self, conditions): @@ -176,41 +167,20 @@ def validate_conditions(self, conditions): raise exceptions.IDDSException("conditions must be list") for cond in conditions: assert(inspect.ismethod(cond)) - assert(isinstance(cond.__self__, Work)) - if (cond.__self__.is_template): - raise exceptions.IDDSException("Work class for CompositeCondition must not be a template") def add_condition(self, cond): assert(inspect.ismethod(cond)) assert(isinstance(cond.__self__, Work)) - if (cond.__self__.is_template): - raise exceptions.IDDSException("Work class for CompositeCondition must not be a template") # self.conditions.append({'condition': cond, 'current_work': cond.__self__}) self._conditions.append(cond) - new_value = self.get_metadata_item('conditions', []) - if inspect.ismethod(cond): - new_cond = {'idds_method': cond.__name__, - 'idds_method_class_id': cond.__self__.get_template_id()} - else: - new_cond = cond - new_value.append(new_cond) - self.add_metadata_item('conditions', new_value) def load_metadata(self): # conditions = 
self.get_metadata_item('conditions', []) - true_works_meta = self.get_metadata_item('true_works', {}) - false_works_meta = self.get_metadata_item('false_works', {}) - - for work in self.true_works: - if isinstance(work, CompositeCondition): - if work.get_template_id() in true_works_meta: - work.metadata = true_works_meta[work.get_template_id()]['metadata'] - for work in self.false_works: - if isinstance(work, CompositeCondition): - if work.get_template_id() in false_works_meta: - work.metadata = false_works_meta[work.get_template_id()]['metadata'] + # true_works_meta = self.get_metadata_item('true_works', {}) + # false_works_meta = self.get_metadata_item('false_works', {}) + pass def to_dict(self): # print('to_dict') @@ -228,19 +198,36 @@ def to_dict(self): new_value = [] for cond in value: if inspect.ismethod(cond): - new_cond = {'idds_method': cond.__name__, - 'idds_method_class_id': cond.__self__.get_template_id()} + if isinstance(cond.__self__, Work): + new_cond = {'idds_method': cond.__name__, + 'idds_method_internal_id': cond.__self__.get_internal_id()} + elif isinstance(cond.__self__, CompositeCondition): + new_cond = {'idds_method': cond.__name__, + 'idds_method_condition': cond.__self__.to_dict()} + elif isinstance(cond.__self__, Workflow): + new_cond = {'idds_method': cond.__name__, + 'idds_method_internal_id': cond.__self__.get_internal_id()} + else: + new_cond = {'idds_method': cond.__name__, + 'idds_method_internal_id': cond.__self__.get_internal_id()} else: - new_cond = cond + if hasattr(cond, '__self__'): + new_cond = {'idds_attribute': cond.__name__, + 'idds_method_internal_id': cond.__self__.get_internal_id()} + else: + new_cond = cond new_value.append(new_cond) value = new_value elif key in ['_true_works', '_false_works']: new_value = [] for w in value: if isinstance(w, Work): - new_w = w.get_template_id() + new_w = w.get_internal_id() elif isinstance(w, CompositeCondition): new_w = w.to_dict() + elif isinstance(w, Workflow): + # new_w = w.to_dict() + new_w = w.get_internal_id() else: new_w = w new_value.append(new_w) @@ -250,44 +237,65 @@ def to_dict(self): ret['attributes'][key] = value return ret - def get_work_from_id(self, work_id, works, works_template): - for w_id in works: - if works[w_id].get_template_id() == work_id: - return works[w_id] - for w_id in works_template: - if works_template[w_id].get_template_id() == work_id: - return works_template[w_id] - return None + def get_work_from_id(self, work_id, works): + return works[work_id] - def load_conditions(self, works, works_template): + def load_conditions(self, works): new_conditions = [] for cond in self.conditions: - if 'idds_method' in cond and 'idds_method_class_id' in cond: - class_id = cond['idds_method_class_id'] - work = self.get_work_from_id(class_id, works, works_template) - if work is not None: - new_cond = getattr(work, cond['idds_method']) + if callable(cond): + new_conditions.append(cond) + else: + if 'idds_method' in cond and 'idds_method_internal_id' in cond: + self.logger.debug("idds_method_internal_id: %s" % cond['idds_method_internal_id']) + self.logger.debug("idds_method: %s" % cond['idds_method']) + + internal_id = cond['idds_method_internal_id'] + work = self.get_work_from_id(internal_id, works) + + self.logger.debug("get_work_from_id: %s: [%s]" % (internal_id, [work])) + + if work is not None: + new_cond = getattr(work, cond['idds_method']) + else: + self.logger.error("Condition method work cannot be found for %s" % (internal_id)) + new_cond = cond + elif 'idds_attribute' in cond and 
'idds_method_internal_id' in cond: + internal_id = cond['idds_method_internal_id'] + work = self.get_work_from_id(internal_id, works) + if work is not None: + new_cond = getattr(work, cond['idds_attribute']) + else: + self.logger.error("Condition attribute work cannot be found for %s" % (internal_id)) + new_cond = cond + elif 'idds_method' in cond and 'idds_method_condition' in cond: + new_cond = cond['idds_method_condition'] + new_cond = getattr(new_cond, cond['idds_method']) else: - self.logger.error("Work cannot be found for %s" % class_id) new_cond = cond - else: - new_cond = cond - new_conditions.append(new_cond) + new_conditions.append(new_cond) self.conditions = new_conditions new_true_works = [] + self.logger.debug("true_works: %s" % str(self.true_works)) + for w in self.true_works: + # self.logger.debug("true_work: %s" % str(w)) if isinstance(w, CompositeCondition): # work = w.load_conditions(works, works_template) - w.load_conditions(works, works_template) + w.load_conditions(works) + work = w + elif isinstance(w, Workflow): + work = w + elif isinstance(w, Work): work = w elif type(w) in [str]: - work = self.get_work_from_id(w, works, works_template) + work = self.get_work_from_id(w, works) if work is None: - self.logger.error("Work cannot be found for %s" % str(w)) + self.logger.error("True work cannot be found for %s" % str(w)) work = w else: - self.logger.error("Work cannot be found for %s" % str(w)) + self.logger.error("True work cannot be found for type(%s): %s" % (type(w), str(w))) work = w new_true_works.append(work) self.true_works = new_true_works @@ -296,15 +304,19 @@ def load_conditions(self, works, works_template): for w in self.false_works: if isinstance(w, CompositeCondition): # work = w.load_condtions(works, works_template) - w.load_conditions(works, works_template) + w.load_conditions(works) + work = w + elif isinstance(w, Workflow): + work = w + elif isinstance(w, Work): work = w elif type(w) in [str]: - work = self.get_work_from_id(w, works, works_template) + work = self.get_work_from_id(w, works) if work is None: - self.logger.error("Work cannot be found for %s" % str(w)) + self.logger.error("False work cannot be found for type(%s): %s" % (type(w), str(w))) work = w else: - self.logger.error("Work cannot be found for %s" % str(w)) + self.logger.error("False work cannot be found for %s" % str(w)) work = w new_false_works.append(work) self.false_works = new_false_works @@ -319,7 +331,10 @@ def all_condition_ids(self): works = [] for cond in self.conditions: if inspect.ismethod(cond): - works.append(cond.__self__.get_template_id()) + if isinstance(cond.__self__, Work) or isinstance(cond.__self__, Workflow): + works.append(cond.__self__.get_internal_id()) + elif isinstance(cond.__self__, CompositeCondition): + works = works + cond.__self__.all_condition_ids() else: self.logger.error("cond cannot be recognized: %s" % str(cond)) works.append(cond) @@ -332,7 +347,10 @@ def all_pre_works(self): works = [] for cond in self.conditions: if inspect.ismethod(cond): - works.append(cond.__self__) + if isinstance(cond.__self__, Work) or isinstance(cond.__self__, Workflow): + works.append(cond.__self__) + elif isinstance(cond.__self__, CompositeCondition): + works = works + cond.__self__.all_pre_works() else: self.logger.error("cond cannot be recognized: %s" % str(cond)) works.append(cond) @@ -395,20 +413,17 @@ def get_next_works(self, trigger=ConditionTrigger.NotTriggered): if isinstance(work, CompositeCondition): works = works + work.get_next_works(trigger=trigger) else: - if 
work.get_template_id() not in true_work_meta: - true_work_meta[work.get_template_id()] = {'triggered': False} + if work.get_internal_id() not in true_work_meta: + true_work_meta[work.get_internal_id()] = {'triggered': False} if trigger == ConditionTrigger.ToTrigger: - if not true_work_meta[work.get_template_id()]['triggered']: - true_work_meta[work.get_template_id()]['triggered'] = True - works.append(work) - elif work.get_is_template(): - # A template can be triggered many times. + if not true_work_meta[work.get_internal_id()]['triggered']: + true_work_meta[work.get_internal_id()]['triggered'] = True works.append(work) elif trigger == ConditionTrigger.NotTriggered: - if not true_work_meta[work.get_template_id()]['triggered']: + if not true_work_meta[work.get_internal_id()]['triggered']: works.append(work) elif trigger == ConditionTrigger.Triggered: - if true_work_meta[work.get_template_id()]['triggered']: + if true_work_meta[work.get_internal_id()]['triggered']: works.append(work) self.add_metadata_item('true_works', true_work_meta) else: @@ -417,20 +432,17 @@ def get_next_works(self, trigger=ConditionTrigger.NotTriggered): if isinstance(work, CompositeCondition): works = works + work.get_next_works(trigger=trigger) else: - if work.get_template_id() not in false_work_meta: - false_work_meta[work.get_template_id()] = {'triggered': False} + if work.get_internal_id() not in false_work_meta: + false_work_meta[work.get_internal_id()] = {'triggered': False} if trigger == ConditionTrigger.ToTrigger: - if not false_work_meta[work.get_template_id()]['triggered']: - false_work_meta[work.get_template_id()]['triggered'] = True - works.append(work) - elif work.get_is_template(): - # A template can be triggered many times. + if not false_work_meta[work.get_internal_id()]['triggered']: + false_work_meta[work.get_internal_id()]['triggered'] = True works.append(work) elif trigger == ConditionTrigger.NotTriggered: - if not false_work_meta[work.get_template_id()]['triggered']: + if not false_work_meta[work.get_internal_id()]['triggered']: works.append(work) elif trigger == ConditionTrigger.Triggered: - if false_work_meta[work.get_template_id()]['triggered']: + if false_work_meta[work.get_internal_id()]['triggered']: works.append(work) self.add_metadata_item('false_works', false_work_meta) return works @@ -517,18 +529,81 @@ def add_condition(self, cond): raise exceptions.IDDSException("Condition class doesn't support add_condition. To support multiple condition, please use CompositeCondition.") -class Workflow(Base): +class ParameterLink(Base): + def __init__(self, parameters): + super(ParameterLink, self).__init__() + self.parameters = {} + self.num_parameters = 0 + if parameters: + if type(parameters) not in [list, tuple]: + parameters = [parameters] + for p in parameters: + if p: + if type(p) in [str]: + self.parameters[str(self.num_parameters)] = {'source': p, 'destination': p} + self.num_parameters += 1 + elif type(p) in [dict] and 'source' in p and 'destination' in p: + self.parameters[str(self.num_parameters)] = {'source': p['source'], 'destination': p['destination']} + self.num_parameters += 1 + else: + raise Exception("Cannot parse the parameters format. 
Accepted format: list of string or dict{'source': <>, 'destination': <>}") + + self.internal_id = str(uuid.uuid4())[:8] + self.template_id = self.internal_id + + def get_internal_id(self): + return self.internal_id + + def get_parameter_value(self, work, p): + ret = None + p_f = getattr(work, p, 'None') + if p_f: + if callable(p_f): + ret = p_f() + else: + ret = p_f + else: + ret = None + if ret and type(ret) in [Collection] and hasattr(ret, 'to_origin_dict'): + ret = ret.to_origin_dict() + return ret + + def set_parameters(self, work): + p_values = {} + for p in self.parameters: + p_values[p] = self.get_parameter_value(work, self.parameters[p]['source']) + self.add_metadata_item('parameters', p_values) + + def get_parameters(self): + p_values = self.get_metadata_item('parameters', {}) + ret = {} + for p in self.parameters: + if p in p_values: + ret[self.parameters[p]['destination']] = p_values[p] + return ret + + +class WorkflowBase(Base): def __init__(self, name=None, workload_id=None, lifetime=None, pending_time=None, logger=None): """ Init a workflow. """ + self._works = {} self._conditions = {} self._work_conds = {} - super(Workflow, self).__init__() - self.internal_id = str(uuid.uuid1()) + self.parameter_links = {} + self.parameter_links_source = {} + self.parameter_links_destination = {} + + self._global_parameters = {} + + super(WorkflowBase, self).__init__() + + self.internal_id = str(uuid.uuid4())[:8] self.template_work_id = self.internal_id + # self.template_work_id = str(uuid.uuid4())[:8] self.lifetime = lifetime self.pending_time = pending_time @@ -539,18 +614,20 @@ def __init__(self, name=None, workload_id=None, lifetime=None, pending_time=None self._name = 'idds.workflow.' + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S_%f") + str(random.randint(1, 1000)) if workload_id is None: - workload_id = int(time.time()) + # workload_id = int(time.time()) + pass self.workload_id = workload_id self.logger = logger if self.logger is None: self.setup_logger() - self.works_template = {} self._works = {} self.works = {} self.work_sequence = {} # order list + self.next_works = [] + self.terminated_works = [] self.initial_works = [] # if the primary initial_work is not set, it's the first initial work. 
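The ParameterLink class introduced above only records source/destination name pairs and snapshots the source values from a work into its own metadata. A minimal sketch of exercising it in isolation (hedged: the SourceWork stand-in and the user_* names are invented for illustration; the import path is inferred from the file path of this patch):

    from idds.workflow.workflow import ParameterLink


    class SourceWork(object):
        # Minimal stand-in: ParameterLink resolves 'source' names with getattr(),
        # so any object exposing them is enough for a smoke test.
        def __init__(self):
            self.user_output_dataset = 'user.test:output.data'


    link = ParameterLink(parameters=[{'source': 'user_output_dataset',
                                      'destination': 'user_input_dataset'}])
    link.set_parameters(SourceWork())    # snapshot the source values into the link metadata
    print(link.get_parameters())         # {'user_input_dataset': 'user.test:output.data'}

In a real workflow the link is registered with add_parameter_link(work_source, [work_destination], parameter_link); the workflow later calls set_source_parameters() when the source work terminates and hands the stored values to the destination work through get_destination_parameters().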
@@ -583,6 +660,14 @@ def __init__(self, name=None, workload_id=None, lifetime=None, pending_time=None self.userdn = None self.proxy = None + self._loop_condition_position = 'end' + self.loop_condition = None + + self.num_run = None + + self.global_parameters = {} + + self.to_cancel = False """ self._running_data_names = [] for name in ['internal_id', 'template_work_id', 'workload_id', 'work_sequence', 'terminated_works', @@ -602,21 +687,11 @@ def name(self): def name(self, value): self._name = value - @property - def internal_id(self): - return self.get_metadata_item('internal_id') - - @internal_id.setter - def internal_id(self, value): - self.add_metadata_item('internal_id', value) - - @property - def template_work_id(self): - return self.get_metadata_item('template_work_id') + def get_template_work_id(self): + return self.template_work_id - @template_work_id.setter - def template_work_id(self, value): - self.add_metadata_item('template_work_id', value) + def get_template_id(self): + return self.template_work_id @property def workload_id(self): @@ -685,9 +760,18 @@ def works(self, value): if self._works: for k in self._works: work = self._works[k] - work_metadata[k] = {'internal_id': work.internal_id, - 'template_work_id': work.template_work_id, - 'work_id': work.work_id} + if isinstance(work, Workflow): + work_metadata[k] = {'type': 'workflow', + 'metadata': work.metadata} + else: + work_metadata[k] = {'type': 'work', + 'work_id': work.work_id, + 'workload_id': work.workload_id, + 'external_id': work.external_id, + 'status': work.status.value if work.status else work.status, + 'substatus': work.substatus.value if work.substatus else work.substatus, + 'next_works': work.next_works, + 'transforming': work.transforming} self.add_metadata_item('works', work_metadata) def refresh_works(self): @@ -695,31 +779,50 @@ def refresh_works(self): if self._works: for k in self._works: work = self._works[k] - work_metadata[k] = {'internal_id': work.internal_id, - 'template_work_id': work.template_work_id, - 'work_id': work.work_id, - 'transforming': work.transforming} + if isinstance(work, Workflow): + work.refresh_works() + work_metadata[k] = {'type': 'workflow', + 'metadata': work.metadata} + else: + work_metadata[k] = {'type': 'work', + 'work_id': work.work_id, + 'workload_id': work.workload_id, + 'external_id': work.external_id, + 'status': work.status.value if work.status else work.status, + 'substatus': work.substatus.value if work.substatus else work.substatus, + 'next_works': work.next_works, + 'transforming': work.transforming} if work.last_updated_at and (not self.last_updated_at or work.last_updated_at > self.last_updated_at): self.last_updated_at = work.last_updated_at self.add_metadata_item('works', work_metadata) def load_works(self): work_metadata = self.get_metadata_item('works', {}) - for k in work_metadata: - if k not in self._works: - template_id = work_metadata[k]['template_work_id'] - work_template = self.works_template[template_id] - new_work = work_template.generate_work_from_template() - new_work.work_id = work_metadata[k]['work_id'] - new_work.internal_id = work_metadata[k]['internal_id'] - self._works[k] = new_work - - self._works[k].work_id = work_metadata[k]['work_id'] - self._works[k].transforming = work_metadata[k]['transforming'] + for k in self._works: + if k in work_metadata: + if work_metadata[k]['type'] == 'work': + self._works[k].work_id = work_metadata[k]['work_id'] + self._works[k].workload_id = work_metadata[k]['workload_id'] if 'workload_id' in work_metadata[k] 
else None + self._works[k].external_id = work_metadata[k]['external_id'] if 'external_id' in work_metadata[k] else None + self._works[k].transforming = work_metadata[k]['transforming'] + self._works[k].status = WorkStatus(work_metadata[k]['status']) if work_metadata[k]['status'] else work_metadata[k]['status'] + self._works[k].substatus = WorkStatus(work_metadata[k]['substatus']) if work_metadata[k]['substatus'] else work_metadata[k]['substatus'] + self._works[k].next_works = work_metadata[k]['next_works'] if 'next_works' in work_metadata[k] else [] + elif work_metadata[k]['type'] == 'workflow': + self._works[k].metadata = work_metadata[k]['metadata'] + work = self._works[k] if work.last_updated_at and (not self.last_updated_at or work.last_updated_at > self.last_updated_at): self.last_updated_at = work.last_updated_at + @property + def next_works(self): + return self.get_metadata_item('next_works', []) + + @next_works.setter + def next_works(self, value): + self.add_metadata_item('next_works', value) + @property def conditions(self): return self._conditions @@ -744,10 +847,86 @@ def work_conds(self, value): def load_work_conditions(self): conditions_metadata = self.get_metadata_item('conditions', {}) - for cond_id in self.conditions: - self.conditions[cond_id].load_conditions(self.works, self.get_works_template()) - if cond_id in conditions_metadata: - self.conditions[cond_id].metadata = conditions_metadata[cond_id] + for cond_internal_id in self._conditions: + if cond_internal_id in conditions_metadata: + self.conditions[cond_internal_id].metadata = conditions_metadata[cond_internal_id] + if isinstance(self.conditions[cond_internal_id], Workflow): + # self.conditions[cond_internal_id].load_conditions(self.works) + pass + elif isinstance(self.conditions[cond_internal_id], Work): + # self.conditions[cond_internal_id].load_conditions(self.works) + pass + elif isinstance(self.conditions[cond_internal_id], CompositeCondition): + self.conditions[cond_internal_id].load_conditions(self.works) + pass + # work_conds = self.get_metadata_item('work_conds', {}) + # self._work_conds = work_conds + + @property + def global_parameters(self): + self._global_parameters = self.get_metadata_item('gp', {}) + return self._global_parameters + + @global_parameters.setter + def global_parameters(self, value): + self._global_parameters = value + gp_metadata = {} + if self._global_parameters: + for key in self._global_parameters: + if key.startswith("user_"): + gp_metadata[key] = self._global_parameters[key] + else: + self.logger.warn("Only parameters start with 'user_' can be set as global parameters. The parameter '%s' will be ignored." % (key)) + self.add_metadata_item('gp', gp_metadata) + + def set_global_parameters(self, value): + self.global_parameters = value + + @property + def sliced_global_parameters(self): + self._sliced_global_parameters = self.get_metadata_item('sliced_gp', {}) + return self._sliced_global_parameters + + @sliced_global_parameters.setter + def sliced_global_parameters(self, value): + self._sliced_global_parameters = value + gp_metadata = {} + if self._sliced_global_parameters: + for key in self._sliced_global_parameters: + if key.startswith("user_"): + gp_metadata[key] = self._sliced_global_parameters[key] + else: + self.logger.warn("Only parameters start with 'user_' can be set as global parameters. The parameter '%s' will be ignored." 
% (key)) + self.add_metadata_item('sliced_gp', gp_metadata) + + def set_sliced_global_parameters(self, source, name=None, index=0): + sliced_global_parameters = self.sliced_global_parameters + sliced_global_parameters[source] = {'name': name, 'index': index} + # to trigger the setter function + self.sliced_global_parameters = self.sliced_global_parameters + + def sync_global_parameters_from_work(self, work): + self.log_debug("work %s is_terminated, global_parameters: %s" % (work.get_internal_id(), str(self.global_parameters))) + if self.global_parameters: + for key in self.global_parameters: + status, value = work.get_global_parameter_from_output_data(key) + self.log_debug("work %s get_global_parameter_from_output_data(key: %s) results(%s:%s)" % (work.get_internal_id(), key, status, value)) + if status: + self.global_parameters[key] = value + elif hasattr(work, key): + self.global_parameters[key] = getattr(work, key) + self.set_global_parameters(self.global_parameters) + + @property + def loop_condition(self): + return self._loop_condition + + @loop_condition.setter + def loop_condition(self, value): + # self._loop_condition_position = position + self._loop_condition = value + if self._loop_condition: + self.add_metadata_item('loop_condition', self._loop_condition.get_condition_status()) @property def work_sequence(self): @@ -773,6 +952,14 @@ def first_initial(self): def first_initial(self, value): self.add_metadata_item('first_initial', value) + @property + def to_start_works(self): + return self.get_metadata_item('to_start_works', []) + + @to_start_works.setter + def to_start_works(self, value): + self.add_metadata_item('to_start_works', value) + @property def new_to_run_works(self): return self.get_metadata_item('new_to_run_works', []) @@ -853,6 +1040,14 @@ def last_work(self): def last_work(self, value): self.add_metadata_item('last_work', value) + @property + def init_works(self): + return self.get_metadata_item('init_works', []) + + @init_works.setter + def init_works(self, value): + self.add_metadata_item('init_works', value) + @property def to_update_transforms(self): return self.get_metadata_item('to_update_transforms', {}) @@ -861,9 +1056,26 @@ def to_update_transforms(self): def to_update_transforms(self, value): self.add_metadata_item('to_update_transforms', value) + @property + def num_run(self): + return self.get_metadata_item('num_run', 0) + + @num_run.setter + def num_run(self, value): + self.add_metadata_item('num_run', value) + + @property + def to_cancel(self): + return self.get_metadata_item('to_cancel', False) + + @to_cancel.setter + def to_cancel(self, value): + self.add_metadata_item('to_cancel', value) + def load_metadata(self): self.load_works() self.load_work_conditions() + self.load_parameter_links() def get_class_name(self): return self.__class__.__name__ @@ -908,30 +1120,65 @@ def __deepcopy__(self, memo): result.logger = logger return result - def get_works_template(self): - return self.works_template + def get_works(self): + return self.works - def add_work_template(self, work): - self.works_template[work.get_template_id()] = work - - def get_new_work_from_template(self, work_id, new_parameters=None): + def get_new_work_to_run(self, work_id, new_parameters=None): + # 1. 
initialize works # template_id = work.get_template_id() - template_id = work_id - work = self.works_template[template_id] - new_work = work.generate_work_from_template() + work = self.works[work_id] + work.workload_id = None + + if isinstance(work, Workflow): + work.parent_num_run = self.num_run + work.sync_works(to_cancel=self.to_cancel) + + work.sequence_id = self.num_total_works + + works = self.works + self.works = works + # self.work_sequence.append(new_work.get_internal_id()) + self.work_sequence[str(self.num_total_works)] = work.get_internal_id() + self.num_total_works += 1 + self.new_to_run_works.append(work.get_internal_id()) + self.last_work = work.get_internal_id() + else: + new_parameters = self.get_destination_parameters(work_id) + if new_parameters: + work.set_parameters(new_parameters) + work.sequence_id = self.num_total_works + + work.num_run = self.num_run + work.initialize_work() + work.sync_global_parameters(self.global_parameters, self.sliced_global_parameters) + work.renew_parameters_from_attributes() + if work.parent_workload_id is None and self.num_total_works > 0: + last_work_id = self.work_sequence[str(self.num_total_works - 1)] + last_work = self.works[last_work_id] + work.parent_workload_id = last_work.workload_id + last_work.add_next_work(work.get_internal_id()) + works = self.works + self.works = works + # self.work_sequence.append(new_work.get_internal_id()) + self.work_sequence[str(self.num_total_works)] = work.get_internal_id() + self.num_total_works += 1 + self.new_to_run_works.append(work.get_internal_id()) + self.last_work = work.get_internal_id() + + return work + + def get_new_parameters_for_work(self, work): + new_parameters = self.get_destination_parameters(work.get_internal_id()) if new_parameters: - new_work.set_parameters(new_parameters) - new_work.sequence_id = self.num_total_works - new_work.initialize_work() + work.set_parameters(new_parameters) + work.sequence_id = self.num_total_works + + work.initialize_work() + work.sync_global_parameters(self.global_parameters, self.sliced_global_parameters) + work.renew_parameters_from_attributes() works = self.works - works[new_work.get_internal_id()] = new_work self.works = works - # self.work_sequence.append(new_work.get_internal_id()) - self.work_sequence[str(self.num_total_works)] = new_work.get_internal_id() - self.num_total_works += 1 - self.new_to_run_works.append(new_work.get_internal_id()) - self.last_work = new_work.get_internal_id() - return new_work + return work def register_user_defined_condition(self, condition): cond_src = inspect.getsource(condition) @@ -954,49 +1201,144 @@ def set_workload_id(self, workload_id): def get_workload_id(self): return self.workload_id + def add_initial_works(self, work): + self.initial_works.append(work.get_internal_id()) + if self.primary_initial_work is None: + self.primary_initial_work = work.get_internal_id() + def add_work(self, work, initial=False, primary=False): self.first_initial = False - self.add_work_template(work) + self.works[work.get_internal_id()] = work if initial: if primary: - self.primary_initial_work = work.get_template_id() + self.primary_initial_work = work.get_internal_id() self.add_initial_works(work) - self.independent_works.append(work.get_template_id()) + self.independent_works.append(work.get_internal_id()) def add_condition(self, cond): self.first_initial = False cond_works = cond.all_works() for cond_work in cond_works: - assert(cond_work.get_template_id() in self.get_works_template()) + assert(cond_work.get_internal_id() in 
self.get_works()) - if cond.get_template_id() not in self.conditions: - conditions = self.conditions - conditions[cond.get_template_id()] = cond - self.conditions = conditions + conditions = self.conditions + conditions[cond.get_internal_id()] = cond + self.conditions = conditions # if cond.current_work not in self.work_conds: # self.work_conds[cond.current_work] = [] # self.work_conds[cond.current_work].append(cond) work_conds = self.work_conds for work in cond.all_pre_works(): - if work.get_template_id() not in work_conds: - work_conds[work.get_template_id()] = [] - work_conds[work.get_template_id()].append(cond.get_template_id()) + if work.get_internal_id() not in work_conds: + work_conds[work.get_internal_id()] = [] + work_conds[work.get_internal_id()].append(cond.get_internal_id()) self.work_conds = work_conds # if a work is a true_work or false_work of a condition, # should remove it from independent_works cond_next_works = cond.all_next_works() for next_work in cond_next_works: - if next_work.get_template_id() in self.independent_works: - self.independent_works.remove(next_work.get_template_id()) + if next_work.get_internal_id() in self.independent_works: + self.independent_works.remove(next_work.get_internal_id()) - def add_initial_works(self, work): - assert(work.get_template_id() in self.get_works_template()) - self.initial_works.append(work.get_template_id()) - if self.primary_initial_work is None: - self.primary_initial_work = work.get_template_id() + def find_workflow_from_work(self, work): + if work.get_internal_id() in self._works: + return self + else: + for k in self._works: + wk = self._works[k] + if isinstance(wk, Workflow): + wf = wk.find_workflow_from_work(work) + if wf: + return wf + return None + + def add_parameter_link(self, work_source, work_destinations, parameter_link): + wf_s = self.find_workflow_from_work(work_source) + if not wf_s: + raise Exception("Cannot find work %s in the workflow." % work_source.get_internal_id()) + if work_source.get_internal_id() not in wf_s.parameter_links_source: + wf_s.parameter_links_source[work_source.get_internal_id()] = [] + wf_s.parameter_links_source[work_source.get_internal_id()].append(parameter_link.get_internal_id()) + + if type(work_destinations) not in [list, tuple]: + work_destinations = [work_destinations] + for work_destination in work_destinations: + wf = self.find_workflow_from_work(work_destination) + if not wf: + raise Exception("Cannot find work %s in the workflow." 
% work_destination.get_internal_id()) + if parameter_link.get_internal_id() not in wf.parameter_links: + wf.parameter_links[parameter_link.get_internal_id()] = parameter_link + if work_destination.get_internal_id() not in wf.parameter_links_destination: + wf.parameter_links_destination[work_destination.get_internal_id()] = [] + wf.parameter_links_destination[work_destination.get_internal_id()].append(parameter_link.get_internal_id()) + + def find_parameter_links_from_id(self, internal_id): + rets = [] + if internal_id in self.parameter_links: + rets.append((self, self.parameter_links[internal_id])) + for k in self._works: + wk = self._works[k] + if isinstance(wk, Workflow): + links = wk.find_parameter_links_from_id(internal_id) + rets = rets + links + return rets + + def refresh_parameter_links(self): + p_metadata = {} + for internal_id in self.parameter_links: + p_metadata[internal_id] = self.parameter_links[internal_id].metadata + self.add_metadata_item('parameter_links', p_metadata) + + def get_parameter_links_metadata(self): + p_metadata = {} + for internal_id in self.parameter_links: + p_metadata[internal_id] = self.parameter_links[internal_id].metadata + self.add_metadata_item('parameter_links', p_metadata) + return p_metadata + + def set_parameter_links_metadata(self, p_links): + for internal_id in self.parameter_links: + if internal_id in p_links: + p_metadata = p_links[internal_id] + self.parameter_links[internal_id].metadata = p_metadata + + def set_source_parameters(self, internal_id): + work = self.works[internal_id] + # if type(work) in [Work]: + # print(work.work_id) + # print(internal_id) + # print(self.parameter_links_source) + if internal_id in self.parameter_links_source: + for p_id in self.parameter_links_source[internal_id]: + # print(p_id) + p_links = self.find_parameter_links_from_id(p_id) + # print(p_links) + for wf, p_link in p_links: + p_link.set_parameters(work) + wf.refresh_parameter_links() + + def get_destination_parameters(self, internal_id): + # work = self.works[internal_id] + parameters = {} + if internal_id in self.parameter_links_destination: + for p_id in self.parameter_links_destination[internal_id]: + p_link = self.parameter_links[p_id] + parameters.update(p_link.get_parameters()) + return parameters + + def load_parameter_links(self): + p_metadata = self.get_metadata_item('parameter_links', {}) + for p_id in self.parameter_links: + if p_id in p_metadata: + self.parameter_links[p_id].metadata = p_metadata[p_id] + + def add_next_work(self, work_id): + next_works = self.next_works + next_works.append(work_id) + self.next_works = next_works def enable_next_works(self, work, cond): self.log_debug("Checking Work %s condition: %s" % (work.get_internal_id(), @@ -1013,13 +1355,30 @@ def enable_next_works(self, work, cond): new_next_works = [] if next_works is not None: for next_work in next_works: - new_parameters = work.get_parameters_for_next_task() - new_next_work = self.get_new_work_from_template(next_work.get_template_id(), new_parameters) + # parameters = self.get_destination_parameters(next_work.get_internal_id()) + new_next_work = self.get_new_work_to_run(next_work.get_internal_id()) work.add_next_work(new_next_work.get_internal_id()) + new_next_work.parent_workload_id = work.workload_id # cond.add_condition_work(new_next_work) ####### TODO: new_next_works.append(new_next_work) return new_next_works + def add_loop_condition(self, condition, position='end'): + self.loop_condition_position = position + self.loop_condition = condition + + def 
has_loop_condition(self): + if self.loop_condition: + return True + return False + + def get_loop_condition_status(self): + if self.has_loop_condition(): + self.loop_condition.load_conditions(self.works) + # self.logger.debug("Loop condition %s" % (json_dumps(self.loop_condition, sort_keys=True, indent=4))) + return self.loop_condition.get_condition_status() + return False + def __str__(self): return str(json_dumps(self)) @@ -1029,8 +1388,32 @@ def get_new_works(self): new works to be ready to start """ - self.sync_works() - return [self.works[k] for k in self.new_to_run_works] + if self.to_cancel: + return [] + + self.sync_works(to_cancel=self.to_cancel) + works = [] + + if self.to_start_works: + init_works = self.init_works + to_start_works = self.to_start_works + work_id = to_start_works.pop(0) + self.to_start_works = to_start_works + self.get_new_work_to_run(work_id) + if not init_works: + init_works.append(work_id) + self.init_works = init_works + + for k in self.new_to_run_works: + if isinstance(self.works[k], Work): + self.works[k] = self.get_new_parameters_for_work(self.works[k]) + works.append(self.works[k]) + if isinstance(self.works[k], Workflow): + works = works + self.works[k].get_new_works() + for k in self.current_running_works: + if isinstance(self.works[k], Workflow): + works = works + self.works[k].get_new_works() + return works def get_current_works(self): """ @@ -1038,8 +1421,14 @@ def get_current_works(self): Current running works """ - self.sync_works() - return [self.works[k] for k in self.current_running_works] + self.sync_works(to_cancel=self.to_cancel) + works = [] + for k in self.current_running_works: + if isinstance(self.works[k], Work): + works.append(self.works[k]) + if isinstance(self.works[k], Workflow): + works = works + self.works[k].get_current_works() + return works def get_all_works(self): """ @@ -1047,8 +1436,15 @@ def get_all_works(self): Current running works """ - self.sync_works() - return [self.works[k] for k in self.works] + self.sync_works(to_cancel=self.to_cancel) + + works = [] + for k in self.works: + if isinstance(self.works[k], Work): + works.append(self.works[k]) + if isinstance(self.works[k], Workflow): + works = works + self.works[k].get_all_works() + return works def get_primary_initial_collection(self): """ @@ -1056,14 +1452,26 @@ def get_primary_initial_collection(self): """ if self.primary_initial_work: - return self.get_works_template()[self.primary_initial_work].get_primary_input_collection() + if isinstance(self.get_works()[self.primary_initial_work], Workflow): + return self.get_works()[self.primary_initial_work].get_primary_initial_collection() + else: + return self.get_works()[self.primary_initial_work].get_primary_input_collection() elif self.initial_works: - return self.get_works_template()[self.initial_works[0]].get_primary_input_collection() + if isinstance(self.get_works()[self.initial_works[0]], Workflow): + return self.get_works()[self.initial_works[0]].get_primary_initial_collection() + else: + return self.get_works()[self.initial_works[0]].get_primary_input_collection() elif self.independent_works: - return self.get_works_template()[self.independent_works[0]].get_primary_input_collection() + if isinstance(self.get_works()[self.independent_works[0]], Workflow): + return self.get_works()[self.independent_works[0]].get_primary_initial_collection() + else: + return self.get_works()[self.independent_works[0]].get_primary_input_collection() else: - keys = self.get_works_template().keys() - return 
self.get_works_template()[keys[0]].get_primary_input_collection() + keys = self.get_works().keys() + if isinstance(self.get_works()[keys[0]], Workflow): + return self.get_works()[keys[0]].get_primary_initial_collection() + else: + return self.get_works()[keys[0]].get_primary_input_collection() return None def get_dependency_works(self, work_id, depth, max_depth): @@ -1079,16 +1487,18 @@ def get_dependency_works(self, work_id, depth, max_depth): return deps def order_independent_works(self): + self.log_debug("ordering independent works") ind_work_ids = self.independent_works + self.log_debug("independent works: %s" % (str(ind_work_ids))) self.independent_works = [] self.work_dependencies = {} for ind_work_id in ind_work_ids: - work = self.works_template[ind_work_id] + work = self.works[ind_work_id] self.work_dependencies[ind_work_id] = [] for ind_work_id1 in ind_work_ids: if ind_work_id == ind_work_id1: continue - work1 = self.works_template[ind_work_id1] + work1 = self.works[ind_work_id1] if work.depend_on(work1): self.work_dependencies[ind_work_id].append(ind_work_id1) self.log_debug('work dependencies 1: %s' % str(self.work_dependencies)) @@ -1101,23 +1511,39 @@ def order_independent_works(self): self.log_debug('work dependencies 2: %s' % str(self.work_dependencies)) while True: + # self.log_debug('independent_works N: %s' % str(self.independent_works)) + # self.log_debug('work dependencies N: %s' % str(self.work_dependencies)) + has_changes = False for work_id in self.work_dependencies: if work_id not in self.independent_works and len(self.work_dependencies[work_id]) == 0: self.independent_works.append(work_id) + has_changes = True for work_id in self.independent_works: if work_id in self.work_dependencies: del self.work_dependencies[work_id] + has_changes = True for work_id in self.work_dependencies: for in_work_id in self.independent_works: if in_work_id in self.work_dependencies[work_id]: self.work_dependencies[work_id].remove(in_work_id) + has_changes = True if not self.work_dependencies: break + if not has_changes: + self.log_debug("There are loop dependencies between works.") + self.log_debug('independent_works N: %s' % str(self.independent_works)) + self.log_debug('work dependencies N: %s' % str(self.work_dependencies)) + for work_id in self.work_dependencies: + if work_id not in self.independent_works: + self.independent_works.append(work_id) + break self.log_debug('independent_works: %s' % str(self.independent_works)) + self.log_debug("ordered independent works") def first_initialize(self): # set new_to_run works if not self.first_initial: + self.log_debug("first initializing") self.first_initial = True self.order_independent_works() if self.initial_works: @@ -1125,13 +1551,19 @@ def first_initialize(self): elif self.independent_works: tostart_works = self.independent_works else: - tostart_works = list(self.get_works_template().keys()) + tostart_works = list(self.get_works().keys()) tostart_works = [tostart_works[0]] + to_start_works = self.to_start_works for work_id in tostart_works: - self.get_new_work_from_template(work_id) - - def sync_works(self): + to_start_works.append(work_id) + self.to_start_works = to_start_works + self.log_debug("first initialized") + + def sync_works(self, to_cancel=False): + if to_cancel: + self.to_cancel = to_cancel + self.log_debug("synchroning works") self.first_initialize() self.refresh_works() @@ -1146,10 +1578,18 @@ def sync_works(self): self.current_running_works.append(work.get_internal_id()) for work in [self.works[k] for k in 
self.current_running_works]: - if work.get_template_id() in self.work_conds: + if isinstance(work, Workflow): + work.sync_works(to_cancel=self.to_cancel) + + if work.is_terminated(): + self.log_debug("work %s is_terminated, sync_global_parameters_from_work" % (work.get_internal_id())) + self.set_source_parameters(work.get_internal_id()) + self.sync_global_parameters_from_work(work) + + if work.get_internal_id() in self.work_conds: self.log_debug("Work %s has condition dependencies %s" % (work.get_internal_id(), - json_dumps(self.work_conds[work.get_template_id()], sort_keys=True, indent=4))) - for cond_id in self.work_conds[work.get_template_id()]: + json_dumps(self.work_conds[work.get_internal_id()], sort_keys=True, indent=4))) + for cond_id in self.work_conds[work.get_internal_id()]: cond = self.conditions[cond_id] self.log_debug("Work %s has condition dependencie %s" % (work.get_internal_id(), json_dumps(cond, sort_keys=True, indent=4))) @@ -1158,7 +1598,7 @@ def sync_works(self): if work.is_terminated(): self.log_info("Work %s is terminated(%s)" % (work.get_internal_id(), work.get_status())) self.log_debug("Work conditions: %s" % json_dumps(self.work_conds, sort_keys=True, indent=4)) - if work.get_template_id() not in self.work_conds: + if work.get_internal_id() not in self.work_conds: # has no next work self.log_info("Work %s has no condition dependencies" % work.get_internal_id()) self.terminated_works.append(work.get_internal_id()) @@ -1183,6 +1623,11 @@ def sync_works(self): self.num_cancelled_works += 1 elif work.is_suspended(): self.num_suspended_works += 1 + + # if work.is_terminated(): + # # if it's a loop workflow, to generate new loop + # if isinstance(work, Workflow): + # work.sync_works() log_str = "num_total_works: %s" % self.num_total_works log_str += ", num_finished_works: %s" % self.num_finished_works log_str += ", num_subfinished_works: %s" % self.num_subfinished_works @@ -1192,7 +1637,11 @@ def sync_works(self): log_str += ", num_suspended_works: %s" % self.num_suspended_works self.log_debug(log_str) + self.refresh_works() + self.log_debug("synchronized works") + def resume_works(self): + self.to_cancel = False self.num_subfinished_works = 0 self.num_finished_works = 0 self.num_failed_works = 0 @@ -1206,7 +1655,90 @@ def resume_works(self): self.terminated_works = [] self.current_running_works = self.current_running_works + t_works for work in [self.works[k] for k in self.current_running_works]: - work.resume_work() + if isinstance(work, Workflow): + work.resume_works() + else: + work.resume_work() + + def get_relation_data(self, work): + ret = {'work': {'workload_id': work.workload_id, + 'external_id': work.external_id, + 'work_name': work.get_work_name()}} + if hasattr(work, 'get_ancestry_works'): + ret['work']['ancestry_works'] = work.get_ancestry_works() + + next_works = work.next_works + if next_works: + next_works_data = [] + for next_id in next_works: + next_work = self.works[next_id] + if isinstance(next_work, Workflow): + next_work_data = next_work.get_relation_map() + else: + next_work_data = self.get_relation_data(next_work) + next_works_data.append(next_work_data) + ret['next_works'] = next_works_data + return ret + + def organzie_based_on_ancestry_works(self, works): + new_ret = [] + + ordered_items = {} + left_items = [] + for item in works: + if type(item) in [dict]: + if 'ancestry_works' not in item['work'] or not item['work']['ancestry_works']: + new_ret.append(item) + ordered_items[item['work']['work_name']] = item + else: + # ancestry_works = 
item['work']['ancestry_works'] + left_items.append(item) + elif type(item) in [list]: + # subworkflow + # work_names, ancestry_works = self.get_workflow_ancestry_works(item) + # if not ancestry_works: + # new_ret.append(item) + # currently now support to use dependency_map to depend_on a workflow. + # depending on a workflow should use Condition. It's already processed. + new_ret.append(item) + while True: + new_left_items = left_items + left_items = [] + has_updates = False + for item in new_left_items: + ancestry_works = item['work']['ancestry_works'] + all_ancestry_ready = True + for work_name in ancestry_works: + if work_name not in ordered_items and work_name != item['work']['work_name']: + all_ancestry_ready = False + if all_ancestry_ready: + for work_name in ancestry_works: + if work_name != item['work']['work_name']: + if 'next_works' not in ordered_items[work_name]: + ordered_items[work_name]['next_works'] = [item] + else: + ordered_items[work_name]['next_works'].append(item) + has_updates = True + ordered_items[item['work']['work_name']] = item + else: + left_items.append(item) + if not has_updates or not left_items: + break + for item in left_items: + new_ret.append(item) + return new_ret + + def get_relation_map(self): + ret = [] + init_works = self.init_works + for internal_id in init_works: + if isinstance(self.works[internal_id], Workflow): + work_data = self.works[internal_id].get_relation_map() + else: + work_data = self.get_relation_data(self.works[internal_id]) + ret.append(work_data) + ret = self.organzie_based_on_ancestry_works(ret) + return ret def clean_works(self): self.num_subfinished_works = 0 @@ -1221,7 +1753,7 @@ def clean_works(self): self.terminated_works = [] self.current_running_works = [] - self.works = {} + # self.works = {} self.work_sequence = {} # order list self.first_initial = False @@ -1248,8 +1780,8 @@ def is_terminated(self): """ *** Function called by Marshaller agent. 
""" - self.sync_works() - if len(self.new_to_run_works) == 0 and len(self.current_running_works) == 0: + self.sync_works(to_cancel=self.to_cancel) + if (self.to_cancel or len(self.new_to_run_works) == 0) and len(self.current_running_works) == 0: return True return False @@ -1330,6 +1862,25 @@ def get_terminated_msg(self): return self.works[self.last_work].get_terminated_msg() return None + def get_status(self): + if self.is_terminated(): + if self.is_finished(): + return WorkStatus.Finished + elif self.is_subfinished(): + return WorkStatus.SubFinished + elif self.is_failed(): + return WorkStatus.Failed + elif self.is_expired(): + return WorkStatus.Expired + elif self.is_cancelled(): + return WorkStatus.Cancelled + elif self.is_suspended(): + return WorkStatus.Suspended + return WorkStatus.Transforming + + def depend_on(self, work): + return False + def add_proxy(self): self.proxy = get_proxy() if not self.proxy: @@ -1337,3 +1888,429 @@ def add_proxy(self): def get_proxy(self): return self.proxy + + +class Workflow(Base): + def __init__(self, name=None, workload_id=None, lifetime=None, pending_time=None, logger=None): + # super(Workflow, self).__init__(name=name, workload_id=workload_id, lifetime=lifetime, pending_time=pending_time, logger=logger) + self.logger = logger + if self.logger is None: + self.setup_logger() + + self.template = WorkflowBase(name=name, workload_id=workload_id, lifetime=lifetime, pending_time=pending_time, logger=logger) + self.parent_num_run = None + self._num_run = 0 + self.runs = {} + self.loop_condition_position = 'end' + self.origin_metadata = None + + # for old idds version + t_works = self.template.works + if not t_works and hasattr(self, 'works_template'): + self.template.works = self.works_template + + def setup_logger(self): + # Setup logger + self.logger = logging.getLogger(self.get_class_name()) + + def log_info(self, info): + if self.logger is None: + self.setup_logger() + self.logger.info(info) + + def log_debug(self, info): + if self.logger is None: + self.setup_logger() + self.logger.debug(info) + + def __deepcopy__(self, memo): + logger = self.logger + self.logger = None + + cls = self.__class__ + result = cls.__new__(cls) + + memo[id(self)] = result + + # Deep copy all other attributes + for k, v in self.__dict__.items(): + setattr(result, k, copy.deepcopy(v, memo)) + + self.logger = logger + result.logger = logger + return result + + def get_template_id(self): + return self.template.get_template_id() + + @property + def metadata(self): + run_metadata = {'parent_num_run': self.parent_num_run, + 'num_run': self._num_run, + 'runs': {}} + for run_id in self.runs: + run_metadata['runs'][run_id] = self.runs[run_id].metadata + if not self.runs: + run_metadata['parameter_links'] = self.template.get_parameter_links_metadata() + return run_metadata + + @metadata.setter + def metadata(self, value): + self.template.load_metadata() + self.origin_metadata = value + + run_metadata = value + self.parent_num_run = run_metadata['parent_num_run'] + self._num_run = run_metadata['num_run'] + runs = run_metadata['runs'] + if not runs and 'parameter_links' in run_metadata: + parameter_links = run_metadata['parameter_links'] + self.template.set_parameter_links_metadata(parameter_links) + for run_id in runs: + self.runs[run_id] = self.template.copy() + self.runs[run_id].metadata = runs[run_id] + # self.add_metadata_item('runs', ) + + @property + def independent_works(self): + if self.runs: + return self.runs[str(self.num_run)].independent_works + return 
self.template.independent_works + + @independent_works.setter + def independent_works(self, value): + if self.runs: + self.runs[str(self.num_run)].independent_works = value + self.template.independent_works = value + + def add_next_work(self, work_id): + if self.runs: + self.runs[str(self.num_run)].add_next_work(work_id) + else: + raise Exception("There are no runs. It should not have next work") + + @property + def last_updated_at(self): + if self.runs: + return self.runs[str(self.num_run)].last_updated_at + return None + + @last_updated_at.setter + def last_updated_at(self, value): + if self.runs: + self.runs[str(self.num_run)].last_updated_at = value + + @property + def name(self): + return self.template.name + + @name.setter + def name(self, value): + self.template.name = value + + @property + def username(self): + return self.template.username + + @username.setter + def username(self, value): + self.template.username = value + + @property + def userdn(self): + return self.template.userdn + + @userdn.setter + def userdn(self, value): + self.template.userdn = value + + @property + def lifetime(self): + return self.template.lifetime + + @lifetime.setter + def lifetime(self, value): + self.template.lifetime = value + + @property + def to_cancel(self): + return self.template.to_cancel + + @to_cancel.setter + def to_cancel(self, value): + if self.runs: + self.runs[str(self.num_run)].to_cancel = value + self.template.to_cancel = value + + @property + def num_run(self): + if self.parent_num_run: + # return self.parent_num_run * 100 + self._num_run + pass + return self._num_run + + @num_run.setter + def num_run(self, value): + if self.parent_num_run: + # self._num_run = value - self.parent_num_run * 100 + self._num_run = value + else: + self._num_run = value + + @property + def transforming(self): + if self.runs and str(self.num_run) in self.runs: + return True + return False + + @transforming.setter + def transforming(self, value): + if self._num_run < 1: + self._num_run = 1 + if str(self.num_run) not in self.runs: + self.runs[str(self.num_run)] = self.template.copy() + self.runs[str(self.num_run)].num_run = self.num_run + if self.runs[str(self.num_run)].has_loop_condition(): + self.runs[str(self.num_run)].num_run = self.num_run + if self._num_run > 1: + p_metadata = self.runs[str(self.num_run - 1)].get_metadata_item('parameter_links') + self.runs[str(self.num_run)].add_metadata_item('parameter_links', p_metadata) + + def set_workload_id(self, workload_id): + if self.runs: + self.runs[str(self.num_run)].workload_id = workload_id + else: + self.template.workload_id = workload_id + # self.dynamic.workload_id = workload_id + + def get_internal_id(self): + if self.runs: + return self.runs[str(self.num_run)].get_internal_id() + return self.template.get_internal_id() + + def get_workload_id(self): + if self.runs: + return self.runs[str(self.num_run)].workload_id + return self.template.workload_id + + def add_work(self, work, initial=False, primary=False): + self.template.add_work(work, initial, primary) + + def add_condition(self, cond): + self.template.add_condition(cond) + + def add_parameter_link(self, work_source, work_destinations, parameter_link): + self.template.add_parameter_link(work_source, work_destinations, parameter_link) + + def find_workflow_from_work(self, work): + return self.template.find_workflow_from_work(work) + + def find_parameter_links_from_id(self, internal_id): + if self.runs: + return self.runs[str(self.num_run)].find_parameter_links_from_id(internal_id) + return 
self.template.find_parameter_links_from_id(internal_id) + + def refresh_parameter_links(self): + if self.runs: + self.runs[str(self.num_run)].refresh_parameter_links() + + def set_global_parameters(self, value): + self.template.set_global_parameters(value) + + def set_sliced_global_parameters(self, source, index=0): + self.template.set_sliced_global_parameters(source, index) + + def sync_global_parameters_from_work(self, work): + if self.runs: + return self.runs[str(self.num_run)].sync_global_parameters_from_work(work) + return self.template.sync_global_parameters_from_work(work) + + def get_new_works(self): + self.log_debug("synchronizing works") + self.sync_works(to_cancel=self.to_cancel) + self.log_debug("synchronized works") + if self.runs: + return self.runs[str(self.num_run)].get_new_works() + return [] + + def get_current_works(self): + self.sync_works(to_cancel=self.to_cancel) + if self.runs: + return self.runs[str(self.num_run)].get_current_works() + return [] + + def get_all_works(self): + self.sync_works(to_cancel=self.to_cancel) + if self.runs: + return self.runs[str(self.num_run)].get_all_works() + return [] + + def get_primary_initial_collection(self): + if self.runs: + return self.runs[str(self.num_run)].get_primary_initial_collection() + return self.template.get_primary_initial_collection() + + def resume_works(self): + if self.runs: + self.runs[str(self.num_run)].resume_works() + self.template.to_cancel = False + + def clean_works(self): + # if self.runs: + # self.runs[str(self.num_run)].clean_works() + self.parent_num_run = None + self._num_run = 0 + self.runs = {} + + def is_to_expire(self, expired_at=None, pending_time=None, request_id=None): + if self.runs: + return self.runs[str(self.num_run)].is_to_expire(expired_at=expired_at, pending_time=pending_time, request_id=request_id) + return False + + def is_terminated(self): + if self.runs: + if self.runs[str(self.num_run)].is_terminated(): + if not self.runs[str(self.num_run)].has_loop_condition() or not self.runs[str(self.num_run)].get_loop_condition_status(): + return True + return False + + def is_finished(self): + if self.is_terminated(): + return self.runs[str(self.num_run)].is_finished() + return False + + def is_subfinished(self): + if self.is_terminated(): + return self.runs[str(self.num_run)].is_subfinished() + return False + + def is_failed(self): + if self.is_terminated(): + return self.runs[str(self.num_run)].is_failed() + return False + + def is_expired(self): + if self.is_terminated(): + return self.runs[str(self.num_run)].is_expired() + return False + + def is_cancelled(self): + if self.is_terminated(): + return self.runs[str(self.num_run)].is_cancelled() + return False + + def is_suspended(self): + if self.is_terminated(): + return self.runs[str(self.num_run)].is_suspended() + return False + + def get_terminated_msg(self): + if self.is_terminated(): + return self.runs[str(self.num_run)].get_terminated_msg() + return None + + def get_status(self): + if not self.runs: + return WorkStatus.New + if not self.is_terminated(): + return WorkStatus.Transforming + return self.runs[str(self.num_run)].get_status() + + def depend_on(self, work): + return self.template.depend_on(work) + + def add_proxy(self): + self.template.add_proxy() + + def get_proxy(self): + self.template.get_proxy() + + def add_loop_condition(self, condition, position='end'): + if not position or position != 'begin': + position = 'end' + position = 'end' # force position to end currently. position = 'begin' is not supported now. 
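Because position is forced to 'end' here, the loop condition is evaluated only after the current run terminates: sync_works() below then copies the template into runs[num_run + 1], carrying the parameter_links metadata and global parameters forward. A hedged sketch of the intended call pattern, where make_work() and make_loop_condition() are hypothetical helpers standing in for real Work and condition construction:

    from idds.workflow.workflow import Workflow

    wf = Workflow(name='loop_example')                  # wrapper around a WorkflowBase template
    work = make_work()                                  # hypothetical: build a concrete Work
    wf.add_work(work, initial=True, primary=True)
    wf.add_loop_condition(make_loop_condition(work))    # hypothetical condition object

    new_works = wf.get_new_works()   # materializes run 1 from the template
    # ... works execute and terminate ...
    wf.sync_works()                  # starts run 2 if the loop condition evaluates True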
+ self.template.add_loop_condition(condition, position=position) + self.loop_condition_position = position + + def refresh_works(self): + if self.runs: + self.runs[str(self.num_run)].refresh_works() + + def sync_works(self, to_cancel=False): + if to_cancel: + self.to_cancel = to_cancel + + origin_metadata = self.origin_metadata + t_works = self.template.works + if not t_works and hasattr(self, 'works_template'): + self.template.works = self.works_template + if origin_metadata: + self.metadata = origin_metadata + + # position is end. + if self.num_run < 1: + self.num_run = 1 + if str(self.num_run) not in self.runs: + self.runs[str(self.num_run)] = self.template.copy() + + t_works = self.runs[str(self.num_run)].works + if not t_works and hasattr(self, 'works_template'): + self.runs[str(self.num_run)].works = self.works_template + if origin_metadata: + self.metadata = origin_metadata + + self.runs[str(self.num_run)].num_run = self.num_run + if self.runs[str(self.num_run)].has_loop_condition(): + self.runs[str(self.num_run)].num_run = self.num_run + if self.num_run > 1: + p_metadata = self.runs[str(self.num_run - 1)].get_metadata_item('parameter_links') + self.runs[str(self.num_run)].add_metadata_item('parameter_links', p_metadata) + + t_works = self.runs[str(self.num_run)].works + if not t_works and hasattr(self, 'works_template'): + self.runs[str(self.num_run)].works = self.works_template + if origin_metadata: + self.metadata = origin_metadata + + self.runs[str(self.num_run)].sync_works(to_cancel=to_cancel) + + if self.runs[str(self.num_run)].is_terminated(): + if to_cancel: + self.logger.info("num_run %s, to cancel" % self.num_run) + else: + if self.runs[str(self.num_run)].has_loop_condition(): + if self.runs[str(self.num_run)].get_loop_condition_status(): + self.logger.info("num_run %s get_loop_condition_status %s, start next run" % (self.num_run, self.runs[str(self.num_run)].get_loop_condition_status())) + self._num_run += 1 + self.runs[str(self.num_run)] = self.template.copy() + + self.runs[str(self.num_run)].num_run = self.num_run + p_metadata = self.runs[str(self.num_run - 1)].get_metadata_item('parameter_links') + self.runs[str(self.num_run)].add_metadata_item('parameter_links', p_metadata) + + self.runs[str(self.num_run)].global_parameters = self.runs[str(self.num_run - 1)].global_parameters + else: + self.logger.info("num_run %s get_loop_condition_status %s, terminated loop" % (self.num_run, self.runs[str(self.num_run)].get_loop_condition_status())) + + def get_relation_map(self): + if not self.runs: + return [] + if self.template.has_loop_condition(): + rets = {} + for run in self.runs: + rets[run] = self.runs[run].get_relation_map() + return [rets] + else: + return self.runs[str(self.num_run)].get_relation_map() + + +class SubWorkflow(Workflow): + def __init__(self, name=None, workload_id=None, lifetime=None, pending_time=None, logger=None): + # Init a workflow. + super(SubWorkflow, self).__init__(name=name, workload_id=workload_id, lifetime=lifetime, pending_time=pending_time, logger=logger) + + +class LoopWorkflow(Workflow): + def __init__(self, name=None, workload_id=None, lifetime=None, pending_time=None, logger=None): + # Init a workflow. 
+ super(LoopWorkflow, self).__init__(name=name, workload_id=workload_id, lifetime=lifetime, pending_time=pending_time, logger=logger) diff --git a/workflow/lib/idds/workflow/workflow1.py b/workflow/lib/idds/workflow/workflow1.py deleted file mode 100644 index e0f60f8d..00000000 --- a/workflow/lib/idds/workflow/workflow1.py +++ /dev/null @@ -1,1655 +0,0 @@ -#!/usr/bin/env python -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0OA -# -# Authors: -# - Wen Guan, , 2020 - 2021 - -import copy -import datetime -import logging -import inspect -import random -import time -import uuid - - -from idds.common import exceptions -from idds.common.constants import IDDSEnum -from idds.common.utils import json_dumps, setup_logging, get_proxy -from idds.common.utils import str_to_date -from .base import Base -from .work import Work - - -setup_logging(__name__) - - -class ConditionOperator(IDDSEnum): - And = 0 - Or = 1 - - -class ConditionTrigger(IDDSEnum): - NotTriggered = 0 - ToTrigger = 1 - Triggered = 2 - - -class CompositeCondition(Base): - def __init__(self, operator=ConditionOperator.And, conditions=[], true_works=None, false_works=None, logger=None): - self._conditions = [] - self._true_works = [] - self._false_works = [] - - super(CompositeCondition, self).__init__() - - self.internal_id = str(uuid.uuid4())[:8] - self.template_id = self.internal_id - # self.template_id = str(uuid.uuid4())[:8] - - self.logger = logger - if self.logger is None: - self.setup_logger() - - if conditions is None: - conditions = [] - if true_works is None: - true_works = [] - if false_works is None: - false_works = [] - if conditions and type(conditions) not in [tuple, list]: - conditions = [conditions] - if true_works and type(true_works) not in [tuple, list]: - true_works = [true_works] - if false_works and type(false_works) not in [tuple, list]: - false_works = [false_works] - self.validate_conditions(conditions) - - self.operator = operator - self.conditions = [] - self.true_works = [] - self.false_works = [] - - self.conditions = conditions - self.true_works = true_works - self.false_works = false_works - - def get_class_name(self): - return self.__class__.__name__ - - def get_internal_id(self): - return self.internal_id - - def get_template_id(self): - return self.template_id - - @property - def conditions(self): - # return self.get_metadata_item('true_works', []) - return self._conditions - - @conditions.setter - def conditions(self, value): - self._conditions = value - new_value = [] - for cond in value: - if cond is None: - continue - if inspect.ismethod(cond): - new_cond = {'idds_method': cond.__name__, - 'idds_method_internal_id': cond.__self__.get_internal_id(), - 'idds_method_class_id': cond.__self__.get_template_id()} - else: - new_cond = cond - new_value.append(new_cond) - self.add_metadata_item('conditions', new_value) - - @property - def true_works(self): - # return self.get_metadata_item('true_works', []) - return self._true_works - - @true_works.setter - def true_works(self, value): - self._true_works = value - true_work_meta = self.get_metadata_item('true_works', {}) - for work in value: - if work is None: - continue - if isinstance(work, Work): - if work.get_template_id() not in true_work_meta: - true_work_meta[work.get_internal_id()] = {'triggered': False, - 'template_id': work.get_template_id()} - elif isinstance(work, 
CompositeCondition): - if work.get_template_id() not in true_work_meta: - true_work_meta[work.get_internal_id()] = {'triggered': False, - 'template_id': work.get_template_id(), - 'metadata': work.metadata} - elif isinstance(work, Workflow): - if work.get_template_id() not in true_work_meta: - true_work_meta[work.get_internal_id()] = {'triggered': False, - 'template_id': work.get_template_id(), - 'metadata': work.metadata} - self.add_metadata_item('true_works', true_work_meta) - - @property - def false_works(self): - # return self.get_metadata_item('false_works', []) - return self._false_works - - @false_works.setter - def false_works(self, value): - self._false_works = value - false_work_meta = self.get_metadata_item('false_works', {}) - for work in value: - if work is None: - continue - if isinstance(work, Work): - if work.get_template_id() not in false_work_meta: - false_work_meta[work.get_internal_id()] = {'triggered': False, - 'template_id': work.get_template_id()} - elif isinstance(work, CompositeCondition): - if work.get_template_id() not in false_work_meta: - false_work_meta[work.get_internal_id()] = {'triggered': False, - 'template_id': work.get_template_id(), - 'metadata': work.metadata} - elif isinstance(work, Workflow): - if work.get_template_id() not in false_work_meta: - false_work_meta[work.get_internal_id()] = {'triggered': False, - 'template_id': work.get_template_id(), - 'metadata': work.metadata} - self.add_metadata_item('false_works', false_work_meta) - - def validate_conditions(self, conditions): - if type(conditions) not in [tuple, list]: - raise exceptions.IDDSException("conditions must be list") - for cond in conditions: - assert(inspect.ismethod(cond)) - assert(isinstance(cond.__self__, Work)) - if (cond.__self__.is_template): - raise exceptions.IDDSException("Work class for CompositeCondition must not be a template") - - def add_condition(self, cond): - assert(inspect.ismethod(cond)) - assert(isinstance(cond.__self__, Work)) - if (cond.__self__.is_template): - raise exceptions.IDDSException("Work class for CompositeCondition must not be a template") - - # self.conditions.append({'condition': cond, 'current_work': cond.__self__}) - - self._conditions.append(cond) - new_value = self.get_metadata_item('conditions', []) - if inspect.ismethod(cond): - new_cond = {'idds_method': cond.__name__, - 'idds_method_internal_id': cond.__self__.get_internal_id(), - 'idds_method_class_id': cond.__self__.get_template_id()} - else: - new_cond = cond - new_value.append(new_cond) - self.add_metadata_item('conditions', new_value) - - def load_metadata(self): - # conditions = self.get_metadata_item('conditions', []) - true_works_meta = self.get_metadata_item('true_works', {}) - false_works_meta = self.get_metadata_item('false_works', {}) - - for work in self.true_works: - if isinstance(work, CompositeCondition) or isinstance(work, Workflow): - if work.get_internal_id() in true_works_meta: - work.metadata = true_works_meta[work.get_internal_id()]['metadata'] - for work in self.false_works: - if isinstance(work, CompositeCondition) or isinstance(work, Workflow): - if work.get_internal_id() in false_works_meta: - work.metadata = false_works_meta[work.get_internal_id()]['metadata'] - - def to_dict(self): - # print('to_dict') - ret = {'class': self.__class__.__name__, - 'module': self.__class__.__module__, - 'attributes': {}} - for key, value in self.__dict__.items(): - # print(key) - # print(value) - # if not key.startswith('__') and not key.startswith('_'): - if not key.startswith('__'): - 
if key == 'logger': - value = None - elif key == '_conditions': - new_value = [] - for cond in value: - if inspect.ismethod(cond): - new_cond = {'idds_method': cond.__name__, - 'idds_method_internal_id': cond.__self__.get_internal_id(), - 'idds_method_class_id': cond.__self__.get_template_id()} - else: - new_cond = cond - new_value.append(new_cond) - value = new_value - elif key in ['_true_works', '_false_works']: - new_value = [] - for w in value: - if isinstance(w, Work): - new_w = w.get_template_id() - elif isinstance(w, CompositeCondition): - new_w = w.to_dict() - elif isinstance(w, Workflow): - new_w = w.to_dict() - else: - new_w = w - new_value.append(new_w) - value = new_value - else: - value = self.to_dict_l(value) - ret['attributes'][key] = value - return ret - - def get_work_from_id(self, work_id, class_id, works, works_template): - for w_id in works: - if works[w_id].get_internal_id() == work_id: - return works[w_id] - for w_id in works_template: - if works_template[w_id].get_template_id() == class_id: - return works_template[w_id] - return None - - def load_conditions(self, works, works_template): - new_conditions = [] - for cond in self.conditions: - if 'idds_method' in cond and 'idds_method_class_id' in cond: - internal_id = cond['idds_method_internal_id'] - class_id = cond['idds_method_class_id'] - work = self.get_work_from_id(internal_id, class_id, works, works_template) - if work is not None: - new_cond = getattr(work, cond['idds_method']) - else: - self.logger.error("Work cannot be found for %s:%s" % (internal_id, class_id)) - new_cond = cond - else: - new_cond = cond - new_conditions.append(new_cond) - self.conditions = new_conditions - - new_true_works = [] - for w in self.true_works: - class_id = self.true_works[w]['template_id'] - if isinstance(w, CompositeCondition): - # work = w.load_conditions(works, works_template) - w.load_conditions(works, works_template) - work = w - elif type(w) in [str]: - work = self.get_work_from_id(w, class_id, works, works_template) - if work is None: - self.logger.error("Work cannot be found for %s" % str(w)) - work = w - else: - self.logger.error("Work cannot be found for %s" % str(w)) - work = w - new_true_works.append(work) - self.true_works = new_true_works - - new_false_works = [] - for w in self.false_works: - class_id = self.true_works[w]['template_id'] - if isinstance(w, CompositeCondition): - # work = w.load_condtions(works, works_template) - w.load_conditions(works, works_template) - work = w - elif type(w) in [str]: - work = self.get_work_from_id(w, class_id, works, works_template) - if work is None: - self.logger.error("Work cannot be found for %s" % str(w)) - work = w - else: - self.logger.error("Work cannot be found for %s" % str(w)) - work = w - new_false_works.append(work) - self.false_works = new_false_works - - def all_works(self): - works = [] - works = works + self.all_pre_works() - works = works + self.all_next_works() - return works - - def all_condition_ids(self): - works = [] - for cond in self.conditions: - if inspect.ismethod(cond): - works.append(cond.__self__.get_internal_id()) - else: - self.logger.error("cond cannot be recognized: %s" % str(cond)) - works.append(cond) - for work in self.true_works + self.false_works: - if isinstance(work, CompositeCondition): - works = works + work.all_condition_ids() - return works - - def all_pre_works(self): - works = [] - for cond in self.conditions: - if inspect.ismethod(cond): - works.append(cond.__self__) - else: - self.logger.error("cond cannot be recognized: %s" % 
str(cond)) - works.append(cond) - for work in self.true_works + self.false_works: - if isinstance(work, CompositeCondition): - works = works + work.all_pre_works() - return works - - def all_next_works(self): - works = [] - for work in self.true_works + self.false_works: - if isinstance(work, CompositeCondition): - works = works + work.all_next_works() - else: - works.append(work) - return works - - def get_current_cond_status(self, cond): - if callable(cond): - if cond(): - return True - else: - return False - else: - if cond: - return True - else: - return False - - def get_cond_status(self): - if self.operator == ConditionOperator.And: - for cond in self.conditions: - if not self.get_current_cond_status(cond): - return False - return True - else: - for cond in self.conditions: - if self.get_current_cond_status(cond): - return True - return False - - def get_condition_status(self): - return self.get_cond_status() - - def is_condition_true(self): - if self.get_cond_status(): - return True - return False - - def is_condition_false(self): - if not self.get_cond_status(): - return True - return False - - def get_next_works(self, trigger=ConditionTrigger.NotTriggered): - works = [] - if self.get_cond_status(): - true_work_meta = self.get_metadata_item('true_works', {}) - for work in self.true_works: - if isinstance(work, CompositeCondition) or isinstance(work, Workflow): - works = works + work.get_next_works(trigger=trigger) - else: - if work.get_internal_id() not in true_work_meta: - true_work_meta[work.get_internal_id()] = {'triggered': False, - 'template_id': work.get_template_id()} - if trigger == ConditionTrigger.ToTrigger: - if not true_work_meta[work.get_internal_id()]['triggered']: - true_work_meta[work.get_internal_id()]['triggered'] = True - works.append(work) - elif work.get_is_template(): - # A template can be triggered many times. - works.append(work) - elif trigger == ConditionTrigger.NotTriggered: - if not true_work_meta[work.get_internal_id()]['triggered']: - works.append(work) - elif trigger == ConditionTrigger.Triggered: - if true_work_meta[work.get_internal_id()]['triggered']: - works.append(work) - self.add_metadata_item('true_works', true_work_meta) - else: - false_work_meta = self.get_metadata_item('false_works', {}) - for work in self.false_works: - if isinstance(work, CompositeCondition) or isinstance(work, Workflow): - works = works + work.get_next_works(trigger=trigger) - else: - if work.get_internal_id() not in false_work_meta: - false_work_meta[work.get_internal_id()] = {'triggered': False, - 'template_id': work.get_template_id()} - if trigger == ConditionTrigger.ToTrigger: - if not false_work_meta[work.get_internal_id()]['triggered']: - false_work_meta[work.get_internal_id()]['triggered'] = True - works.append(work) - elif work.get_is_template(): - # A template can be triggered many times. 
- works.append(work) - elif trigger == ConditionTrigger.NotTriggered: - if not false_work_meta[work.get_internal_id()]['triggered']: - works.append(work) - elif trigger == ConditionTrigger.Triggered: - if false_work_meta[work.get_internal_id()]['triggered']: - works.append(work) - self.add_metadata_item('false_works', false_work_meta) - return works - - def generate_condition_from_template(self): - logger = self.logger - self.logger = None - new_cond = copy.deepcopy(self) - self.logger = logger - new_cond.logger = logger - # new_work.template_work_id = self.get_internal_id() - new_cond.internal_id = str(uuid.uuid4())[:8] - return new_cond - - -class AndCondition(CompositeCondition): - def __init__(self, conditions=[], true_works=None, false_works=None, logger=None): - super(AndCondition, self).__init__(operator=ConditionOperator.And, - conditions=conditions, - true_works=true_works, - false_works=false_works, - logger=logger) - - -class OrCondition(CompositeCondition): - def __init__(self, conditions=[], true_works=None, false_works=None, logger=None): - super(OrCondition, self).__init__(operator=ConditionOperator.Or, - conditions=conditions, - true_works=true_works, - false_works=false_works, - logger=logger) - - -class Condition(CompositeCondition): - def __init__(self, cond=None, current_work=None, true_work=None, false_work=None, logger=None): - super(Condition, self).__init__(operator=ConditionOperator.And, - conditions=[cond] if cond else [], - true_works=[true_work] if true_work else [], - false_works=[false_work] if false_work else [], - logger=logger) - - # to support load from old conditions - @property - def cond(self): - # return self.get_metadata_item('true_works', []) - return self.conditions[0] if len(self.conditions) >= 1 else None - - @cond.setter - def cond(self, value): - self.conditions = [value] - - @property - def true_work(self): - # return self.get_metadata_item('true_works', []) - return self.true_works if len(self.true_works) >= 1 else None - - @true_work.setter - def true_work(self, value): - self.true_works = [value] - - @property - def false_work(self): - # return self.get_metadata_item('true_works', []) - return self.false_works if len(self.false_works) >= 1 else None - - @false_work.setter - def false_work(self, value): - self.false_works = [value] - - -class TemplateCondition(CompositeCondition): - def __init__(self, cond=None, current_work=None, true_work=None, false_work=None, logger=None): - if true_work is not None and not isinstance(true_work, Work): - raise exceptions.IDDSException("true_work can only be set with Work class") - if false_work is not None and not isinstance(false_work, Work): - raise exceptions.IDDSException("false_work can only be set with Work class") - - super(TemplateCondition, self).__init__(operator=ConditionOperator.And, - conditions=[cond] if cond else [], - true_works=[true_work] if true_work else [], - false_works=[false_work] if false_work else [], - logger=logger) - - def validate_conditions(self, conditions): - if type(conditions) not in [tuple, list]: - raise exceptions.IDDSException("conditions must be list") - if len(conditions) > 1: - raise exceptions.IDDSException("Condition class can only support one condition. To support multiple condition, please use CompositeCondition.") - for cond in conditions: - assert(inspect.ismethod(cond)) - assert(isinstance(cond.__self__, Work)) - - def add_condition(self, cond): - raise exceptions.IDDSException("Condition class doesn't support add_condition. 
To support multiple condition, please use CompositeCondition.") - - -class ParameterLink(Base): - def __init__(self, parameters): - self.parameters = parameters - - def get_parameter_value(self, work, p): - p_f = getattr(work, p, 'None') - if p_f: - if callable(p_f): - return p_f() - else: - return p_f - else: - return None - - def set_parameters(self, work): - p_values = {} - for p in self.parameters: - p_values[p] = self.get_parameter_value(work, p) - self.add_metadata_item('parameters', p_values) - - def get_parameters(self): - return self.get_metadata_item('parameters', {}) - - -class Workflow(Base): - - def __init__(self, name=None, workload_id=None, lifetime=None, pending_time=None, logger=None): - """ - Init a workflow. - """ - self._conditions = {} - self._conditions_temp = {} - self._work_conds = {} - self.conditions_template = {} - self.work_conds_template = {} - - self._parameter_links = {} - - super(Workflow, self).__init__() - - self.internal_id = str(uuid.uuid4())[:8] - self.template_work_id = self.internal_id - # self.template_work_id = str(uuid.uuid4())[:8] - self.lifetime = lifetime - self.pending_time = pending_time - - if name: - self._name = name + "." + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S_%f") + str(random.randint(1, 1000)) - else: - self._name = 'idds.workflow.' + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S_%f") + str(random.randint(1, 1000)) - - if workload_id is None: - workload_id = int(time.time()) - self.workload_id = workload_id - - self.logger = logger - if self.logger is None: - self.setup_logger() - - self.works_template = {} - self._works = {} - self.works = {} - self.work_sequence = {} # order list - - self.terminated_works = [] - self.initial_works = [] - # if the primary initial_work is not set, it's the first initial work. 
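The ParameterLink class in the deleted file above resolves each named parameter from a work object with getattr and calls the attribute when it turns out to be a bound method. A small standalone sketch of that lookup (DemoWork and get_parameter_value here are illustrative; the original additionally falls back to the string 'None' rather than None when the attribute is missing):

class DemoWork:
    # Hypothetical work-like object used only for this example.
    output_dataset = 'mc16:out.data'

    def get_num_events(self):
        return 1000


def get_parameter_value(work, name):
    # Mirror of ParameterLink.get_parameter_value: plain attributes are used
    # as-is, bound methods are called to obtain the value.
    attr = getattr(work, name, None)
    if attr is None:
        return None
    return attr() if callable(attr) else attr


w = DemoWork()
print(get_parameter_value(w, 'output_dataset'))    # mc16:out.data
print(get_parameter_value(w, 'get_num_events'))    # 1000
print(get_parameter_value(w, 'missing'))           # None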
- self.primary_initial_work = None - self.independent_works = [] - - self.first_initial = False - self.new_to_run_works = [] - self.current_running_works = [] - - self.num_subfinished_works = 0 - self.num_finished_works = 0 - self.num_failed_works = 0 - self.num_cancelled_works = 0 - self.num_suspended_works = 0 - self.num_expired_works = 0 - self.num_total_works = 0 - - self.last_work = None - - self.last_updated_at = datetime.datetime.utcnow() - self.expired = False - - self.to_update_transforms = {} - - # user defined Condition class - self.user_defined_conditions = {} - - self.username = None - self.userdn = None - self.proxy = None - - self.loop_condition_template = {} - self.loop_condition = None - - self.parameter_links_template = {} - self.parameter_links = {} - - """ - self._running_data_names = [] - for name in ['internal_id', 'template_work_id', 'workload_id', 'work_sequence', 'terminated_works', - 'first_initial', 'new_to_run_works', 'current_running_works', - 'num_subfinished_works', 'num_finished_works', 'num_failed_works', 'num_cancelled_works', 'num_suspended_works', - 'num_expired_works', 'num_total_works', 'last_work']: - self._running_data_names.append(name) - for name in ['works']: - self._running_data_names.append(name) - """ - - @property - def name(self): - return self._name - - @name.setter - def name(self, value): - self._name = value - - @property - def internal_id(self): - return self.get_metadata_item('internal_id') - - @internal_id.setter - def internal_id(self, value): - self.add_metadata_item('internal_id', value) - - @property - def template_work_id(self): - return self.get_metadata_item('template_work_id') - - @template_work_id.setter - def template_work_id(self, value): - self.add_metadata_item('template_work_id', value) - - def get_template_work_id(self): - return self.template_work_id - - def get_template_id(self): - return self.template_work_id - - @property - def workload_id(self): - return self.get_metadata_item('workload_id') - - @workload_id.setter - def workload_id(self, value): - self.add_metadata_item('workload_id', value) - - @property - def lifetime(self): - # return self.get_metadata_item('lifetime', None) - return getattr(self, '_lifetime', None) - - @lifetime.setter - def lifetime(self, value): - # self.add_metadata_item('lifetime', value) - self._lifetime = value - - @property - def pending_time(self): - # return self.get_metadata_item('pending_time', None) - return getattr(self, '_pending_time', None) - - @pending_time.setter - def pending_time(self, value): - # self.add_metadata_item('pending_time', value) - self._pending_time = value - - @property - def last_updated_at(self): - last_updated_at = self.get_metadata_item('last_updated_at', None) - if last_updated_at and type(last_updated_at) in [str]: - last_updated_at = str_to_date(last_updated_at) - return last_updated_at - - @last_updated_at.setter - def last_updated_at(self, value): - self.add_metadata_item('last_updated_at', value) - - def has_new_updates(self): - self.last_updated_at = datetime.datetime.utcnow() - - @property - def expired(self): - t = self.get_metadata_item('expired', False) - if type(t) in [bool]: - return t - elif type(t) in [str] and t.lower() in ['true']: - return True - else: - return False - - @expired.setter - def expired(self, value): - self.add_metadata_item('expired', value) - - @property - def works(self): - return self._works - - @works.setter - def works(self, value): - self._works = value - work_metadata = {} - if self._works: - for k in self._works: - 
work = self._works[k] - if isinstance(work, Workflow): - work_metadata[k] = {'internal_id': work.internal_id, - 'template_id': work.get_template_id(), - 'type': 'workflow', - 'metadata': work.metadata} - else: - work_metadata[k] = {'internal_id': work.internal_id, - 'template_id': work.get_template_id(), - 'type': 'work', - 'work_id': work.work_id, - 'status': work.status, - 'substatus': work.substatus, - 'transforming': work.transforming} - self.add_metadata_item('works', work_metadata) - - def refresh_works(self): - work_metadata = {} - if self._works: - for k in self._works: - work = self._works[k] - if isinstance(work, Workflow): - work.refresh_works() - work_metadata[k] = {'internal_id': work.internal_id, - 'template_id': work.get_template_id(), - 'type': 'workflow', - 'metadata': work.metadata} - else: - work_metadata[k] = {'internal_id': work.internal_id, - 'template_id': work.get_template_id(), - 'type': 'work', - 'work_id': work.work_id, - 'status': work.status, - 'substatus': work.substatus, - 'transforming': work.transforming} - if work.last_updated_at and (not self.last_updated_at or work.last_updated_at > self.last_updated_at): - self.last_updated_at = work.last_updated_at - self.add_metadata_item('works', work_metadata) - - def load_works(self): - work_metadata = self.get_metadata_item('works', {}) - for k in work_metadata: - if 'type' not in work_metadata[k]: - work_metadata[k]['type'] = 'work' - - if work_metadata[k]['type'] == 'work': - if k not in self._works: - template_id = work_metadata[k]['template_id'] - work_template = self.works_template[template_id] - new_work = work_template.generate_work_from_template() - new_work.work_id = work_metadata[k]['work_id'] - new_work.internal_id = work_metadata[k]['internal_id'] - self._works[k] = new_work - self._works[k].work_id = work_metadata[k]['work_id'] - self._works[k].transforming = work_metadata[k]['transforming'] - if 'status' in work_metadata[k]: - self._works[k].status = work_metadata[k]['status'] - self._works[k].substatus = work_metadata[k]['substatus'] - elif work_metadata[k]['type'] == 'workflow': - if k not in self._works: - template_id = work_metadata[k]['template_id'] - workflow_template = self.works_template[template_id] - new_workflow = workflow_template.generate_work_from_template() - new_workflow.metadata = work_metadata[k]['metadata'] - # new_workflow.load_works() - new_workflow.internal_id = work_metadata[k]['internal_id'] - self._works[k] = new_workflow - - work = self._works[k] - if work.last_updated_at and (not self.last_updated_at or work.last_updated_at > self.last_updated_at): - self.last_updated_at = work.last_updated_at - - @property - def conditions(self): - return self._conditions - - @conditions.setter - def conditions(self, value): - self._conditions = value - conditions_metadata = {} - if self._conditions: - for k in self._conditions: - conditions_metadata[k] = {'template_id': self._conditions[k].get_template_id(), - 'metadata': self._conditions[k].metadata} - self.add_metadata_item('conditions', conditions_metadata) - - @property - def conditions_temp(self): - return self._conditions_temp - - @conditions_temp.setter - def conditions_temp(self, value): - self._conditions_temp = value - conditions_metadata = {} - if self._conditions_temp: - conditions_metadata = self._conditions_temp - self.add_metadata_item('conditions_temp', conditions_metadata) - - @property - def work_conds(self): - return self._work_conds - - @work_conds.setter - def work_conds(self, value): - self._work_conds = value - 
self.add_metadata_item('work_conds', value) - - def load_work_conditions(self): - conditions_metadata = self.get_metadata_item('conditions', {}) - for cond_internal_id in conditions_metadata: - template_id = conditions_metadata[cond_internal_id]['template_id'] - cond_template = self.conditions_template[template_id] - cond = cond_template.generate_condition_from_template() - cond.metadata = conditions_metadata[cond_internal_id]['metadata'] - self.conditions[cond_internal_id] = cond - self.conditions[cond_internal_id].load_conditions(self.works, self.get_works_template()) - - work_conds = self.get_metadata_item('work_conds', {}) - self._work_conds = work_conds - - @property - def loop_condition(self): - return self._loop_condition - - @loop_condition.setter - def loop_condition(self, value): - self._loop_condition = value - self.add_metadata_item('loop_condition', self._loop_condition.get_condition_status()) - - @property - def work_sequence(self): - return self.get_metadata_item('work_sequence', {}) - - @work_sequence.setter - def work_sequence(self, value): - self.add_metadata_item('work_sequence', value) - - @property - def terminated_works(self): - return self.get_metadata_item('terminated_works', []) - - @terminated_works.setter - def terminated_works(self, value): - self.add_metadata_item('terminated_works', value) - - @property - def first_initial(self): - return self.get_metadata_item('first_initial', False) - - @first_initial.setter - def first_initial(self, value): - self.add_metadata_item('first_initial', value) - - @property - def new_to_run_works(self): - return self.get_metadata_item('new_to_run_works', []) - - @new_to_run_works.setter - def new_to_run_works(self, value): - self.add_metadata_item('new_to_run_works', value) - - @property - def current_running_works(self): - return self.get_metadata_item('current_running_works', []) - - @current_running_works.setter - def current_running_works(self, value): - self.add_metadata_item('current_running_works', value) - - @property - def num_subfinished_works(self): - return self.get_metadata_item('num_subfinished_works', 0) - - @num_subfinished_works.setter - def num_subfinished_works(self, value): - self.add_metadata_item('num_subfinished_works', value) - - @property - def num_finished_works(self): - return self.get_metadata_item('num_finished_works', 0) - - @num_finished_works.setter - def num_finished_works(self, value): - self.add_metadata_item('num_finished_works', value) - - @property - def num_failed_works(self): - return self.get_metadata_item('num_failed_works', 0) - - @num_failed_works.setter - def num_failed_works(self, value): - self.add_metadata_item('num_failed_works', value) - - @property - def num_cancelled_works(self): - return self.get_metadata_item('num_cancelled_works', 0) - - @num_cancelled_works.setter - def num_cancelled_works(self, value): - self.add_metadata_item('num_cancelled_works', value) - - @property - def num_suspended_works(self): - return self.get_metadata_item('num_suspended_works', 0) - - @num_suspended_works.setter - def num_suspended_works(self, value): - self.add_metadata_item('num_suspended_works', value) - - @property - def num_expired_works(self): - return self.get_metadata_item('num_expired_works', 0) - - @num_expired_works.setter - def num_expired_works(self, value): - self.add_metadata_item('num_expired_works', value) - - @property - def num_total_works(self): - return self.get_metadata_item('num_total_works', 0) - - @num_total_works.setter - def num_total_works(self, value): - 
self.add_metadata_item('num_total_works', value) - - @property - def last_work(self): - return self.get_metadata_item('last_work', None) - - @last_work.setter - def last_work(self, value): - self.add_metadata_item('last_work', value) - - @property - def to_update_transforms(self): - return self.get_metadata_item('to_update_transforms', {}) - - @to_update_transforms.setter - def to_update_transforms(self, value): - self.add_metadata_item('to_update_transforms', value) - - def load_metadata(self): - self.load_works() - self.load_work_conditions() - - def get_class_name(self): - return self.__class__.__name__ - - def setup_logger(self): - """ - Setup logger - """ - self.logger = logging.getLogger(self.get_class_name()) - - def log_info(self, info): - if self.logger is None: - self.setup_logger() - self.logger.info(info) - - def log_debug(self, info): - if self.logger is None: - self.setup_logger() - self.logger.debug(info) - - def get_internal_id(self): - return self.internal_id - - def copy(self): - new_wf = copy.deepcopy(self) - return new_wf - - def __deepcopy__(self, memo): - logger = self.logger - self.logger = None - - cls = self.__class__ - result = cls.__new__(cls) - - memo[id(self)] = result - - # Deep copy all other attributes - for k, v in self.__dict__.items(): - setattr(result, k, copy.deepcopy(v, memo)) - - self.logger = logger - result.logger = logger - return result - - def get_works_template(self): - return self.works_template - - def generate_work_from_template(self): - logger = self.logger - self.logger = None - new_workflow = copy.deepcopy(self) - self.logger = logger - new_workflow.logger = logger - # new_work.template_work_id = self.get_internal_id() - new_workflow.internal_id = str(uuid.uuid4())[:8] - return new_workflow - - def add_work_template(self, work): - self.works_template[work.get_template_id()] = work - - def get_new_work_from_template(self, work_id, new_parameters=None): - # 1. initialize works - # template_id = work.get_template_id() - template_id = work_id - work = self.works_template[template_id] - new_work = work.generate_work_from_template() - if new_parameters: - new_work.set_parameters(new_parameters) - new_work.sequence_id = self.num_total_works - if isinstance(new_work, Workflow): - pass - else: - new_work.initialize_work() - works = self.works - works[new_work.get_internal_id()] = new_work - self.works = works - # self.work_sequence.append(new_work.get_internal_id()) - self.work_sequence[str(self.num_total_works)] = new_work.get_internal_id() - if isinstance(new_work, Workflow): - self.num_total_works += 1 - else: - self.num_total_works += 1 - self.new_to_run_works.append(new_work.get_internal_id()) - self.last_work = new_work.get_internal_id() - - # 2. 
initialize conditions related to this work - if template_id in self.work_conds_template: - conds = self.work_conds_template[template_id] - self.work_conds[new_work.get_internal_id()] = [] - for cond_template_id in conds: - cond_template = self.conditions_template[cond_template_id] - if not cond_template.has_multiple_pre_works(): - cond = cond_template.generate_new_cond_from_template() - self.conditions[cond.get_internal_id()] = cond - self.work_conds[new_work.get_internal_id()].append(cond.get_internal_id()) - cond.attach_pre_work(new_work) - else: - if cond_template_id in self.conditions_temp: - # condition is already created, for example, for AndCondition or OrCondition - # cond_temp will be cleaned when a new loop is created in LoopWorkflow - cond_internal_id = self.conditions_temp[cond_template_id] - self.work_conds[new_work.get_internal_id()].append(cond_internal_id) - cond = self.conditions[cond_internal_id] - cond.attach_pre_work(new_work) - else: - cond = cond_template.generate_new_cond_from_template() - self.conditions[cond.get_internal_id()] = cond - self.work_conds[new_work.get_internal_id()].append(cond.get_internal_id()) - self.conditions_temp[cond_template_id] = cond.get_internal_id() - cond.attach_pre_work(new_work) - return new_work - - def register_user_defined_condition(self, condition): - cond_src = inspect.getsource(condition) - self.user_defined_conditions[condition.__name__] = cond_src - - def load_user_defined_condition(self): - # try: - # Condition() - # except NameError: - # global Condition - # import Condition - - for cond_src_name in self.user_defined_conditions: - # global cond_src_name - exec(self.user_defined_conditions[cond_src_name]) - - def set_workload_id(self, workload_id): - self.workload_id = workload_id - - def get_workload_id(self): - return self.workload_id - - def add_work(self, work, initial=False, primary=False): - self.first_initial = False - self.add_work_template(work) - if initial: - if primary: - self.primary_initial_work = work.get_template_id() - self.add_initial_works(work) - - self.independent_works.append(work.get_template_id()) - - def add_condition(self, cond): - self.first_initial = False - cond_works = cond.all_works() - for cond_work in cond_works: - assert(cond_work.get_template_id() in self.get_works_template()) - - if cond.get_template_id() not in self.conditions_template: - conditions = self.conditions_template - conditions[cond.get_template_id()] = cond - self.conditions_template = conditions - - # if cond.current_work not in self.work_conds: - # self.work_conds[cond.current_work] = [] - # self.work_conds[cond.current_work].append(cond) - work_conds = self.work_conds_template - for work in cond.all_pre_works(): - if work.get_template_id() not in work_conds: - work_conds[work.get_template_id()] = [] - work_conds[work.get_template_id()].append(cond.get_template_id()) - self.work_conds_template = work_conds - - # if a work is a true_work or false_work of a condition, - # should remove it from independent_works - cond_next_works = cond.all_next_works() - for next_work in cond_next_works: - if next_work.get_template_id() in self.independent_works: - self.independent_works.remove(next_work.get_template_id()) - - def add_initial_works(self, work): - assert(work.get_template_id() in self.get_works_template()) - self.initial_works.append(work.get_template_id()) - if self.primary_initial_work is None: - self.primary_initial_work = work.get_template_id() - - def enable_next_works(self, work, cond): - self.log_debug("Checking Work %s 
condition: %s" % (work.get_internal_id(), - json_dumps(cond, sort_keys=True, indent=4))) - # load_conditions should cover it. - # if cond and self.is_class_method(cond.cond): - # # cond_work_id = self.works[cond.cond['idds_method_class_id']] - # cond.cond = getattr(work, cond.cond['idds_method']) - - self.log_info("Work %s condition: %s" % (work.get_internal_id(), cond.conditions)) - next_works = cond.get_next_works(trigger=ConditionTrigger.ToTrigger) - self.log_info("Work %s condition status %s" % (work.get_internal_id(), cond.get_cond_status())) - self.log_info("Work %s next works %s" % (work.get_internal_id(), str(next_works))) - new_next_works = [] - if next_works is not None: - for next_work in next_works: - new_parameters = work.get_parameters_for_next_task() - new_next_work = self.get_new_work_from_template(next_work.get_template_id(), new_parameters) - work.add_next_work(new_next_work.get_internal_id()) - # cond.add_condition_work(new_next_work) ####### TODO: - new_next_works.append(new_next_work) - return new_next_works - - def add_loop_condition(self, condition, position='end'): - self.loop_condition_template = {'position': position, - 'condition': condition} - - def has_loop_condition(self): - if self.loop_condition_template and 'condition' in self.loop_condition_template: - return True - return False - - def get_loop_condition_status(self): - if self.has_loop_condition(): - cond_template = self.loop_condition_template['condition'] - loop_condition = cond_template.generate_condition_from_template() - loop_condition.load_conditions(self.works, self.get_works_template()) - self.loop_condition = loop_condition - return self.loop_condition.get_condition_status() - return False - - def __str__(self): - return str(json_dumps(self)) - - def get_new_works(self): - """ - *** Function called by Marshaller agent. - - new works to be ready to start - """ - self.sync_works() - return [self.works[k] for k in self.new_to_run_works] - - def get_current_works(self): - """ - *** Function called by Marshaller agent. - - Current running works - """ - self.sync_works() - return [self.works[k] for k in self.current_running_works] - - def get_all_works(self): - """ - *** Function called by Marshaller agent. - - Current running works - """ - self.sync_works() - return [self.works[k] for k in self.works] - - def get_primary_initial_collection(self): - """ - *** Function called by Clerk agent. 
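get_loop_condition_status, shown earlier in this block, rebinds the loop condition to the live works and reduces the results of the bound condition methods with the condition's operator. A self-contained sketch of that And/Or evaluation, using plain stand-ins instead of the iDDS CompositeCondition classes:

class DemoWork:
    # Illustrative stand-in for a Work exposing a boolean condition method.
    def __init__(self, finished):
        self._finished = finished

    def is_finished(self):
        return self._finished


def and_status(conditions):
    # ConditionOperator.And: every bound method (or value) must be true.
    return all(c() if callable(c) else bool(c) for c in conditions)


def or_status(conditions):
    # ConditionOperator.Or: one true condition is enough.
    return any(c() if callable(c) else bool(c) for c in conditions)


w1, w2 = DemoWork(True), DemoWork(False)
print(and_status([w1.is_finished, w2.is_finished]))    # False
print(or_status([w1.is_finished, w2.is_finished]))     # True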
- """ - - if self.primary_initial_work: - return self.get_works_template()[self.primary_initial_work].get_primary_input_collection() - elif self.initial_works: - return self.get_works_template()[self.initial_works[0]].get_primary_input_collection() - elif self.independent_works: - return self.get_works_template()[self.independent_works[0]].get_primary_input_collection() - else: - keys = self.get_works_template().keys() - return self.get_works_template()[keys[0]].get_primary_input_collection() - return None - - def get_dependency_works(self, work_id, depth, max_depth): - if depth > max_depth: - return [] - - deps = [] - for dep_work_id in self.work_dependencies[work_id]: - deps.append(dep_work_id) - l_deps = self.get_dependency_works(dep_work_id, depth + 1, max_depth) - deps += l_deps - deps = list(dict.fromkeys(deps)) - return deps - - def order_independent_works(self): - ind_work_ids = self.independent_works - self.independent_works = [] - self.work_dependencies = {} - for ind_work_id in ind_work_ids: - work = self.works_template[ind_work_id] - self.work_dependencies[ind_work_id] = [] - for ind_work_id1 in ind_work_ids: - if ind_work_id == ind_work_id1: - continue - work1 = self.works_template[ind_work_id1] - if work.depend_on(work1): - self.work_dependencies[ind_work_id].append(ind_work_id1) - self.log_debug('work dependencies 1: %s' % str(self.work_dependencies)) - - max_depth = len(ind_work_ids) + 1 - work_dependencies = copy.deepcopy(self.work_dependencies) - for work_id in work_dependencies: - deps = self.get_dependency_works(work_id, 0, max_depth) - self.work_dependencies[work_id] = deps - self.log_debug('work dependencies 2: %s' % str(self.work_dependencies)) - - while True: - for work_id in self.work_dependencies: - if work_id not in self.independent_works and len(self.work_dependencies[work_id]) == 0: - self.independent_works.append(work_id) - for work_id in self.independent_works: - if work_id in self.work_dependencies: - del self.work_dependencies[work_id] - for work_id in self.work_dependencies: - for in_work_id in self.independent_works: - if in_work_id in self.work_dependencies[work_id]: - self.work_dependencies[work_id].remove(in_work_id) - if not self.work_dependencies: - break - self.log_debug('independent_works: %s' % str(self.independent_works)) - - def first_initialize(self): - # set new_to_run works - if not self.first_initial: - self.first_initial = True - self.order_independent_works() - if self.initial_works: - tostart_works = self.initial_works - elif self.independent_works: - tostart_works = self.independent_works - else: - tostart_works = list(self.get_works_template().keys()) - tostart_works = [tostart_works[0]] - - for work_id in tostart_works: - self.get_new_work_from_template(work_id) - - def sync_works(self): - self.first_initialize() - - self.refresh_works() - - for k in self.works: - work = self.works[k] - self.log_debug("work %s is_terminated(%s:%s)" % (work.get_internal_id(), work.is_terminated(), work.get_status())) - - for work in [self.works[k] for k in self.new_to_run_works]: - if work.transforming: - self.new_to_run_works.remove(work.get_internal_id()) - self.current_running_works.append(work.get_internal_id()) - - for work in [self.works[k] for k in self.current_running_works]: - if isinstance(work, Workflow): - work.sync_works() - if work.get_internal_id() in self.work_conds: - self.log_debug("Work %s has condition dependencies %s" % (work.get_internal_id(), - json_dumps(self.work_conds[work.get_internal_id()], sort_keys=True, indent=4))) - for 
cond_id in self.work_conds[work.get_internal_id()]: - cond = self.conditions[cond_id] - self.log_debug("Work %s has condition dependencie %s" % (work.get_internal_id(), - json_dumps(cond, sort_keys=True, indent=4))) - self.enable_next_works(work, cond) - - if work.is_terminated(): - self.log_info("Work %s is terminated(%s)" % (work.get_internal_id(), work.get_status())) - self.log_debug("Work conditions: %s" % json_dumps(self.work_conds, sort_keys=True, indent=4)) - if work.get_template_id() not in self.work_conds: - # has no next work - self.log_info("Work %s has no condition dependencies" % work.get_internal_id()) - self.terminated_works.append(work.get_internal_id()) - self.current_running_works.remove(work.get_internal_id()) - else: - # self.log_debug("Work %s has condition dependencies %s" % (work.get_internal_id(), - # json_dumps(self.work_conds[work.get_template_id()], sort_keys=True, indent=4))) - # for cond in self.work_conds[work.get_template_id()]: - # self.enable_next_works(work, cond) - self.terminated_works.append(work.get_internal_id()) - self.current_running_works.remove(work.get_internal_id()) - - if work.is_finished(): - self.num_finished_works += 1 - elif work.is_subfinished(): - self.num_subfinished_works += 1 - elif work.is_failed(): - self.num_failed_works += 1 - elif work.is_expired(): - self.num_expired_works += 1 - elif work.is_cancelled(): - self.num_cancelled_works += 1 - elif work.is_suspended(): - self.num_suspended_works += 1 - log_str = "num_total_works: %s" % self.num_total_works - log_str += ", num_finished_works: %s" % self.num_finished_works - log_str += ", num_subfinished_works: %s" % self.num_subfinished_works - log_str += ", num_failed_works: %s" % self.num_failed_works - log_str += ", num_expired_works: %s" % self.num_expired_works - log_str += ", num_cancelled_works: %s" % self.num_cancelled_works - log_str += ", num_suspended_works: %s" % self.num_suspended_works - self.log_debug(log_str) - - def resume_works(self): - self.num_subfinished_works = 0 - self.num_finished_works = 0 - self.num_failed_works = 0 - self.num_cancelled_works = 0 - self.num_suspended_works = 0 - self.num_expired_works = 0 - - self.last_updated_at = datetime.datetime.utcnow() - - t_works = self.terminated_works - self.terminated_works = [] - self.current_running_works = self.current_running_works + t_works - for work in [self.works[k] for k in self.current_running_works]: - if isinstance(work, Workflow): - work.resume_works() - else: - work.resume_work() - - def clean_works(self): - self.num_subfinished_works = 0 - self.num_finished_works = 0 - self.num_failed_works = 0 - self.num_cancelled_works = 0 - self.num_suspended_works = 0 - self.num_expired_works = 0 - self.num_total_works = 0 - - self.last_updated_at = datetime.datetime.utcnow() - - self.terminated_works = [] - self.current_running_works = [] - self.works = {} - self.work_sequence = {} # order list - - self.first_initial = False - self.new_to_run_works = [] - - def get_exact_workflows(self): - """ - *** Function called by Clerk agent. - - TODO: The primary dataset for the initial work is a dataset with '*'. - workflow.primary_initial_collection = 'some datasets with *' - collections = get_collection(workflow.primary_initial_collection) - wfs = [] - for coll in collections: - wf = self.copy() - wf.name = self.name + "_" + number - wf.primary_initial_collection = coll - wfs.append(wf) - return wfs - """ - return [self] - - def is_terminated(self): - """ - *** Function called by Marshaller agent. 
- """ - self.sync_works() - if len(self.new_to_run_works) == 0 and len(self.current_running_works) == 0: - return True - return False - - def is_finished(self): - """ - *** Function called by Marshaller agent. - """ - return self.is_terminated() and self.num_finished_works == self.num_total_works - - def is_subfinished(self): - """ - *** Function called by Marshaller agent. - """ - return self.is_terminated() and (self.num_finished_works + self.num_subfinished_works > 0 and self.num_finished_works + self.num_subfinished_works <= self.num_total_works) - - def is_failed(self): - """ - *** Function called by Marshaller agent. - """ - return self.is_terminated() and (self.num_failed_works > 0) and (self.num_cancelled_works == 0) and (self.num_suspended_works == 0) and (self.num_expired_works == 0) - - def is_to_expire(self, expired_at=None, pending_time=None, request_id=None): - if self.expired: - # it's already expired. avoid sending duplicated messages again and again. - return False - if expired_at: - if type(expired_at) in [str]: - expired_at = str_to_date(expired_at) - if expired_at < datetime.datetime.utcnow(): - self.logger.info("Request(%s) expired_at(%s) is smaller than utc now(%s), expiring" % (request_id, - expired_at, - datetime.datetime.utcnow())) - return True - - act_pending_time = None - if self.pending_time: - # in days - act_pending_time = float(self.pending_time) - else: - if pending_time: - act_pending_time = float(pending_time) - if act_pending_time: - act_pending_seconds = int(86400 * act_pending_time) - if self.last_updated_at + datetime.timedelta(seconds=act_pending_seconds) < datetime.datetime.utcnow(): - log_str = "Request(%s) last updated at(%s) + pending seconds(%s)" % (request_id, - self.last_updated_at, - act_pending_seconds) - log_str += " is smaller than utc now(%s), expiring" % (datetime.datetime.utcnow()) - self.logger.info(log_str) - return True - - return False - - def is_expired(self): - """ - *** Function called by Marshaller agent. - """ - # return self.is_terminated() and (self.num_expired_works > 0) - return self.is_terminated() and self.expired - - def is_cancelled(self): - """ - *** Function called by Marshaller agent. - """ - return self.is_terminated() and (self.num_cancelled_works > 0) - - def is_suspended(self): - """ - *** Function called by Marshaller agent. - """ - return self.is_terminated() and (self.num_suspended_works > 0) - - def get_terminated_msg(self): - """ - *** Function called by Marshaller agent. 
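is_to_expire, just above, treats pending_time as a number of days, converts it to seconds with int(86400 * pending_time), and expires the request once last_updated_at plus that window has passed. A small worked example of the same arithmetic (the dates are invented for illustration):

import datetime

def is_to_expire(last_updated_at, pending_time_days, now=None):
    # Same arithmetic as the deleted Workflow.is_to_expire: days -> seconds.
    now = now or datetime.datetime.utcnow()
    pending_seconds = int(86400 * float(pending_time_days))
    return last_updated_at + datetime.timedelta(seconds=pending_seconds) < now


now = datetime.datetime(2021, 6, 10, 12, 0, 0)
last = datetime.datetime(2021, 6, 8, 12, 0, 0)    # last update 2 days ago
print(is_to_expire(last, 1.5, now))               # True: 1.5-day window elapsed
print(is_to_expire(last, 3, now))                 # False: window still open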
- """ - if self.last_work: - return self.works[self.last_work].get_terminated_msg() - return None - - def add_proxy(self): - self.proxy = get_proxy() - if not self.proxy: - raise Exception("Cannot get local proxy") - - def get_proxy(self): - return self.proxy - - -class LoopWorkflow(Base): - def __init__(self, name=None, workload_id=None, lifetime=None, pending_time=None, logger=None): - # super(Workflow, self).__init__(name=name, workload_id=workload_id, lifetime=lifetime, pending_time=pending_time, logger=logger) - self.logger = logger - if self.logger is None: - self.setup_logger() - - self.template = Workflow(name=name, workload_id=workload_id, lifetime=lifetime, pending_time=pending_time, logger=logger) - self.num_run = 0 - self.runs = {} - self.loop_condition_position = 'end' - - def setup_logger(self): - """ - Setup logger - """ - self.logger = logging.getLogger(self.get_class_name()) - - @property - def metadata(self): - run_metadata = {'num_run': self.num_run, - 'runs': {}} - for run_id in self.runs: - run_metadata['runs'][run_id] = self.runs[run_id].metadata - return run_metadata - - @metadata.setter - def metadata(self, value): - run_metadata = value - self.num_run = run_metadata['num_run'] - runs = run_metadata['runs'] - for run_id in runs: - self.runs[run_id] = self.template.copy() - self.runs[run_id].metadata = runs[run_id] - - def set_workload_id(self, workload_id): - self.template.workload_id = workload_id - # self.dynamic.workload_id = workload_id - - def get_workload_id(self): - return self.template.workload_id - - def add_work(self, work, initial=False, primary=False): - self.template.add_work(work, initial, primary) - - def add_condition(self, cond): - self.template.add_condition(cond) - - def get_new_works(self): - self.sync_works() - self.runs[str(self.num_run)].get_new_works() - - def get_current_works(self): - self.sync_works() - self.runs[str(self.num_run)].get_current_works() - - def get_all_works(self): - self.sync_works() - self.runs[str(self.num_run)].get_all_works() - - def get_primary_initial_collection(self): - self.runs[str(self.num_run)].get_primary_initial_collection() - - def resume_works(self): - self.runs[str(self.num_run)].resume_works() - - def clean_works(self): - self.runs[str(self.num_run)].clean_works() - - def is_terminated(self): - self.runs[str(self.num_run)].is_terminated() - - def is_finished(self): - self.runs[str(self.num_run)].is_finished() - - def is_failed(self): - self.runs[str(self.num_run)].is_failed() - - def is_expired(self): - self.runs[str(self.num_run)].is_expired() - - def is_cancelled(self): - self.runs[str(self.num_run)].is_cancelled() - - def is_suspended(self): - self.runs[str(self.num_run)].is_suspended() - - def get_terminated_msg(self): - self.runs[str(self.num_run)].get_terminated_msg() - - def add_proxy(self): - self.template.add_proxy() - - def get_proxy(self): - self.template.get_proxy() - - def add_loop_condition(self, condition, position='end'): - if not position or position != 'begin': - position = 'end' - position = 'end' # force position to end currently. position = 'begin' is not supported now. - self.template.add_loop_condition(condition, position=position) - self.loop_condition_position = position - - def sync_works(self): - # position is end. 
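The sync_works that follows, like the new run-based sync_works earlier in this patch, advances the loop by bumping num_run and seeding runs[str(num_run)] with a fresh copy of the template once the current run has terminated with a true loop condition. A compact standalone simulation of that advance, with an illustrative Run stand-in in place of the real workflow:

import copy

class Run:
    # Illustrative stand-in for one run of the loop.
    def __init__(self, iterations_left):
        self.iterations_left = iterations_left

    def is_terminated(self):
        return True                          # pretend each run finishes immediately

    def loop_condition(self):
        return self.iterations_left > 0      # keep looping while work remains


template = Run(iterations_left=2)
num_run, runs = 1, {'1': copy.deepcopy(template)}

while True:
    run = runs[str(num_run)]
    if run.is_terminated() and run.loop_condition():
        run.iterations_left -= 1             # stand-in for "work done this run"
        num_run += 1
        next_run = copy.deepcopy(template)   # new run copied from the template
        next_run.iterations_left = run.iterations_left
        runs[str(num_run)] = next_run
    else:
        break

print(num_run, sorted(runs))                 # 3 ['1', '2', '3']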
- if str(self.num_run) not in self.runs: - self.runs[str(self.num_run)] = self.template.copy() - - self.runs[str(self.num_run)].sync_works() - if self.runs[str(self.num_run)].is_terminated(): - if self.runs[str(self.num_run)].has_loop_condition(): - if self.runs[str(self.num_run)].get_loop_condition_status(): - self.num_run += 1 - self.runs[str(self.num_run)] = self.template.copy() - - -class SubWorkflow(LoopWorkflow): - def __init__(self, name=None, workload_id=None, lifetime=None, pending_time=None, logger=None): - """ - Init a workflow. - """ - super(SubWorkflow, self).__init__(name=name, workload_id=workload_id, lifetime=lifetime, pending_time=pending_time, logger=logger) - - -class CompositeWorkflow(LoopWorkflow): - def __init__(self, name=None, workload_id=None, lifetime=None, pending_time=None, logger=None): - """ - Init a workflow. - """ - super(CompositeWorkflow, self).__init__(name=name, workload_id=workload_id, lifetime=lifetime, pending_time=pending_time, logger=logger) diff --git a/workflow/lib/idds/workflow/workflowv2.py b/workflow/lib/idds/workflow/workflowv2.py deleted file mode 100644 index e2a75615..00000000 --- a/workflow/lib/idds/workflow/workflowv2.py +++ /dev/null @@ -1,1713 +0,0 @@ -#!/usr/bin/env python -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0OA -# -# Authors: -# - Wen Guan, , 2020 - 2021 - -import copy -import datetime -import logging -import inspect -import random -import time -import uuid - - -from idds.common import exceptions -from idds.common.constants import IDDSEnum, WorkStatus -from idds.common.utils import json_dumps, setup_logging, get_proxy -from idds.common.utils import str_to_date -from .base import Base -from .work import Work - - -setup_logging(__name__) - - -class ConditionOperator(IDDSEnum): - And = 0 - Or = 1 - - -class ConditionTrigger(IDDSEnum): - NotTriggered = 0 - ToTrigger = 1 - Triggered = 2 - - -class CompositeCondition(Base): - def __init__(self, operator=ConditionOperator.And, conditions=[], true_works=None, false_works=None, logger=None): - self._conditions = [] - self._true_works = [] - self._false_works = [] - - super(CompositeCondition, self).__init__() - - self.internal_id = str(uuid.uuid4())[:8] - self.template_id = self.internal_id - # self.template_id = str(uuid.uuid4())[:8] - - self.logger = logger - if self.logger is None: - self.setup_logger() - - if conditions is None: - conditions = [] - if true_works is None: - true_works = [] - if false_works is None: - false_works = [] - if conditions and type(conditions) not in [tuple, list]: - conditions = [conditions] - if true_works and type(true_works) not in [tuple, list]: - true_works = [true_works] - if false_works and type(false_works) not in [tuple, list]: - false_works = [false_works] - self.validate_conditions(conditions) - - self.operator = operator - self.conditions = [] - self.true_works = [] - self.false_works = [] - - self.conditions = conditions - self.true_works = true_works - self.false_works = false_works - - def get_class_name(self): - return self.__class__.__name__ - - def get_internal_id(self): - return self.internal_id - - def get_template_id(self): - return self.template_id - - def copy(self): - new_cond = copy.deepcopy(self) - return new_cond - - def __deepcopy__(self, memo): - logger = self.logger - self.logger = None - - cls = self.__class__ - result = cls.__new__(cls) - - 
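The __deepcopy__ surrounding this point, which also appears on Workflow in the other deleted file, detaches the non-copyable logger, deep-copies every remaining attribute, and then reattaches the same logger to both the original and the copy. A minimal standalone illustration of the pattern:

import copy
import logging

class LoggerSafe:
    def __init__(self, name):
        self.name = name
        self.logger = logging.getLogger(self.__class__.__name__)

    def __deepcopy__(self, memo):
        logger = self.logger
        self.logger = None                   # keep the logger out of the copy
        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            setattr(result, k, copy.deepcopy(v, memo))
        self.logger = logger                 # restore; copy shares the same logger
        result.logger = logger
        return result


a = LoggerSafe('run1')
b = copy.deepcopy(a)
print(b.name, b.logger is a.logger)          # run1 True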
memo[id(self)] = result - - # Deep copy all other attributes - for k, v in self.__dict__.items(): - setattr(result, k, copy.deepcopy(v, memo)) - - self.logger = logger - result.logger = logger - return result - - @property - def conditions(self): - # return self.get_metadata_item('true_works', []) - return self._conditions - - @conditions.setter - def conditions(self, value): - self._conditions = value - - @property - def true_works(self): - # return self.get_metadata_item('true_works', []) - return self._true_works - - @true_works.setter - def true_works(self, value): - self._true_works = value - true_work_meta = self.get_metadata_item('true_works', {}) - for work in value: - if work is None: - continue - if isinstance(work, Work): - if work.get_internal_id() not in true_work_meta: - true_work_meta[work.get_internal_id()] = {'triggered': False} - elif isinstance(work, CompositeCondition): - if work.get_internal_id() not in true_work_meta: - true_work_meta[work.get_internal_id()] = {'triggered': False} - elif isinstance(work, Workflow): - if work.get_internal_id() not in true_work_meta: - true_work_meta[work.get_internal_id()] = {'triggered': False} - self.add_metadata_item('true_works', true_work_meta) - - @property - def false_works(self): - # return self.get_metadata_item('false_works', []) - return self._false_works - - @false_works.setter - def false_works(self, value): - self._false_works = value - false_work_meta = self.get_metadata_item('false_works', {}) - for work in value: - if work is None: - continue - if isinstance(work, Work): - if work.get_internal_id() not in false_work_meta: - false_work_meta[work.get_internal_id()] = {'triggered': False} - elif isinstance(work, CompositeCondition): - if work.get_internal_id() not in false_work_meta: - false_work_meta[work.get_internal_id()] = {'triggered': False} - elif isinstance(work, Workflow): - if work.get_internal_id() not in false_work_meta: - false_work_meta[work.get_internal_id()] = {'triggered': False} - self.add_metadata_item('false_works', false_work_meta) - - def validate_conditions(self, conditions): - if type(conditions) not in [tuple, list]: - raise exceptions.IDDSException("conditions must be list") - for cond in conditions: - assert(inspect.ismethod(cond)) - - def add_condition(self, cond): - assert(inspect.ismethod(cond)) - assert(isinstance(cond.__self__, Work)) - - # self.conditions.append({'condition': cond, 'current_work': cond.__self__}) - - self._conditions.append(cond) - - def load_metadata(self): - # conditions = self.get_metadata_item('conditions', []) - # true_works_meta = self.get_metadata_item('true_works', {}) - # false_works_meta = self.get_metadata_item('false_works', {}) - pass - - def to_dict(self): - # print('to_dict') - ret = {'class': self.__class__.__name__, - 'module': self.__class__.__module__, - 'attributes': {}} - for key, value in self.__dict__.items(): - # print(key) - # print(value) - # if not key.startswith('__') and not key.startswith('_'): - if not key.startswith('__'): - if key == 'logger': - value = None - elif key == '_conditions': - new_value = [] - for cond in value: - if inspect.ismethod(cond): - new_cond = {'idds_method': cond.__name__, - 'idds_method_internal_id': cond.__self__.get_internal_id()} - else: - new_cond = cond - new_value.append(new_cond) - value = new_value - elif key in ['_true_works', '_false_works']: - new_value = [] - for w in value: - if isinstance(w, Work): - new_w = w.get_internal_id() - elif isinstance(w, CompositeCondition): - new_w = w.to_dict() - elif 
isinstance(w, Workflow): - new_w = w.to_dict() - else: - new_w = w - new_value.append(new_w) - value = new_value - else: - value = self.to_dict_l(value) - ret['attributes'][key] = value - return ret - - def get_work_from_id(self, work_id, works): - return works[work_id] - - def load_conditions(self, works): - new_conditions = [] - for cond in self.conditions: - if callable(cond): - new_conditions.append(cond) - else: - if 'idds_method' in cond and 'idds_method_internal_id' in cond: - internal_id = cond['idds_method_internal_id'] - work = self.get_work_from_id(internal_id, works) - if work is not None: - new_cond = getattr(work, cond['idds_method']) - else: - self.logger.error("Work cannot be found for %s" % (internal_id)) - new_cond = cond - else: - new_cond = cond - new_conditions.append(new_cond) - self.conditions = new_conditions - - new_true_works = [] - for w in self.true_works: - if isinstance(w, CompositeCondition) or isinstance(w, Workflow): - # work = w.load_conditions(works, works_template) - w.load_conditions(works) - work = w - elif type(w) in [str]: - work = self.get_work_from_id(w, works) - if work is None: - self.logger.error("Work cannot be found for %s" % str(w)) - work = w - else: - self.logger.error("Work cannot be found for %s" % str(w)) - work = w - new_true_works.append(work) - self.true_works = new_true_works - - new_false_works = [] - for w in self.false_works: - if isinstance(w, CompositeCondition) or isinstance(w, Workflow): - # work = w.load_condtions(works, works_template) - w.load_conditions(works) - work = w - elif type(w) in [str]: - work = self.get_work_from_id(w, works) - if work is None: - self.logger.error("Work cannot be found for %s" % str(w)) - work = w - else: - self.logger.error("Work cannot be found for %s" % str(w)) - work = w - new_false_works.append(work) - self.false_works = new_false_works - - def all_works(self): - works = [] - works = works + self.all_pre_works() - works = works + self.all_next_works() - return works - - def all_condition_ids(self): - works = [] - for cond in self.conditions: - if inspect.ismethod(cond): - works.append(cond.__self__.get_internal_id()) - else: - self.logger.error("cond cannot be recognized: %s" % str(cond)) - works.append(cond) - for work in self.true_works + self.false_works: - if isinstance(work, CompositeCondition): - works = works + work.all_condition_ids() - return works - - def all_pre_works(self): - works = [] - for cond in self.conditions: - if inspect.ismethod(cond): - works.append(cond.__self__) - else: - self.logger.error("cond cannot be recognized: %s" % str(cond)) - works.append(cond) - for work in self.true_works + self.false_works: - if isinstance(work, CompositeCondition): - works = works + work.all_pre_works() - return works - - def all_next_works(self): - works = [] - for work in self.true_works + self.false_works: - if isinstance(work, CompositeCondition): - works = works + work.all_next_works() - else: - works.append(work) - return works - - def get_current_cond_status(self, cond): - if callable(cond): - if cond(): - return True - else: - return False - else: - if cond: - return True - else: - return False - - def get_cond_status(self): - if self.operator == ConditionOperator.And: - for cond in self.conditions: - if not self.get_current_cond_status(cond): - return False - return True - else: - for cond in self.conditions: - if self.get_current_cond_status(cond): - return True - return False - - def get_condition_status(self): - return self.get_cond_status() - - def is_condition_true(self): 
- if self.get_cond_status(): - return True - return False - - def is_condition_false(self): - if not self.get_cond_status(): - return True - return False - - def get_next_works(self, trigger=ConditionTrigger.NotTriggered): - works = [] - if self.get_cond_status(): - true_work_meta = self.get_metadata_item('true_works', {}) - for work in self.true_works: - if isinstance(work, CompositeCondition): - works = works + work.get_next_works(trigger=trigger) - else: - if work.get_internal_id() not in true_work_meta: - true_work_meta[work.get_internal_id()] = {'triggered': False} - if trigger == ConditionTrigger.ToTrigger: - if not true_work_meta[work.get_internal_id()]['triggered']: - true_work_meta[work.get_internal_id()]['triggered'] = True - works.append(work) - elif trigger == ConditionTrigger.NotTriggered: - if not true_work_meta[work.get_internal_id()]['triggered']: - works.append(work) - elif trigger == ConditionTrigger.Triggered: - if true_work_meta[work.get_internal_id()]['triggered']: - works.append(work) - self.add_metadata_item('true_works', true_work_meta) - else: - false_work_meta = self.get_metadata_item('false_works', {}) - for work in self.false_works: - if isinstance(work, CompositeCondition): - works = works + work.get_next_works(trigger=trigger) - else: - if work.get_internal_id() not in false_work_meta: - false_work_meta[work.get_internal_id()] = {'triggered': False} - if trigger == ConditionTrigger.ToTrigger: - if not false_work_meta[work.get_internal_id()]['triggered']: - false_work_meta[work.get_internal_id()]['triggered'] = True - works.append(work) - elif trigger == ConditionTrigger.NotTriggered: - if not false_work_meta[work.get_internal_id()]['triggered']: - works.append(work) - elif trigger == ConditionTrigger.Triggered: - if false_work_meta[work.get_internal_id()]['triggered']: - works.append(work) - self.add_metadata_item('false_works', false_work_meta) - return works - - -class AndCondition(CompositeCondition): - def __init__(self, conditions=[], true_works=None, false_works=None, logger=None): - super(AndCondition, self).__init__(operator=ConditionOperator.And, - conditions=conditions, - true_works=true_works, - false_works=false_works, - logger=logger) - - -class OrCondition(CompositeCondition): - def __init__(self, conditions=[], true_works=None, false_works=None, logger=None): - super(OrCondition, self).__init__(operator=ConditionOperator.Or, - conditions=conditions, - true_works=true_works, - false_works=false_works, - logger=logger) - - -class Condition(CompositeCondition): - def __init__(self, cond=None, current_work=None, true_work=None, false_work=None, logger=None): - super(Condition, self).__init__(operator=ConditionOperator.And, - conditions=[cond] if cond else [], - true_works=[true_work] if true_work else [], - false_works=[false_work] if false_work else [], - logger=logger) - - # to support load from old conditions - @property - def cond(self): - # return self.get_metadata_item('true_works', []) - return self.conditions[0] if len(self.conditions) >= 1 else None - - @cond.setter - def cond(self, value): - self.conditions = [value] - - @property - def true_work(self): - # return self.get_metadata_item('true_works', []) - return self.true_works if len(self.true_works) >= 1 else None - - @true_work.setter - def true_work(self, value): - self.true_works = [value] - - @property - def false_work(self): - # return self.get_metadata_item('true_works', []) - return self.false_works if len(self.false_works) >= 1 else None - - @false_work.setter - def 
false_work(self, value): - self.false_works = [value] - - -class TemplateCondition(CompositeCondition): - def __init__(self, cond=None, current_work=None, true_work=None, false_work=None, logger=None): - if true_work is not None and not isinstance(true_work, Work): - raise exceptions.IDDSException("true_work can only be set with Work class") - if false_work is not None and not isinstance(false_work, Work): - raise exceptions.IDDSException("false_work can only be set with Work class") - - super(TemplateCondition, self).__init__(operator=ConditionOperator.And, - conditions=[cond] if cond else [], - true_works=[true_work] if true_work else [], - false_works=[false_work] if false_work else [], - logger=logger) - - def validate_conditions(self, conditions): - if type(conditions) not in [tuple, list]: - raise exceptions.IDDSException("conditions must be list") - if len(conditions) > 1: - raise exceptions.IDDSException("Condition class can only support one condition. To support multiple condition, please use CompositeCondition.") - for cond in conditions: - assert(inspect.ismethod(cond)) - assert(isinstance(cond.__self__, Work)) - - def add_condition(self, cond): - raise exceptions.IDDSException("Condition class doesn't support add_condition. To support multiple condition, please use CompositeCondition.") - - -class ParameterLink(Base): - def __init__(self, parameters): - self.parameters = parameters - self.internal_id = str(uuid.uuid4())[:8] - self.template_id = self.internal_id - - def get_internal_id(self): - return self.internal_id - - def get_parameter_value(self, work, p): - p_f = getattr(work, p, 'None') - if p_f: - if callable(p_f): - return p_f() - else: - return p_f - else: - return None - - def set_parameters(self, work): - p_values = {} - for p in self.parameters: - p_values[p] = self.get_parameter_value(work, p) - self.add_metadata_item('parameters', p_values) - - def get_parameters(self): - return self.get_metadata_item('parameters', {}) - - -class WorkflowBase(Base): - - def __init__(self, name=None, workload_id=None, lifetime=None, pending_time=None, logger=None): - """ - Init a workflow. - """ - self._works = {} - self._conditions = {} - self._work_conds = {} - - self.parameter_links = {} - self.parameter_links_source = {} - self.parameter_links_destination = {} - - super(WorkflowBase, self).__init__() - - self.internal_id = str(uuid.uuid4())[:8] - self.template_work_id = self.internal_id - # self.template_work_id = str(uuid.uuid4())[:8] - self.lifetime = lifetime - self.pending_time = pending_time - - if name: - self._name = name + "." + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S_%f") + str(random.randint(1, 1000)) - else: - self._name = 'idds.workflow.' + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S_%f") + str(random.randint(1, 1000)) - - if workload_id is None: - workload_id = int(time.time()) - self.workload_id = workload_id - - self.logger = logger - if self.logger is None: - self.setup_logger() - - self._works = {} - self.works = {} - self.work_sequence = {} # order list - - self.terminated_works = [] - self.initial_works = [] - # if the primary initial_work is not set, it's the first initial work. 
- self.primary_initial_work = None - self.independent_works = [] - - self.first_initial = False - self.new_to_run_works = [] - self.current_running_works = [] - - self.num_subfinished_works = 0 - self.num_finished_works = 0 - self.num_failed_works = 0 - self.num_cancelled_works = 0 - self.num_suspended_works = 0 - self.num_expired_works = 0 - self.num_total_works = 0 - - self.last_work = None - - self.last_updated_at = datetime.datetime.utcnow() - self.expired = False - - self.to_update_transforms = {} - - # user defined Condition class - self.user_defined_conditions = {} - - self.username = None - self.userdn = None - self.proxy = None - - self._loop_condition_position = 'end' - self.loop_condition = None - - """ - self._running_data_names = [] - for name in ['internal_id', 'template_work_id', 'workload_id', 'work_sequence', 'terminated_works', - 'first_initial', 'new_to_run_works', 'current_running_works', - 'num_subfinished_works', 'num_finished_works', 'num_failed_works', 'num_cancelled_works', 'num_suspended_works', - 'num_expired_works', 'num_total_works', 'last_work']: - self._running_data_names.append(name) - for name in ['works']: - self._running_data_names.append(name) - """ - - @property - def name(self): - return self._name - - @name.setter - def name(self, value): - self._name = value - - def get_template_work_id(self): - return self.template_work_id - - def get_template_id(self): - return self.template_work_id - - @property - def workload_id(self): - return self.get_metadata_item('workload_id') - - @workload_id.setter - def workload_id(self, value): - self.add_metadata_item('workload_id', value) - - @property - def lifetime(self): - # return self.get_metadata_item('lifetime', None) - return getattr(self, '_lifetime', None) - - @lifetime.setter - def lifetime(self, value): - # self.add_metadata_item('lifetime', value) - self._lifetime = value - - @property - def pending_time(self): - # return self.get_metadata_item('pending_time', None) - return getattr(self, '_pending_time', None) - - @pending_time.setter - def pending_time(self, value): - # self.add_metadata_item('pending_time', value) - self._pending_time = value - - @property - def last_updated_at(self): - last_updated_at = self.get_metadata_item('last_updated_at', None) - if last_updated_at and type(last_updated_at) in [str]: - last_updated_at = str_to_date(last_updated_at) - return last_updated_at - - @last_updated_at.setter - def last_updated_at(self, value): - self.add_metadata_item('last_updated_at', value) - - def has_new_updates(self): - self.last_updated_at = datetime.datetime.utcnow() - - @property - def expired(self): - t = self.get_metadata_item('expired', False) - if type(t) in [bool]: - return t - elif type(t) in [str] and t.lower() in ['true']: - return True - else: - return False - - @expired.setter - def expired(self, value): - self.add_metadata_item('expired', value) - - @property - def works(self): - return self._works - - @works.setter - def works(self, value): - self._works = value - work_metadata = {} - if self._works: - for k in self._works: - work = self._works[k] - if isinstance(work, Workflow): - work_metadata[k] = {'type': 'workflow', - 'metadata': work.metadata} - else: - work_metadata[k] = {'type': 'work', - 'work_id': work.work_id, - 'workload_id': work.workload_id, - 'status': work.status, - 'substatus': work.substatus, - 'transforming': work.transforming} - self.add_metadata_item('works', work_metadata) - - def refresh_works(self): - work_metadata = {} - if self._works: - for k in 
self._works: - work = self._works[k] - if isinstance(work, Workflow): - work.refresh_works() - work_metadata[k] = {'type': 'workflow', - 'metadata': work.metadata} - else: - work_metadata[k] = {'type': 'work', - 'work_id': work.work_id, - 'workload_id': work.workload_id, - 'status': work.status, - 'substatus': work.substatus, - 'transforming': work.transforming} - if work.last_updated_at and (not self.last_updated_at or work.last_updated_at > self.last_updated_at): - self.last_updated_at = work.last_updated_at - self.add_metadata_item('works', work_metadata) - - def load_works(self): - work_metadata = self.get_metadata_item('works', {}) - for k in self._works: - if k in work_metadata: - if work_metadata[k]['type'] == 'work': - self._works[k].work_id = work_metadata[k]['work_id'] - self._works[k].workload_id = work_metadata[k]['workload_id'] - self._works[k].transforming = work_metadata[k]['transforming'] - self._works[k].status = work_metadata[k]['status'] - self._works[k].substatus = work_metadata[k]['substatus'] - elif work_metadata[k]['type'] == 'workflow': - self._works[k].metadata = work_metadata[k]['metadata'] - - work = self._works[k] - if work.last_updated_at and (not self.last_updated_at or work.last_updated_at > self.last_updated_at): - self.last_updated_at = work.last_updated_at - - @property - def conditions(self): - return self._conditions - - @conditions.setter - def conditions(self, value): - self._conditions = value - conditions_metadata = {} - if self._conditions: - for k in self._conditions: - conditions_metadata[k] = self._conditions[k].metadata - self.add_metadata_item('conditions', conditions_metadata) - - @property - def work_conds(self): - return self._work_conds - - @work_conds.setter - def work_conds(self, value): - self._work_conds = value - # self.add_metadata_item('work_conds', value) - - def load_work_conditions(self): - conditions_metadata = self.get_metadata_item('conditions', {}) - for cond_internal_id in self._conditions: - if cond_internal_id in conditions_metadata: - self.conditions[cond_internal_id].metadata = conditions_metadata[cond_internal_id] - self.conditions[cond_internal_id].load_conditions(self.works) - - # work_conds = self.get_metadata_item('work_conds', {}) - # self._work_conds = work_conds - - @property - def loop_condition(self): - return self._loop_condition - - @loop_condition.setter - def loop_condition(self, value): - # self._loop_condition_position = position - self._loop_condition = value - if self._loop_condition: - self.add_metadata_item('loop_condition', self._loop_condition.get_condition_status()) - - @property - def work_sequence(self): - return self.get_metadata_item('work_sequence', {}) - - @work_sequence.setter - def work_sequence(self, value): - self.add_metadata_item('work_sequence', value) - - @property - def terminated_works(self): - return self.get_metadata_item('terminated_works', []) - - @terminated_works.setter - def terminated_works(self, value): - self.add_metadata_item('terminated_works', value) - - @property - def first_initial(self): - return self.get_metadata_item('first_initial', False) - - @first_initial.setter - def first_initial(self, value): - self.add_metadata_item('first_initial', value) - - @property - def new_to_run_works(self): - return self.get_metadata_item('new_to_run_works', []) - - @new_to_run_works.setter - def new_to_run_works(self, value): - self.add_metadata_item('new_to_run_works', value) - - @property - def current_running_works(self): - return self.get_metadata_item('current_running_works', 
[]) - - @current_running_works.setter - def current_running_works(self, value): - self.add_metadata_item('current_running_works', value) - - @property - def num_subfinished_works(self): - return self.get_metadata_item('num_subfinished_works', 0) - - @num_subfinished_works.setter - def num_subfinished_works(self, value): - self.add_metadata_item('num_subfinished_works', value) - - @property - def num_finished_works(self): - return self.get_metadata_item('num_finished_works', 0) - - @num_finished_works.setter - def num_finished_works(self, value): - self.add_metadata_item('num_finished_works', value) - - @property - def num_failed_works(self): - return self.get_metadata_item('num_failed_works', 0) - - @num_failed_works.setter - def num_failed_works(self, value): - self.add_metadata_item('num_failed_works', value) - - @property - def num_cancelled_works(self): - return self.get_metadata_item('num_cancelled_works', 0) - - @num_cancelled_works.setter - def num_cancelled_works(self, value): - self.add_metadata_item('num_cancelled_works', value) - - @property - def num_suspended_works(self): - return self.get_metadata_item('num_suspended_works', 0) - - @num_suspended_works.setter - def num_suspended_works(self, value): - self.add_metadata_item('num_suspended_works', value) - - @property - def num_expired_works(self): - return self.get_metadata_item('num_expired_works', 0) - - @num_expired_works.setter - def num_expired_works(self, value): - self.add_metadata_item('num_expired_works', value) - - @property - def num_total_works(self): - return self.get_metadata_item('num_total_works', 0) - - @num_total_works.setter - def num_total_works(self, value): - self.add_metadata_item('num_total_works', value) - - @property - def last_work(self): - return self.get_metadata_item('last_work', None) - - @last_work.setter - def last_work(self, value): - self.add_metadata_item('last_work', value) - - @property - def to_update_transforms(self): - return self.get_metadata_item('to_update_transforms', {}) - - @to_update_transforms.setter - def to_update_transforms(self, value): - self.add_metadata_item('to_update_transforms', value) - - def load_metadata(self): - self.load_works() - self.load_work_conditions() - self.load_parameter_links() - - def get_class_name(self): - return self.__class__.__name__ - - def setup_logger(self): - """ - Setup logger - """ - self.logger = logging.getLogger(self.get_class_name()) - - def log_info(self, info): - if self.logger is None: - self.setup_logger() - self.logger.info(info) - - def log_debug(self, info): - if self.logger is None: - self.setup_logger() - self.logger.debug(info) - - def get_internal_id(self): - return self.internal_id - - def copy(self): - new_wf = copy.deepcopy(self) - return new_wf - - def __deepcopy__(self, memo): - logger = self.logger - self.logger = None - - cls = self.__class__ - result = cls.__new__(cls) - - memo[id(self)] = result - - # Deep copy all other attributes - for k, v in self.__dict__.items(): - setattr(result, k, copy.deepcopy(v, memo)) - - self.logger = logger - result.logger = logger - return result - - def get_works(self): - return self.works - - def get_new_work_to_run(self, work_id, new_parameters=None): - # 1. 
initialize works - # template_id = work.get_template_id() - work = self.works[work_id] - if isinstance(work, Workflow): - work.sync_works() - - work.sequence_id = self.num_total_works - - works = self.works - self.works = works - # self.work_sequence.append(new_work.get_internal_id()) - self.work_sequence[str(self.num_total_works)] = work.get_internal_id() - self.num_total_works += 1 - self.new_to_run_works.append(work.get_internal_id()) - self.last_work = work.get_internal_id() - else: - if new_parameters: - work.set_parameters(new_parameters) - work.sequence_id = self.num_total_works - - work.initialize_work() - works = self.works - self.works = works - # self.work_sequence.append(new_work.get_internal_id()) - self.work_sequence[str(self.num_total_works)] = work.get_internal_id() - self.num_total_works += 1 - self.new_to_run_works.append(work.get_internal_id()) - self.last_work = work.get_internal_id() - - return work - - def register_user_defined_condition(self, condition): - cond_src = inspect.getsource(condition) - self.user_defined_conditions[condition.__name__] = cond_src - - def load_user_defined_condition(self): - # try: - # Condition() - # except NameError: - # global Condition - # import Condition - - for cond_src_name in self.user_defined_conditions: - # global cond_src_name - exec(self.user_defined_conditions[cond_src_name]) - - def set_workload_id(self, workload_id): - self.workload_id = workload_id - - def get_workload_id(self): - return self.workload_id - - def add_initial_works(self, work): - self.initial_works.append(work.get_internal_id()) - if self.primary_initial_work is None: - self.primary_initial_work = work.get_internal_id() - - def add_work(self, work, initial=False, primary=False): - self.first_initial = False - self.works[work.get_internal_id()] = work - if initial: - if primary: - self.primary_initial_work = work.get_internal_id() - self.add_initial_works(work) - - self.independent_works.append(work.get_internal_id()) - - def add_condition(self, cond): - self.first_initial = False - cond_works = cond.all_works() - for cond_work in cond_works: - assert(cond_work.get_internal_id() in self.get_works()) - - conditions = self.conditions - conditions[cond.get_internal_id()] = cond - self.conditions = conditions - - # if cond.current_work not in self.work_conds: - # self.work_conds[cond.current_work] = [] - # self.work_conds[cond.current_work].append(cond) - work_conds = self.work_conds - for work in cond.all_pre_works(): - if work.get_internal_id() not in work_conds: - work_conds[work.get_internal_id()] = [] - work_conds[work.get_internal_id()].append(cond.get_internal_id()) - self.work_conds = work_conds - - # if a work is a true_work or false_work of a condition, - # should remove it from independent_works - cond_next_works = cond.all_next_works() - for next_work in cond_next_works: - if next_work.get_internal_id() in self.independent_works: - self.independent_works.remove(next_work.get_internal_id()) - - def add_parameter_link(self, work_source, work_destinations, parameter_link): - self.parameter_links[parameter_link.get_internal_id()] = parameter_link - if work_source.get_internal_id() not in self.parameter_links_source: - self.parameter_links_source[work_source.get_internal_id()] = [] - self.parameter_links_source[work_source.get_internal_id()].append(parameter_link.get_internal_id()) - - if type(work_destinations) not in [list, tuple]: - work_destinations = [] - for work_destination in work_destinations: - if work_destination.get_internal_id() not in 
self.parameter_links_destination: - self.parameter_links_destination[work_destination.get_internal_id()] = [] - self.parameter_links_destination[work_destination.get_internal_id()].append(parameter_link.get_internal_id()) - - def set_source_parameters(self, internal_id): - work = self.works[internal_id] - p_metadata = {} - if internal_id in self.parameter_links_source: - for p_id in self.parameter_links_source[internal_id]: - p_link = self.parameter_links[p_id] - p_link.set_parameters(work) - p_metadata[p_id] = p_link.metadata - self.add_metadata_item('parameter_links', p_metadata) - - def get_destination_parameters(self, internal_id): - # work = self.works[internal_id] - parameters = {} - if internal_id in self.parameter_links_destination: - for p_id in self.parameter_links_destination[internal_id]: - p_link = self.parameter_links[p_id] - parameters.update(p_link.get_parameters()) - return parameters - - def load_parameter_links(self): - p_metadata = self.get_metadata_item('parameter_links', {}) - for p_id in self.parameter_links: - if p_id in p_metadata: - self.parameter_links[p_id].metadata = p_metadata[p_id] - - def enable_next_works(self, work, cond): - self.log_debug("Checking Work %s condition: %s" % (work.get_internal_id(), - json_dumps(cond, sort_keys=True, indent=4))) - # load_conditions should cover it. - # if cond and self.is_class_method(cond.cond): - # # cond_work_id = self.works[cond.cond['idds_method_class_id']] - # cond.cond = getattr(work, cond.cond['idds_method']) - - self.log_info("Work %s condition: %s" % (work.get_internal_id(), cond.conditions)) - next_works = cond.get_next_works(trigger=ConditionTrigger.ToTrigger) - self.log_info("Work %s condition status %s" % (work.get_internal_id(), cond.get_cond_status())) - self.log_info("Work %s next works %s" % (work.get_internal_id(), str(next_works))) - new_next_works = [] - if next_works is not None: - for next_work in next_works: - parameters = self.get_destination_parameters(next_work.get_internal_id()) - new_next_work = self.get_new_work_to_run(next_work.get_internal_id(), parameters) - work.add_next_work(new_next_work.get_internal_id()) - # cond.add_condition_work(new_next_work) ####### TODO: - new_next_works.append(new_next_work) - return new_next_works - - def add_loop_condition(self, condition, position='end'): - self.loop_condition_position = position - self.loop_condition = condition - - def has_loop_condition(self): - if self.loop_condition: - return True - return False - - def get_loop_condition_status(self): - if self.has_loop_condition(): - self.loop_condition.load_conditions(self.works) - return self.loop_condition.get_condition_status() - return False - - def __str__(self): - return str(json_dumps(self)) - - def get_new_works(self): - """ - *** Function called by Marshaller agent. - - new works to be ready to start - """ - self.sync_works() - works = [] - for k in self.new_to_run_works: - if isinstance(self.works[k], Work): - works.append(self.works[k]) - if isinstance(self.works[k], Workflow): - works = works + self.works[k].get_new_works() - for k in self.current_running_works: - if isinstance(self.works[k], Workflow): - works = works + self.works[k].get_new_works() - return works - - def get_current_works(self): - """ - *** Function called by Marshaller agent. 
- - Current running works - """ - self.sync_works() - works = [] - for k in self.current_running_works: - if isinstance(self.works[k], Work): - works.append(self.works[k]) - if isinstance(self.works[k], Workflow): - works = works + self.works[k].get_current_works() - return works - - def get_all_works(self): - """ - *** Function called by Marshaller agent. - - Current running works - """ - self.sync_works() - - works = [] - for k in self.works: - if isinstance(self.works[k], Work): - works.append(self.works[k]) - if isinstance(self.works[k], Workflow): - works = works + self.works[k].get_all_works() - return works - - def get_primary_initial_collection(self): - """ - *** Function called by Clerk agent. - """ - - if self.primary_initial_work: - return self.get_works()[self.primary_initial_work].get_primary_input_collection() - elif self.initial_works: - return self.get_works()[self.initial_works[0]].get_primary_input_collection() - elif self.independent_works: - return self.get_works()[self.independent_works[0]].get_primary_input_collection() - else: - keys = self.get_works().keys() - return self.get_works()[keys[0]].get_primary_input_collection() - return None - - def get_dependency_works(self, work_id, depth, max_depth): - if depth > max_depth: - return [] - - deps = [] - for dep_work_id in self.work_dependencies[work_id]: - deps.append(dep_work_id) - l_deps = self.get_dependency_works(dep_work_id, depth + 1, max_depth) - deps += l_deps - deps = list(dict.fromkeys(deps)) - return deps - - def order_independent_works(self): - ind_work_ids = self.independent_works - self.independent_works = [] - self.work_dependencies = {} - for ind_work_id in ind_work_ids: - work = self.works[ind_work_id] - self.work_dependencies[ind_work_id] = [] - for ind_work_id1 in ind_work_ids: - if ind_work_id == ind_work_id1: - continue - work1 = self.works[ind_work_id1] - if work.depend_on(work1): - self.work_dependencies[ind_work_id].append(ind_work_id1) - self.log_debug('work dependencies 1: %s' % str(self.work_dependencies)) - - max_depth = len(ind_work_ids) + 1 - work_dependencies = copy.deepcopy(self.work_dependencies) - for work_id in work_dependencies: - deps = self.get_dependency_works(work_id, 0, max_depth) - self.work_dependencies[work_id] = deps - self.log_debug('work dependencies 2: %s' % str(self.work_dependencies)) - - while True: - for work_id in self.work_dependencies: - if work_id not in self.independent_works and len(self.work_dependencies[work_id]) == 0: - self.independent_works.append(work_id) - for work_id in self.independent_works: - if work_id in self.work_dependencies: - del self.work_dependencies[work_id] - for work_id in self.work_dependencies: - for in_work_id in self.independent_works: - if in_work_id in self.work_dependencies[work_id]: - self.work_dependencies[work_id].remove(in_work_id) - if not self.work_dependencies: - break - self.log_debug('independent_works: %s' % str(self.independent_works)) - - def first_initialize(self): - # set new_to_run works - if not self.first_initial: - self.first_initial = True - self.order_independent_works() - if self.initial_works: - tostart_works = self.initial_works - elif self.independent_works: - tostart_works = self.independent_works - else: - tostart_works = list(self.get_works().keys()) - tostart_works = [tostart_works[0]] - - for work_id in tostart_works: - self.get_new_work_to_run(work_id) - - def sync_works(self): - self.first_initialize() - - self.refresh_works() - - for k in self.works: - work = self.works[k] - self.log_debug("work %s 
is_terminated(%s:%s)" % (work.get_internal_id(), work.is_terminated(), work.get_status())) - - for work in [self.works[k] for k in self.new_to_run_works]: - if work.transforming: - self.new_to_run_works.remove(work.get_internal_id()) - self.current_running_works.append(work.get_internal_id()) - - for work in [self.works[k] for k in self.current_running_works]: - if isinstance(work, Workflow): - work.sync_works() - - if work.is_terminated(): - self.set_source_parameters(work.get_internal_id()) - - if work.get_internal_id() in self.work_conds: - self.log_debug("Work %s has condition dependencies %s" % (work.get_internal_id(), - json_dumps(self.work_conds[work.get_internal_id()], sort_keys=True, indent=4))) - for cond_id in self.work_conds[work.get_internal_id()]: - cond = self.conditions[cond_id] - self.log_debug("Work %s has condition dependencie %s" % (work.get_internal_id(), - json_dumps(cond, sort_keys=True, indent=4))) - self.enable_next_works(work, cond) - - if work.is_terminated(): - self.log_info("Work %s is terminated(%s)" % (work.get_internal_id(), work.get_status())) - self.log_debug("Work conditions: %s" % json_dumps(self.work_conds, sort_keys=True, indent=4)) - if work.get_internal_id() not in self.work_conds: - # has no next work - self.log_info("Work %s has no condition dependencies" % work.get_internal_id()) - self.terminated_works.append(work.get_internal_id()) - self.current_running_works.remove(work.get_internal_id()) - else: - # self.log_debug("Work %s has condition dependencies %s" % (work.get_internal_id(), - # json_dumps(self.work_conds[work.get_template_id()], sort_keys=True, indent=4))) - # for cond in self.work_conds[work.get_template_id()]: - # self.enable_next_works(work, cond) - self.terminated_works.append(work.get_internal_id()) - self.current_running_works.remove(work.get_internal_id()) - - if work.is_finished(): - self.num_finished_works += 1 - elif work.is_subfinished(): - self.num_subfinished_works += 1 - elif work.is_failed(): - self.num_failed_works += 1 - elif work.is_expired(): - self.num_expired_works += 1 - elif work.is_cancelled(): - self.num_cancelled_works += 1 - elif work.is_suspended(): - self.num_suspended_works += 1 - - # if work.is_terminated(): - # # if it's a loop workflow, to generate new loop - # if isinstance(work, Workflow): - # work.sync_works() - log_str = "num_total_works: %s" % self.num_total_works - log_str += ", num_finished_works: %s" % self.num_finished_works - log_str += ", num_subfinished_works: %s" % self.num_subfinished_works - log_str += ", num_failed_works: %s" % self.num_failed_works - log_str += ", num_expired_works: %s" % self.num_expired_works - log_str += ", num_cancelled_works: %s" % self.num_cancelled_works - log_str += ", num_suspended_works: %s" % self.num_suspended_works - self.log_debug(log_str) - - def resume_works(self): - self.num_subfinished_works = 0 - self.num_finished_works = 0 - self.num_failed_works = 0 - self.num_cancelled_works = 0 - self.num_suspended_works = 0 - self.num_expired_works = 0 - - self.last_updated_at = datetime.datetime.utcnow() - - t_works = self.terminated_works - self.terminated_works = [] - self.current_running_works = self.current_running_works + t_works - for work in [self.works[k] for k in self.current_running_works]: - if isinstance(work, Workflow): - work.resume_works() - else: - work.resume_work() - - def clean_works(self): - self.num_subfinished_works = 0 - self.num_finished_works = 0 - self.num_failed_works = 0 - self.num_cancelled_works = 0 - self.num_suspended_works = 0 - 
self.num_expired_works = 0 - self.num_total_works = 0 - - self.last_updated_at = datetime.datetime.utcnow() - - self.terminated_works = [] - self.current_running_works = [] - self.works = {} - self.work_sequence = {} # order list - - self.first_initial = False - self.new_to_run_works = [] - - def get_exact_workflows(self): - """ - *** Function called by Clerk agent. - - TODO: The primary dataset for the initial work is a dataset with '*'. - workflow.primary_initial_collection = 'some datasets with *' - collections = get_collection(workflow.primary_initial_collection) - wfs = [] - for coll in collections: - wf = self.copy() - wf.name = self.name + "_" + number - wf.primary_initial_collection = coll - wfs.append(wf) - return wfs - """ - return [self] - - def is_terminated(self): - """ - *** Function called by Marshaller agent. - """ - self.sync_works() - if len(self.new_to_run_works) == 0 and len(self.current_running_works) == 0: - return True - return False - - def is_finished(self): - """ - *** Function called by Marshaller agent. - """ - return self.is_terminated() and self.num_finished_works == self.num_total_works - - def is_subfinished(self): - """ - *** Function called by Marshaller agent. - """ - return self.is_terminated() and (self.num_finished_works + self.num_subfinished_works > 0 and self.num_finished_works + self.num_subfinished_works <= self.num_total_works) - - def is_failed(self): - """ - *** Function called by Marshaller agent. - """ - return self.is_terminated() and (self.num_failed_works > 0) and (self.num_cancelled_works == 0) and (self.num_suspended_works == 0) and (self.num_expired_works == 0) - - def is_to_expire(self, expired_at=None, pending_time=None, request_id=None): - if self.expired: - # it's already expired. avoid sending duplicated messages again and again. - return False - if expired_at: - if type(expired_at) in [str]: - expired_at = str_to_date(expired_at) - if expired_at < datetime.datetime.utcnow(): - self.logger.info("Request(%s) expired_at(%s) is smaller than utc now(%s), expiring" % (request_id, - expired_at, - datetime.datetime.utcnow())) - return True - - act_pending_time = None - if self.pending_time: - # in days - act_pending_time = float(self.pending_time) - else: - if pending_time: - act_pending_time = float(pending_time) - if act_pending_time: - act_pending_seconds = int(86400 * act_pending_time) - if self.last_updated_at + datetime.timedelta(seconds=act_pending_seconds) < datetime.datetime.utcnow(): - log_str = "Request(%s) last updated at(%s) + pending seconds(%s)" % (request_id, - self.last_updated_at, - act_pending_seconds) - log_str += " is smaller than utc now(%s), expiring" % (datetime.datetime.utcnow()) - self.logger.info(log_str) - return True - - return False - - def is_expired(self): - """ - *** Function called by Marshaller agent. - """ - # return self.is_terminated() and (self.num_expired_works > 0) - return self.is_terminated() and self.expired - - def is_cancelled(self): - """ - *** Function called by Marshaller agent. - """ - return self.is_terminated() and (self.num_cancelled_works > 0) - - def is_suspended(self): - """ - *** Function called by Marshaller agent. - """ - return self.is_terminated() and (self.num_suspended_works > 0) - - def get_terminated_msg(self): - """ - *** Function called by Marshaller agent. 
- """ - if self.last_work: - return self.works[self.last_work].get_terminated_msg() - return None - - def get_status(self): - if self.is_terminated(): - if self.is_finished(): - return WorkStatus.Finished - elif self.is_subfinished(): - return WorkStatus.SubFinished - elif self.is_failed(): - return WorkStatus.Failed - elif self.is_expired(): - return WorkStatus.Expired - elif self.is_cancelled(): - return WorkStatus.Cancelled - elif self.is_suspended(): - return WorkStatus.Suspended - return WorkStatus.Transforming - - def depend_on(self, work): - return False - - def add_proxy(self): - self.proxy = get_proxy() - if not self.proxy: - raise Exception("Cannot get local proxy") - - def get_proxy(self): - return self.proxy - - -class Workflow(Base): - def __init__(self, name=None, workload_id=None, lifetime=None, pending_time=None, logger=None): - # super(Workflow, self).__init__(name=name, workload_id=workload_id, lifetime=lifetime, pending_time=pending_time, logger=logger) - self.logger = logger - if self.logger is None: - self.setup_logger() - - self.template = WorkflowBase(name=name, workload_id=workload_id, lifetime=lifetime, pending_time=pending_time, logger=logger) - self.num_run = 0 - self.runs = {} - self.loop_condition_position = 'end' - - def setup_logger(self): - # Setup logger - self.logger = logging.getLogger(self.get_class_name()) - - def __deepcopy__(self, memo): - logger = self.logger - self.logger = None - - cls = self.__class__ - result = cls.__new__(cls) - - memo[id(self)] = result - - # Deep copy all other attributes - for k, v in self.__dict__.items(): - setattr(result, k, copy.deepcopy(v, memo)) - - self.logger = logger - result.logger = logger - return result - - @property - def metadata(self): - run_metadata = {'num_run': self.num_run, - 'runs': {}} - for run_id in self.runs: - run_metadata['runs'][run_id] = self.runs[run_id].metadata - return run_metadata - - @metadata.setter - def metadata(self, value): - run_metadata = value - self.num_run = run_metadata['num_run'] - runs = run_metadata['runs'] - for run_id in runs: - self.runs[run_id] = self.template.copy() - self.runs[run_id].metadata = runs[run_id] - # self.add_metadata_item('runs', ) - - @property - def independent_works(self): - if self.runs: - return self.runs[str(self.num_run)].independent_works - return self.template.independent_works - - @independent_works.setter - def independent_works(self, value): - if self.runs: - self.runs[str(self.num_run)].independent_works = value - self.template.independent_works = value - - @property - def last_updated_at(self): - if self.runs: - return self.runs[str(self.num_run)].last_updated_at - return None - - @last_updated_at.setter - def last_updated_at(self, value): - if self.runs: - self.runs[str(self.num_run)].last_updated_at = value - - @property - def transforming(self): - if self.runs and str(self.num_run) in self.runs: - return True - return False - - @transforming.setter - def transforming(self, value): - if self.num_run < 1: - self.num_run = 1 - if str(self.num_run) not in self.runs: - self.runs[str(self.num_run)] = self.template.copy() - - def set_workload_id(self, workload_id): - if self.runs: - self.runs[str(self.num_run)].workload_id = workload_id - else: - self.template.workload_id = workload_id - # self.dynamic.workload_id = workload_id - - def get_internal_id(self): - if self.runs: - return self.runs[str(self.num_run)].get_internal_id() - return self.template.get_internal_id() - - def get_workload_id(self): - if self.runs: - return 
self.runs[str(self.num_run)].workload_id - return self.template.workload_id - - def add_work(self, work, initial=False, primary=False): - self.template.add_work(work, initial, primary) - - def add_condition(self, cond): - self.template.add_condition(cond) - - def get_new_works(self): - self.sync_works() - if self.runs: - return self.runs[str(self.num_run)].get_new_works() - return [] - - def get_current_works(self): - self.sync_works() - if self.runs: - return self.runs[str(self.num_run)].get_current_works() - return [] - - def get_all_works(self): - self.sync_works() - if self.runs: - return self.runs[str(self.num_run)].get_all_works() - return [] - - def get_primary_initial_collection(self): - if self.runs: - return self.runs[str(self.num_run)].get_primary_initial_collection() - return self.template.get_primary_initial_collection() - - def resume_works(self): - if self.runs: - self.runs[str(self.num_run)].resume_works() - - def clean_works(self): - if self.runs: - self.runs[str(self.num_run)].clean_works() - - def is_terminated(self): - if self.runs: - if self.runs[str(self.num_run)].is_terminated(): - if not self.runs[str(self.num_run)].has_loop_condition() or not self.runs[str(self.num_run)].get_loop_condition_status(): - return True - return False - - def is_finished(self): - if self.is_terminated(): - return self.runs[str(self.num_run)].is_finished() - return False - - def is_subfinished(self): - if self.is_terminated(): - return self.runs[str(self.num_run)].is_subfinished() - return False - - def is_failed(self): - if self.is_terminated(): - return self.runs[str(self.num_run)].is_failed() - return False - - def is_expired(self): - if self.is_terminated(): - return self.runs[str(self.num_run)].is_expired() - return False - - def is_cancelled(self): - if self.is_terminated(): - return self.runs[str(self.num_run)].is_cancelled() - return False - - def is_suspended(self): - if self.is_terminated(): - return self.runs[str(self.num_run)].is_suspended() - return False - - def get_terminated_msg(self): - if self.is_terminated(): - return self.runs[str(self.num_run)].get_terminated_msg() - return None - - def get_status(self): - if not self.runs: - return WorkStatus.New - if not self.is_terminated(): - return WorkStatus.Transforming - return self.runs[str(self.num_run)].get_status() - - def depend_on(self, work): - return self.template.depend_on(work) - - def add_proxy(self): - self.template.add_proxy() - - def get_proxy(self): - self.template.get_proxy() - - def add_loop_condition(self, condition, position='end'): - if not position or position != 'begin': - position = 'end' - position = 'end' # force position to end currently. position = 'begin' is not supported now. - self.template.add_loop_condition(condition, position=position) - self.loop_condition_position = position - - def refresh_works(self): - if self.runs: - self.runs[str(self.num_run)].refresh_works() - - def sync_works(self): - # position is end. - if self.num_run < 1: - self.num_run = 1 - if str(self.num_run) not in self.runs: - self.runs[str(self.num_run)] = self.template.copy() - - self.runs[str(self.num_run)].sync_works() - - if self.runs[str(self.num_run)].is_terminated(): - if self.runs[str(self.num_run)].has_loop_condition(): - if self.runs[str(self.num_run)].get_loop_condition_status(): - self.num_run += 1 - self.runs[str(self.num_run)] = self.template.copy() - - -class SubWorkflow(Workflow): - def __init__(self, name=None, workload_id=None, lifetime=None, pending_time=None, logger=None): - # Init a workflow. 
- super(SubWorkflow, self).__init__(name=name, workload_id=workload_id, lifetime=lifetime, pending_time=pending_time, logger=logger) - - -class LoopWorkflow(Workflow): - def __init__(self, name=None, workload_id=None, lifetime=None, pending_time=None, logger=None): - # Init a workflow. - super(LoopWorkflow, self).__init__(name=name, workload_id=workload_id, lifetime=lifetime, pending_time=pending_time, logger=logger) diff --git a/workflow/lib/idds/workflowv2/datawork.py b/workflow/lib/idds/workflowv2/datawork.py new file mode 100644 index 00000000..0b7a955d --- /dev/null +++ b/workflow/lib/idds/workflowv2/datawork.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2022 + +from .work import Work + + +class DataWork(Work): + def __init__(self, executable=None, arguments=None, parameters=None, setup=None, work_type=None, + work_tag=None, exec_type='local', sandbox=None, request_id=None, work_id=None, work_name=None, + primary_input_collection=None, other_input_collections=None, input_collections=None, + primary_output_collection=None, other_output_collections=None, output_collections=None, + log_collections=None, release_inputs_after_submitting=False, username=None, + agent_attributes=None, is_template=False, + logger=None): + super(DataWork, self).__init__(executable=executable, arguments=arguments, + parameters=parameters, setup=setup, work_type=work_type, + exec_type=exec_type, sandbox=sandbox, work_id=work_id, + primary_input_collection=primary_input_collection, + other_input_collections=other_input_collections, + primary_output_collection=primary_output_collection, + other_output_collections=other_output_collections, + input_collections=input_collections, + output_collections=output_collections, + log_collections=log_collections, + agent_attributes=agent_attributes, + logger=logger) diff --git a/workflow/lib/idds/workflowv2/processingwork.py b/workflow/lib/idds/workflowv2/processingwork.py new file mode 100644 index 00000000..da90c91e --- /dev/null +++ b/workflow/lib/idds/workflowv2/processingwork.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2022 + +from idds.common.constants import TransformType + +from .work import Work + + +class ProcessingWork(Work): + def __init__(self, executable=None, arguments=None, parameters=None, setup=None, work_type=None, + work_tag=None, exec_type='local', sandbox=None, request_id=None, work_id=None, work_name=None, + primary_input_collection=None, other_input_collections=None, input_collections=None, + primary_output_collection=None, other_output_collections=None, output_collections=None, + log_collections=None, release_inputs_after_submitting=False, username=None, + agent_attributes=None, is_template=False, + logger=None): + super(ProcessingWork, self).__init__(executable=executable, arguments=arguments, + parameters=parameters, setup=setup, work_type=TransformType.Processing, + exec_type=exec_type, sandbox=sandbox, work_id=work_id, + primary_input_collection=primary_input_collection, + other_input_collections=other_input_collections, + primary_output_collection=primary_output_collection, + other_output_collections=other_output_collections, + input_collections=input_collections, + output_collections=output_collections, + log_collections=log_collections, + agent_attributes=agent_attributes, + logger=logger) diff --git a/workflow/lib/idds/workflowv2/work.py b/workflow/lib/idds/workflowv2/work.py index 8e14595d..fc5300a7 100644 --- a/workflow/lib/idds/workflowv2/work.py +++ b/workflow/lib/idds/workflowv2/work.py @@ -19,6 +19,7 @@ from idds.common import exceptions from idds.common.constants import (WorkStatus, ProcessingStatus, CollectionStatus, CollectionType) +from idds.common.constants import get_work_status_from_transform_processing_status from idds.common.utils import setup_logging from idds.common.utils import str_to_date # from idds.common.utils import json_dumps @@ -63,6 +64,11 @@ def __init__(self, scope=None, name=None, coll_type=CollectionType.Dataset, coll self.status = CollectionStatus.New self.substatus = CollectionStatus.New + self.total_files = 0 + self.processed_files = 0 + self.processing_files = 0 + self.bytes = 0 + @property def internal_id(self): return self.get_metadata_item('internal_id') @@ -134,6 +140,11 @@ def collection(self, value): self.status = self._collection['status'] self.substatus = self._collection['substatus'] + self.total_files = self._collection['total_files'] + self.processed_files = self._collection['processed_files'] + self.processing_files = self._collection['processing_files'] + self.bytes = self._collection['bytes'] + def to_origin_dict(self): return {'scope': self.scope, 'name': self.name} @@ -369,6 +380,14 @@ def external_id(self): def external_id(self, value): self.add_metadata_item('external_id', value) + @property + def old_external_id(self): + return self.get_metadata_item('old_external_id', []) + + @old_external_id.setter + def old_external_id(self, value): + self.add_metadata_item('old_external_id', value) + @property def task_name(self): return self.get_metadata_item('task_name', None) @@ -559,6 +578,8 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None, self.or_custom_conditions = {} self.and_custom_conditions = {} + self.sliced_global_parameters = None + """ self._running_data_names = [] for name in ['internal_id', 'template_work_id', 'initialized', 'sequence_id', 'parameters', 'work_id', 'transforming', 'workdir', @@ -572,6 +593,11 @@ def __init__(self, executable=None, arguments=None, 
parameters=None, setup=None, self._running_data_names.append(name) """ + def get_logger(self): + if self.logger is None: + self.logger = self.setup_logger() + return self.logger + def get_class_name(self): return self.__class__.__name__ @@ -663,6 +689,14 @@ def work_id(self): def work_id(self, value): self.add_metadata_item('work_id', value) + @property + def parent_workload_id(self): + return self.get_metadata_item('parent_workload_id', None) + + @parent_workload_id.setter + def parent_workload_id(self, value): + self.add_metadata_item('parent_workload_id', value) + @property def transforming(self): return self.get_metadata_item('transforming', False) @@ -1036,10 +1070,28 @@ def get_work_name(self): def get_is_template(self): self.is_template - def sync_global_parameters(self, global_parameters): + def sync_global_parameters(self, global_parameters, sliced_global_parameters=None): + if sliced_global_parameters: + self.sliced_global_parameters = sliced_global_parameters + if global_parameters: for key in global_parameters: - setattr(self, key, global_parameters[key]) + sliced_index = None + sliced_name = None + if self.sliced_global_parameters and key in self.sliced_global_parameters: + sliced_index = self.sliced_global_parameters[key]['index'] + sliced_name = self.sliced_global_parameters[key]['name'] + if type(global_parameters[key]) in [list, tuple] and sliced_index < len(global_parameters[key]): + pass + else: + sliced_index = None + if not sliced_name: + sliced_name = key + + if sliced_index is None: + setattr(self, sliced_name, global_parameters[key]) + else: + setattr(self, sliced_name, global_parameters[key][sliced_index]) def get_global_parameter_from_output_data(self, key): self.logger.debug("get_global_parameter_from_output_data, key: %s, output_data: %s" % (key, str(self.output_data))) @@ -1093,6 +1145,9 @@ def get_custom_condition_status_value_bool(self, key): return False def get_custom_condition_status_value(self, key): + if self.output_data and key in self.output_data: + return self.output_data[key] + user_key = "user_" + key if hasattr(self, user_key): key = user_key @@ -1135,6 +1190,7 @@ def setup_logger(self): Setup logger """ self.logger = logging.getLogger(self.get_class_name()) + return self.logger def add_errors(self, error): self.errors.append(error) @@ -1296,7 +1352,7 @@ def is_started(self): return self.started def is_running(self): - if self.status in [WorkStatus.Running]: + if self.status in [WorkStatus.Running, WorkStatus.Transforming]: return True return False @@ -1504,7 +1560,7 @@ def add_other_output_collections(self, colls): def get_other_output_collections(self): return [self.collections[k] for k in self._other_output_collections] - def get_input_collections(self): + def get_input_collections(self, poll_externel=False): """ *** Function called by Transformer agent. """ @@ -1550,6 +1606,9 @@ def get_internal_collections(self, coll): return [] return [] + def poll_external_collection(self, coll): + return coll + def poll_internal_collection(self, coll): try: if coll.status in [CollectionStatus.Closed]: @@ -1649,6 +1708,15 @@ def get_log_collections(self): def set_has_new_inputs(self, yes=True): self.has_new_inputs = yes + def has_dependency(self): + return False + + def get_parent_work_names(self): + return [] + + def get_parent_workload_ids(self): + return [] + def get_new_input_output_maps(self, mapped_input_output_maps={}): """ *** Function called by Transformer agent. 
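
The sliced-global-parameter support added to Work.sync_global_parameters() above lets a list- or tuple-valued global parameter be fanned out element by element: a slicing entry of the form {'name': target_attribute, 'index': i} (as written by set_sliced_global_parameters() in workflowv2/workflow.py further down) makes the work receive only element i under the target attribute name, while anything else falls back to the full value. The sketch below is illustrative only and not part of this patch; it mirrors that resolution rule with a stand-alone helper, and the parameter names ('user_learning_rates', 'user_lr') are made up for the example.

    # Illustrative sketch only -- not part of the patch. It mirrors the resolution
    # rule of Work.sync_global_parameters(global_parameters, sliced_global_parameters):
    # a sliced entry {'name': target_attr, 'index': i} picks element i of a
    # list/tuple-valued global parameter; anything else falls back to the full value.
    def resolve_global_parameters(global_parameters, sliced_global_parameters=None):
        resolved = {}
        sliced_global_parameters = sliced_global_parameters or {}
        for key, value in global_parameters.items():
            sliced = sliced_global_parameters.get(key) or {}
            sliced_name = sliced.get('name') or key
            sliced_index = sliced.get('index')
            if sliced_index is not None and type(value) in [list, tuple] and sliced_index < len(value):
                resolved[sliced_name] = value[sliced_index]   # take one slice of the list
            else:
                resolved[sliced_name] = value                 # fall back to the whole value
        return resolved

    # Hypothetical usage: with a slicing map like the one produced by
    # workflow.set_sliced_global_parameters(source='user_learning_rates', name='user_lr', index=2),
    # the work would see user_lr = 0.01 instead of the full list.
    print(resolve_global_parameters(
        {'user_learning_rates': [0.1, 0.05, 0.01]},
        {'user_learning_rates': {'name': 'user_lr', 'index': 2}}))   # {'user_lr': 0.01}
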
@@ -1958,7 +2026,7 @@ def submit_processing(self, processing): """ raise exceptions.NotImplementedException - def abort_processing(self, processing): + def abort_processing_old(self, processing): """ *** Function called by Carrier agent. """ @@ -1980,7 +2048,7 @@ def suspend_processing(self, processing): proc = processing['processing_metadata']['processing'] proc.tosuspend = True - def resume_processing(self, processing): + def resume_processing_old(self, processing): """ *** Function called by Carrier agent. """ @@ -2070,7 +2138,7 @@ def syn_work_status(self, input_output_maps, all_updates_flushed=True, output_st self.started = True self.logger.debug("syn_work_status(%s): work.status: %s" % (str(self.get_processing_ids()), str(self.status))) - def sync_work_data(self, status, substatus, work): + def sync_work_data(self, status, substatus, work, workload_id=None, output_data=None): # self.status = work.status work.work_id = self.work_id work.transforming = self.transforming @@ -2078,12 +2146,20 @@ def sync_work_data(self, status, substatus, work): # clerk will update next_works while transformer doesn't. # synchronizing work metadata from transformer to clerk needs to keep it at first. next_works = self.next_works - self.metadata = work.metadata + # self.metadata = work.metadata self.next_works = next_works self.status_statistics = work.status_statistics - self.processings = work.processings - self.output_data = work.output_data + # self.processings = work.processings + if output_data: + self.output_data = output_data + else: + self.output_data = work.output_data + + self.status = get_work_status_from_transform_processing_status(status) + self.substatus = get_work_status_from_transform_processing_status(substatus) + if workload_id: + self.workload_id = workload_id """ self.status = WorkStatus(status.value) @@ -2105,6 +2181,14 @@ def sync_work_data(self, status, substatus, work): self.suspended_processings = work.suspended_processings """ + def abort_processing(self, processing, log_prefix=''): + msg = "abort processing is not implemented" + self.logger.error(log_prefix + msg) + + def resume_processing(self, processing, log_prefix=''): + msg = "resume processing is not implemented" + self.logger.error(log_prefix + msg) + def add_proxy(self, proxy): self.proxy = proxy diff --git a/workflow/lib/idds/workflowv2/workflow.py b/workflow/lib/idds/workflowv2/workflow.py index 392e8933..c44ac5d6 100644 --- a/workflow/lib/idds/workflowv2/workflow.py +++ b/workflow/lib/idds/workflowv2/workflow.py @@ -13,7 +13,6 @@ import logging import inspect import random -import time import uuid @@ -615,7 +614,8 @@ def __init__(self, name=None, workload_id=None, lifetime=None, pending_time=None self._name = 'idds.workflow.' 
diff --git a/workflow/lib/idds/workflowv2/workflow.py b/workflow/lib/idds/workflowv2/workflow.py
index 392e8933..c44ac5d6 100644
--- a/workflow/lib/idds/workflowv2/workflow.py
+++ b/workflow/lib/idds/workflowv2/workflow.py
@@ -13,7 +13,6 @@
 import logging
 import inspect
 import random
-import time
 import uuid
 
@@ -615,7 +614,8 @@ def __init__(self, name=None, workload_id=None, lifetime=None, pending_time=None
             self._name = 'idds.workflow.' + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S_%f") + str(random.randint(1, 1000))
         if workload_id is None:
-            workload_id = int(time.time())
+            # workload_id = int(time.time())
+            pass
         self.workload_id = workload_id
 
         self.logger = logger
@@ -667,6 +667,7 @@ def __init__(self, name=None, workload_id=None, lifetime=None, pending_time=None
 
         self.global_parameters = {}
+        self.to_cancel = False
 
         """
         self._running_data_names = []
         for name in ['internal_id', 'template_work_id', 'workload_id', 'work_sequence', 'terminated_works',
@@ -881,6 +882,29 @@ def global_parameters(self, value):
     def set_global_parameters(self, value):
         self.global_parameters = value
 
+    @property
+    def sliced_global_parameters(self):
+        self._sliced_global_parameters = self.get_metadata_item('sliced_gp', {})
+        return self._sliced_global_parameters
+
+    @sliced_global_parameters.setter
+    def sliced_global_parameters(self, value):
+        self._sliced_global_parameters = value
+        gp_metadata = {}
+        if self._sliced_global_parameters:
+            for key in self._sliced_global_parameters:
+                if key.startswith("user_"):
+                    gp_metadata[key] = self._sliced_global_parameters[key]
+                else:
+                    self.logger.warn("Only parameters start with 'user_' can be set as global parameters. The parameter '%s' will be ignored." % (key))
+        self.add_metadata_item('sliced_gp', gp_metadata)
+
+    def set_sliced_global_parameters(self, source, name=None, index=0):
+        sliced_global_parameters = self.sliced_global_parameters
+        sliced_global_parameters[source] = {'name': name, 'index': index}
+        # to trigger the setter function
+        self.sliced_global_parameters = self.sliced_global_parameters
+
     def sync_global_parameters_from_work(self, work):
         self.log_debug("work %s is_terminated, global_parameters: %s" % (work.get_internal_id(), str(self.global_parameters)))
         if self.global_parameters:
@@ -928,6 +952,14 @@ def first_initial(self):
     def first_initial(self, value):
         self.add_metadata_item('first_initial', value)
 
+    @property
+    def to_start_works(self):
+        return self.get_metadata_item('to_start_works', [])
+
+    @to_start_works.setter
+    def to_start_works(self, value):
+        self.add_metadata_item('to_start_works', value)
+
     @property
     def new_to_run_works(self):
         return self.get_metadata_item('new_to_run_works', [])
@@ -1032,6 +1064,14 @@ def num_run(self):
     def num_run(self, value):
         self.add_metadata_item('num_run', value)
 
+    @property
+    def to_cancel(self):
+        return self.get_metadata_item('to_cancel', False)
+
+    @to_cancel.setter
+    def to_cancel(self, value):
+        self.add_metadata_item('to_cancel', value)
+
     def load_metadata(self):
         self.load_works()
         self.load_work_conditions()
@@ -1087,11 +1127,13 @@ def get_new_work_to_run(self, work_id, new_parameters=None):
         # 1. initialize works
         # template_id = work.get_template_id()
         work = self.works[work_id]
+        work.workload_id = None
+
         if isinstance(work, Workflow):
-            work.sync_works()
+            work.parent_num_run = self.num_run
+            work.sync_works(to_cancel=self.to_cancel)
 
             work.sequence_id = self.num_total_works
-            work.parent_num_run = self.num_run
 
             works = self.works
             self.works = works
@@ -1108,8 +1150,13 @@ def get_new_work_to_run(self, work_id, new_parameters=None):
             work.num_run = self.num_run
             work.initialize_work()
-            work.sync_global_parameters(self.global_parameters)
+            work.sync_global_parameters(self.global_parameters, self.sliced_global_parameters)
             work.renew_parameters_from_attributes()
+            if work.parent_workload_id is None and self.num_total_works > 0:
+                last_work_id = self.work_sequence[str(self.num_total_works - 1)]
+                last_work = self.works[last_work_id]
+                work.parent_workload_id = last_work.workload_id
+                last_work.add_next_work(work.get_internal_id())
 
             works = self.works
             self.works = works
             # self.work_sequence.append(new_work.get_internal_id())
@@ -1127,7 +1174,7 @@ def get_new_parameters_for_work(self, work):
         work.sequence_id = self.num_total_works
 
         work.initialize_work()
-        work.sync_global_parameters(self.global_parameters)
+        work.sync_global_parameters(self.global_parameters, self.sliced_global_parameters)
         work.renew_parameters_from_attributes()
         works = self.works
         self.works = works
@@ -1311,6 +1358,7 @@ def enable_next_works(self, work, cond):
                 # parameters = self.get_destination_parameters(next_work.get_internal_id())
                 new_next_work = self.get_new_work_to_run(next_work.get_internal_id())
                 work.add_next_work(new_next_work.get_internal_id())
+                new_next_work.parent_workload_id = work.workload_id
                 # cond.add_condition_work(new_next_work)   ####### TODO:
                 new_next_works.append(new_next_work)
         return new_next_works
@@ -1340,8 +1388,22 @@ def get_new_works(self):
                   new works to be ready to start
         """
-        self.sync_works()
+        if self.to_cancel:
+            return []
+
+        self.sync_works(to_cancel=self.to_cancel)
         works = []
+
+        if self.to_start_works:
+            init_works = self.init_works
+            to_start_works = self.to_start_works
+            work_id = to_start_works.pop(0)
+            self.to_start_works = to_start_works
+            self.get_new_work_to_run(work_id)
+            if not init_works:
+                init_works.append(work_id)
+                self.init_works = init_works
+
         for k in self.new_to_run_works:
             if isinstance(self.works[k], Work):
                 self.works[k] = self.get_new_parameters_for_work(self.works[k])
@@ -1359,7 +1421,7 @@ def get_current_works(self):
                   Current running works
         """
-        self.sync_works()
+        self.sync_works(to_cancel=self.to_cancel)
         works = []
         for k in self.current_running_works:
             if isinstance(self.works[k], Work):
@@ -1374,7 +1436,7 @@ def get_all_works(self):
                   Current running works
         """
-        self.sync_works()
+        self.sync_works(to_cancel=self.to_cancel)
         works = []
 
         for k in self.works:
@@ -1492,14 +1554,15 @@ def first_initialize(self):
             tostart_works = list(self.get_works().keys())
             tostart_works = [tostart_works[0]]
 
-            init_works = []
+            to_start_works = self.to_start_works
             for work_id in tostart_works:
-                self.get_new_work_to_run(work_id)
-                init_works.append(work_id)
-            self.init_works = init_works
+                to_start_works.append(work_id)
+            self.to_start_works = to_start_works
             self.log_debug("first initialized")
 
-    def sync_works(self):
+    def sync_works(self, to_cancel=False):
+        if to_cancel:
+            self.to_cancel = to_cancel
         self.log_debug("synchroning works")
         self.first_initialize()
@@ -1516,7 +1579,7 @@ def sync_works(self):
 
         for work in [self.works[k] for k in self.current_running_works]:
             if isinstance(work, Workflow):
-                work.sync_works()
+                work.sync_works(to_cancel=self.to_cancel)
 
             if work.is_terminated():
                 self.log_debug("work %s is_terminated, sync_global_parameters_from_work" % (work.get_internal_id()))
@@ -1572,12 +1635,15 @@ def sync_works(self):
         log_str += ", num_expired_works: %s" % self.num_expired_works
         log_str += ", num_cancelled_works: %s" % self.num_cancelled_works
         log_str += ", num_suspended_works: %s" % self.num_suspended_works
+        log_str += ", new_to_run_works: %s" % len(self.new_to_run_works)
+        log_str += ", current_running_works: %s" % len(self.current_running_works)
         self.log_debug(log_str)
 
         self.refresh_works()
         self.log_debug("synchronized works")
 
     def resume_works(self):
+        self.to_cancel = False
         self.num_subfinished_works = 0
         self.num_finished_works = 0
         self.num_failed_works = 0
@@ -1716,8 +1782,8 @@ def is_terminated(self):
         """
         *** Function called by Marshaller agent.
         """
-        self.sync_works()
-        if len(self.new_to_run_works) == 0 and len(self.current_running_works) == 0:
+        self.sync_works(to_cancel=self.to_cancel)
+        if (self.to_cancel or len(self.new_to_run_works) == 0) and len(self.current_running_works) == 0:
             return True
         return False
@@ -1960,16 +2026,28 @@ def lifetime(self):
     def lifetime(self, value):
         self.template.lifetime = value
 
+    @property
+    def to_cancel(self):
+        return self.template.to_cancel
+
+    @to_cancel.setter
+    def to_cancel(self, value):
+        if self.runs:
+            self.runs[str(self.num_run)].to_cancel = value
+        self.template.to_cancel = value
+
     @property
     def num_run(self):
         if self.parent_num_run:
-            return self.parent_num_run * 100 + self._num_run
+            # return self.parent_num_run * 100 + self._num_run
+            pass
         return self._num_run
 
     @num_run.setter
     def num_run(self, value):
         if self.parent_num_run:
-            self._num_run = value - self.parent_num_run * 100
+            # self._num_run = value - self.parent_num_run * 100
+            self._num_run = value
         else:
             self._num_run = value
@@ -1985,6 +2063,7 @@ def transforming(self, value):
             self._num_run = 1
         if str(self.num_run) not in self.runs:
             self.runs[str(self.num_run)] = self.template.copy()
+            self.runs[str(self.num_run)].num_run = self.num_run
             if self.runs[str(self.num_run)].has_loop_condition():
                 self.runs[str(self.num_run)].num_run = self.num_run
                 if self._num_run > 1:
@@ -2032,6 +2111,9 @@ def refresh_parameter_links(self):
     def set_global_parameters(self, value):
         self.template.set_global_parameters(value)
 
+    def set_sliced_global_parameters(self, source, name=None, index=0):
+        self.template.set_sliced_global_parameters(source, name=name, index=index)
+
     def sync_global_parameters_from_work(self, work):
         if self.runs:
             return self.runs[str(self.num_run)].sync_global_parameters_from_work(work)
@@ -2039,20 +2121,20 @@ def sync_global_parameters_from_work(self, work):
 
     def get_new_works(self):
         self.log_debug("synchronizing works")
-        self.sync_works()
+        self.sync_works(to_cancel=self.to_cancel)
         self.log_debug("synchronized works")
         if self.runs:
             return self.runs[str(self.num_run)].get_new_works()
         return []
 
     def get_current_works(self):
-        self.sync_works()
+        self.sync_works(to_cancel=self.to_cancel)
         if self.runs:
             return self.runs[str(self.num_run)].get_current_works()
         return []
 
     def get_all_works(self):
-        self.sync_works()
+        self.sync_works(to_cancel=self.to_cancel)
         if self.runs:
             return self.runs[str(self.num_run)].get_all_works()
         return []
@@ -2065,6 +2147,7 @@ def get_primary_initial_collection(self):
     def resume_works(self):
         if self.runs:
             self.runs[str(self.num_run)].resume_works()
+        self.template.to_cancel = False
 
     def clean_works(self):
         # if self.runs:
@@ -2147,34 +2230,40 @@ def refresh_works(self):
         if self.runs:
             self.runs[str(self.num_run)].refresh_works()
 
-    def sync_works(self):
+    def sync_works(self, to_cancel=False):
+        if to_cancel:
+            self.to_cancel = to_cancel
+
         # position is end.
-        if self._num_run < 1:
-            self._num_run = 1
+        if self.num_run < 1:
+            self.num_run = 1
         if str(self.num_run) not in self.runs:
             self.runs[str(self.num_run)] = self.template.copy()
+            self.runs[str(self.num_run)].num_run = self.num_run
             if self.runs[str(self.num_run)].has_loop_condition():
-                self.runs[str(self.num_run)].num_run = self._num_run
-                if self._num_run > 1:
+                self.runs[str(self.num_run)].num_run = self.num_run
+                if self.num_run > 1:
                     p_metadata = self.runs[str(self.num_run - 1)].get_metadata_item('parameter_links')
                     self.runs[str(self.num_run)].add_metadata_item('parameter_links', p_metadata)
 
-        self.runs[str(self.num_run)].sync_works()
+        self.runs[str(self.num_run)].sync_works(to_cancel=to_cancel)
 
         if self.runs[str(self.num_run)].is_terminated():
-            if self.runs[str(self.num_run)].has_loop_condition():
-                if self.runs[str(self.num_run)].get_loop_condition_status():
-                    self.logger.info("num_run %s get_loop_condition_status %s, start next run" % (self.num_run, self.runs[str(self.num_run)].get_loop_condition_status()))
-                    self._num_run += 1
-                    self.runs[str(self.num_run)] = self.template.copy()
+            if to_cancel:
+                self.logger.info("num_run %s, to cancel" % self.num_run)
+            else:
+                if self.runs[str(self.num_run)].has_loop_condition():
+                    if self.runs[str(self.num_run)].get_loop_condition_status():
+                        self.logger.info("num_run %s get_loop_condition_status %s, start next run" % (self.num_run, self.runs[str(self.num_run)].get_loop_condition_status()))
+                        self._num_run += 1
+                        self.runs[str(self.num_run)] = self.template.copy()
 
-                    self.runs[str(self.num_run)].num_run = self._num_run
-                    p_metadata = self.runs[str(self.num_run - 1)].get_metadata_item('parameter_links')
-                    self.runs[str(self.num_run)].add_metadata_item('parameter_links', p_metadata)
+                        self.runs[str(self.num_run)].num_run = self.num_run
+                        p_metadata = self.runs[str(self.num_run - 1)].get_metadata_item('parameter_links')
+                        self.runs[str(self.num_run)].add_metadata_item('parameter_links', p_metadata)
 
-                    self.runs[str(self.num_run)].global_parameters = self.runs[str(self.num_run - 1)].global_parameters
-                else:
-                    self.logger.info("num_run %s get_loop_condition_status %s, terminated loop" % (self.num_run, self.runs[str(self.num_run)].get_loop_condition_status()))
+                        self.runs[str(self.num_run)].global_parameters = self.runs[str(self.num_run - 1)].global_parameters
+                    else:
+                        self.logger.info("num_run %s get_loop_condition_status %s, terminated loop" % (self.num_run, self.runs[str(self.num_run)].get_loop_condition_status()))
 
     def get_relation_map(self):
         if not self.runs: