From 56547c2795702d04b422638eac13b183cdf046b3 Mon Sep 17 00:00:00 2001
From: Wen Guan
Date: Wed, 7 Jun 2023 14:32:54 +0200
Subject: [PATCH 01/19] avoid too much logging

---
 main/lib/idds/agents/common/eventbus/eventbus.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/main/lib/idds/agents/common/eventbus/eventbus.py b/main/lib/idds/agents/common/eventbus/eventbus.py
index 79502f3f..9235e48a 100644
--- a/main/lib/idds/agents/common/eventbus/eventbus.py
+++ b/main/lib/idds/agents/common/eventbus/eventbus.py
@@ -69,7 +69,8 @@ def backend(self):
             self._backend = self._backup_backend
             self.logger.critical("MsgEventBusBackend failed, switch to use BaseEventBusBackendOpt")
         elif self._orig_backend and isinstance(self._orig_backend, MsgEventBusBackend) and self._orig_backend.is_ok():
-            self.logger.critical("MsgEventBusBackend is ok, switch back to use it")
+            if self._backend != self._orig_backend:
+                self.logger.critical("MsgEventBusBackend is ok, switch back to use it")
             self._backend = self._orig_backend
             # self._orig_backend = None
         return self._backend

From 425e8170fe1510f5242a52e4685040843d04684c Mon Sep 17 00:00:00 2001
From: Wen Guan
Date: Fri, 9 Jun 2023 16:13:27 +0200
Subject: [PATCH 02/19] add doma tree and event map

---
 doma/lib/idds/doma/workflowv2/domaeventmap.py | 143 +++++++
 doma/lib/idds/doma/workflowv2/domatree.py     | 353 ++++++++++++++++++
 .../tests/test_domapanda_lsst_workflow.py     | 265 +++++++++++++
 workflow/lib/idds/workflowv2/tree.py          | 165 ++++++++
 4 files changed, 926 insertions(+)
 create mode 100644 doma/lib/idds/doma/workflowv2/domaeventmap.py
 create mode 100644 doma/lib/idds/doma/workflowv2/domatree.py
 create mode 100644 main/lib/idds/tests/test_domapanda_lsst_workflow.py
 create mode 100644 workflow/lib/idds/workflowv2/tree.py

diff --git a/doma/lib/idds/doma/workflowv2/domaeventmap.py b/doma/lib/idds/doma/workflowv2/domaeventmap.py
new file mode 100644
index 00000000..e5a7d1ce
--- /dev/null
+++ b/doma/lib/idds/doma/workflowv2/domaeventmap.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# You may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Authors:
+# - Wen Guan, , 2023
+
+"""
+Map between jobs and events
+"""
+
+
+import os
+import pickle
+
+
+class DomaEventMapJob(object):
+    def __init__(self, task_name, name, events, terminated_status=['finished', 'failed', 'missing']):
+        self.task_name = task_name
+        self.name = name
+        self.events = events
+        self.terminated_status = terminated_status
+        self.event_deps = {}
+        self.event_status = {}
+
+    def construct_event_dependencies(self, job_event_map):
+        for event_index in self.events:
+            self.event_deps[event_index] = []
+            job = self.events[event_index]
+            deps = job.deps
+            for dep in deps:
+                # dep is gwjob
+                event_dep = job_event_map[dep.name]
+                self.event_deps[event_index].append(event_dep)
+
+    def set_event_status(self, event_index, status, reported):
+        self.event_status[str(event_index)] = {'status': status, 'reported': reported}
+
+    def set_event_failed(self, event_index, reported=False):
+        self.set_event_status(event_index, 'failed', reported)
+
+    def set_event_finished(self, event_index, reported=False):
+        self.set_event_status(event_index, 'finished', reported)
+
+    def set_event_missing(self, event_index, reported=False):
+        self.set_event_status(event_index, 'missing', reported)
+
+    def get_events_to_report(self):
+        to_report = {}
+        for event_index in self.event_status:
+            event_status = self.event_status[event_index]
+            if not event_status['reported']:
+                to_report[event_index] = event_status['status']
+        return to_report
+
+    def acknowledge_event_report(self, report):
+        for event_index in report:
+            self.event_status[str(event_index)]['reported'] = True
+
+    def get_event(self, event_index):
+        event_index_str = str(event_index)
+        event = self.events.get(event_index_str, None)
+        return event
+
+    def is_ok_to_process_event(self, event_index):
+        # when a job is released, the external dependencies should be fixed
+        # (except the events which are already marked as failed in panda).
+ # here we will only need to check internal dependencies + event_index_str = str(event_index) + event = self.events.get(event_index_str, None) + if not event: + return False + + deps = self.event_deps.get(event_index_str, []) + for dep in deps: + task_name = dep['group_label'] + job_name = dep['event_job'] + if task_name != self.task_name or job_name != self.name: + # external dependency, skip + continue + event_dep_index = dep['event_index'] + event_dep_status = self.event_status.get(event_dep_index, {}).get('status', None) + if not event_dep_status or event_dep_status not in self.terminated_status: + return False + return True + + +class DomaEventMapTask(object): + def __init__(self, name): + self.name = name + self.jobs = {} + + def add_job(self, job): + self.jobs[job.name] = job + + def get_job(self, job_name): + return self.jobs.get(job_name, None) + + +class DomaEventMap(object): + def __init__(self, name='doma_event_map.pickle', base_dir='./'): + if not name: + name = 'doma_event_map.pickle' + self.name = name + self.base_dir = base_dir + self.tasks = {} + + def add_task(self, task): + self.tasks[task.name] = task + + def get_task(self, task_name): + return self.tasks.get(task_name, None) + + def get_path(self): + if os.path.isabs(self.name): + path = self.name + else: + if self.base_dir: + path = os.path.join(self.base_dir, self.name) + else: + path = self.name + return path + + def save(self): + try: + path = self.get_path() + with open(path, 'wb') as fd: + pickle.dump(self.tasks, fd) + except Exception as ex: + print(ex) + raise Exception(ex) + + def load(self): + try: + path = self.get_path() + with open(path, 'rb') as fd: + self.tasks = pickle.load(fd) + except Exception as ex: + # print(ex) + raise Exception(ex) diff --git a/doma/lib/idds/doma/workflowv2/domatree.py b/doma/lib/idds/doma/workflowv2/domatree.py new file mode 100644 index 00000000..67dee9bc --- /dev/null +++ b/doma/lib/idds/doma/workflowv2/domatree.py @@ -0,0 +1,353 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2023 + + +""" +Construct tree from a generic workflow +""" + +from idds.workflowv2.tree import JobNode, LabelNode, Tree + +from .domaeventmap import DomaEventMap, DomaEventMapTask, DomaEventMapJob + + +class DomaTree(Tree): + def __init__(self, name, **kwargs): + super(DomaTree, self).__init__(name, **kwargs) + self._label_grouping = {'default': {'grouping': True, 'grouping_max_jobs': 100}} + self.job_tree_roots = None + self.job_nodes = None + self.label_jobs = None + self.label_parent_labels = None + + self.label_tree_roots = None + self.label_nodes = None + + def set_grouping_attribute(self, label, grouping, max_jobs): + self._label_grouping[label] = {'grouping': grouping, 'max_jobs': max_jobs} + + def get_job_tree(self, generic_workflow): + job_tree_roots = [] + job_nodes = {} + + label_jobs = {} + label_parent_labels = {} + for job_label in generic_workflow.labels: + if job_label not in label_jobs: + label_jobs[job_label] = [] + label_parent_labels[job_label] = [] + jobs_by_label = generic_workflow.get_jobs_by_label(job_label) + for gwjob in jobs_by_label: + deps = [] + for parent_job_name in generic_workflow.predecessors(gwjob.name): + # deps.append(parent_job_name) + parent_job = generic_workflow.get_job(parent_job_name) + deps.append(parent_job) + if parent_job.label not in label_parent_labels[job_label]: + label_parent_labels[job_label].append(parent_job.label) + + node = JobNode(gwjob.name, label=job_label, gwjob=gwjob, deps=deps) + label_jobs[job_label].append(node) + job_nodes[gwjob.name] = node + + if not node.deps: + job_tree_roots.append(node) + else: + for dep in node.deps: + dep_node = job_nodes[dep.name] + dep_node.add_child(node) + return job_tree_roots, job_nodes, label_jobs, label_parent_labels + + def get_label_tree(self, generic_workflow, label_parent_labels, label_jobs): + label_tree_roots, label_nodes = [], {} + for job_label in generic_workflow.labels: + # jobs_by_label = generic_workflow.get_jobs_by_label(job_label) + jobs_by_label = label_jobs[job_label] + one_gwjob = jobs_by_label[0].gwjob + label_node = LabelNode(job_label, jobs=jobs_by_label, compute_cloud=one_gwjob.compute_cloud, + compute_site=one_gwjob.compute_site, queue=one_gwjob.queue, + request_memory=one_gwjob.request_memory, + one_gwjob=one_gwjob) + label_nodes[job_label] = label_node + if not label_parent_labels[job_label]: + label_tree_roots.append(label_node) + else: + for parent_label in label_parent_labels[job_label]: + parent_node = label_nodes[parent_label] + parent_node.add_child(label_node) + + # set level here + for root in label_tree_roots: + root.level = 0 + return label_tree_roots, label_nodes + + def get_ordered_nodes_by_level(self, roots): + level_dict = {} + has_nodes = True + left_nodes = roots + while has_nodes: + nodes = left_nodes + left_nodes = [] + has_nodes = False + for node in nodes: + if node.level not in level_dict: + level_dict[node.level] = [] + level_dict[node.level].append(node) + if node.children: + left_nodes.extend(node.children) + if left_nodes: + has_nodes = True + + return level_dict + + def group_label_level_dict(self, label_level_dict): + grouped_label_level_dict = {} + current_grouped_level, current_required_resource, current_number_jobs = 0, None, 0 + for level in label_level_dict: + for node in label_level_dict[level]: + whether_to_group = self.whether_to_group(node) + node.whether_to_group = whether_to_group + + if not whether_to_group: 
+ # if there is a previous group, close it + if current_number_jobs: + current_grouped_level += 1 + + grouped_label_level_dict[str(current_grouped_level)] = [node] + current_grouped_level += 1 + current_required_resource, current_number_jobs = None, 0 + else: + num_jobs = len(node.jobs) + # self.logger.debug(node) + print(node) + max_events_per_job = self.get_max_events_per_job(node) + required_resource = "%s_%s_%s_%s_%s" % (node.compute_cloud, node.compute_site, node.queue, node.request_memory, max_events_per_job) + + if str(current_grouped_level) not in grouped_label_level_dict: + grouped_label_level_dict[str(current_grouped_level)] = [] + + if not current_number_jobs: + # new job + current_required_resource = required_resource + current_number_jobs = num_jobs + grouped_label_level_dict[str(current_grouped_level)].append(node) + elif current_required_resource != required_resource or num_jobs + current_number_jobs > max_events_per_job: + # close the current group + current_grouped_level += 1 + # create new group and wait for others to join this group + grouped_label_level_dict[str(current_grouped_level)] = [node] + current_required_resource = required_resource + current_number_jobs = num_jobs + # elif num_jobs >= max_events_per_job / 2: + # # close the current group + # current_grouped_level += 1 + # # create new group as a separate group + # grouped_label_level_dict[str(current_grouped_level)] = [node] + # # move to the next group + # current_grouped_level += 1 + # current_required_resource, current_number_jobs = None, 0 + else: + # group the current node to the previous group + grouped_label_level_dict[str(current_grouped_level)].append(node) + current_number_jobs += num_jobs + return grouped_label_level_dict + + def whether_to_group(self, node): + gwjob = node.one_gwjob + if gwjob: + return gwjob.attrs.get('grouping', True) + return self._label_grouping.get(node.name, {}).get('grouping', True) + + def get_max_events_per_job(self, node): + gwjob = node.one_gwjob + if gwjob: + return gwjob.attrs.get('grouping_max_jobs', 100) + return self._label_grouping.get(node.name, {}).get('grouping_max_jobs', 100) + + def split_big_node(self, node, max_events_per_job=1000): + job_nodes = node.jobs + groups = {} + for job_node in job_nodes: + group_id = job_node.get_potential_group_id() + if group_id not in groups: + groups[group_id] = [] + groups[group_id].append(job_node) + + job_chunks = [] + for group_id in groups: + group_jobs = groups[group_id] + if len(group_jobs) > max_events_per_job: + cluster_chunks = [group_jobs[i:i + max_events_per_job] for i in range(0, len(group_jobs), max_events_per_job)] + job_chunks.extend(cluster_chunks) + else: + job_chunks.append(group_jobs) + + # merge job chunks + merged_job_chunks = [] + current_job_chunk = None + for job_chunk in job_chunks: + if len(job_chunk) > max_events_per_job / 2: + merged_job_chunks.append(job_chunk) + else: + if current_job_chunk is None: + current_job_chunk = job_chunk + else: + if len(current_job_chunk) + len(job_chunk) <= max_events_per_job: + current_job_chunk.extend(job_chunk) + else: + merged_job_chunks.append(current_job_chunk) + current_job_chunk = job_chunk + if current_job_chunk: + merged_job_chunks.append(current_job_chunk) + + return merged_job_chunks + + def construct_grouped_jobs(self, grouped_label_level_dict): + group_jobs, group_label, events, event_index = {}, None, {}, 0 + for level in grouped_label_level_dict: + nodes = grouped_label_level_dict[level] + # one level is one task + group_label = "_".join([node.name for 
node in nodes])
+            if len(nodes) > 1:
+                # multiple nodes to be merged into one job
+                events, event_index = {}, 0
+                group_id = "%s_0" % level
+                event_file = "eventservice_" + group_label + "_" + group_id
+                for node in nodes:
+                    for job in node.jobs:
+                        event_index_str = str(event_index)
+                        event_index += 1
+                        events[event_index_str] = job
+
+                        job.group_label = group_label
+                        job.event_file = event_file
+                        job.event_index = event_index_str
+                        job.group_id = group_id
+                group_jobs[group_label] = [{'name': event_file, 'events': events}]
+            else:
+                # there is only one big node
+                node = nodes[0]
+                max_events_per_job = self.get_max_events_per_job(node)
+                if len(node.jobs) <= max_events_per_job:
+                    events, event_index = {}, 0
+                    group_id = "%s_0" % level
+                    event_file = "eventservice_" + group_label + "_" + group_id
+                    for job in node.jobs:
+                        event_index_str = str(event_index)
+                        event_index += 1
+
+                        events[event_index_str] = job
+
+                        job.group_label = group_label
+                        job.event_file = event_file
+                        job.event_index = event_index_str
+                        job.group_id = group_id
+                    group_jobs[group_label] = [{'name': event_file, 'events': events}]
+                else:
+                    chunks = self.split_big_node(node, max_events_per_job)
+                    group_jobs[group_label] = []
+                    for i, chunk in enumerate(chunks):
+                        events, event_index = {}, 0
+                        group_id = "%s_%s" % (level, i)
+                        event_file = "eventservice_" + group_label + "_" + group_id
+                        for job in chunk:
+                            event_index_str = str(event_index)
+                            event_index += 1
+
+                            events[event_index_str] = job
+                            job.group_id = group_id
+
+                            job.group_label = group_label
+                            job.event_file = event_file
+                            job.event_index = event_index_str
+                        group_jobs[group_label].append({'name': event_file, 'events': events})
+
+        return group_jobs
+
+    def from_generic_workflow(self, generic_workflow):
+        job_tree_roots, job_nodes, label_jobs, label_parent_labels = self.get_job_tree(generic_workflow)
+        self.job_tree_roots = job_tree_roots
+        self.job_nodes = job_nodes
+        self.label_jobs = label_jobs
+        self.label_parent_labels = label_parent_labels
+        print("job tree")
+        print(job_tree_roots)
+        print(job_nodes)
+        print(label_jobs)
+        print(label_parent_labels)
+
+        label_tree_roots, label_nodes = self.get_label_tree(generic_workflow, label_parent_labels, label_jobs)
+        self.label_tree_roots = label_tree_roots
+        self.label_nodes = label_nodes
+        print("label tree")
+        print(label_tree_roots)
+        print(label_nodes)
+
+        label_level_dict = self.get_ordered_nodes_by_level(label_tree_roots)
+        print("label_level_dict")
+        print(label_level_dict)
+
+        grouped_label_level_dict = self.group_label_level_dict(label_level_dict)
+        print("grouped_label_level_dict")
+        print(grouped_label_level_dict)
+
+        # self.logger.debug(grouped_label_level_dict)
+        grouped_jobs = self.construct_grouped_jobs(grouped_label_level_dict)
+        return grouped_jobs
+
+    def construct_map_between_jobs_and_events(self, job_nodes, grouped_jobs):
+        job_event_map = {}
+        for grouped_label in grouped_jobs:
+            for eventservice in grouped_jobs[grouped_label]:
+                name = eventservice['name']
+                events = eventservice['events']
+                for event_index in events:
+                    job = events[event_index]
+                    job_event_map[job.name] = {'group_label': grouped_label, 'event_job': name, 'event_index': event_index}
+        for job_name in job_nodes:
+            if job_name not in job_event_map:
+                raise Exception("Job %s is not converted into EventService maps" % job_name)
+        return job_event_map
+
+    def construct_event_map(self, grouped_jobs, event_map_name=None):
+        job_event_map = self.construct_map_between_jobs_and_events(self.job_nodes, grouped_jobs)
+
+        event_map = DomaEventMap(event_map_name)
+        for grouped_label in grouped_jobs:
+            event_task = DomaEventMapTask(grouped_label)
+            for eventservice in grouped_jobs[grouped_label]:
+                name = eventservice['name']
+                events = eventservice['events']
+                event_job = DomaEventMapJob(grouped_label, name, events)
+                event_job.construct_event_dependencies(job_event_map)
+
+                event_task.add_job(event_job)
+            event_map.add_task(event_task)
+        event_map.save()
+        return event_map
+
+    def construct_idds_work(self, label, jobs, job_nodes):
+        for job in jobs:
+            name = job['name']
+            events = job['events']
+            construct_events = []
+            for event_index in events:
+                job_node = events[event_index]
+                # gwjob = job_node.gwjob
+                deps = job_node.deps
+                construct_event = {'name': name, 'index': event_index, 'dependencies': []}
+                for dep in deps:
+                    dep_job_node = job_nodes[dep.name]
+                    dep_event = {'group_label': dep_job_node.group_label,
+                                 'event_file': dep_job_node.event_file,
+                                 'event_index': dep_job_node.event_index}
+                    construct_event['dependencies'].append(dep_event)
+                construct_events.append(construct_event)
+            job['construct_events'] = construct_events
diff --git a/main/lib/idds/tests/test_domapanda_lsst_workflow.py b/main/lib/idds/tests/test_domapanda_lsst_workflow.py
new file mode 100644
index 00000000..4292b683
--- /dev/null
+++ b/main/lib/idds/tests/test_domapanda_lsst_workflow.py
@@ -0,0 +1,265 @@
+#!/usr/bin/env python
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# You may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Authors:
+# - Wen Guan, , 2023
+
+
+"""
+Test lsst generic workflow.
+"""
+
+# import json
+import logging
+
+logging.basicConfig(level=logging.DEBUG)
+
+# import traceback
+
+from collections import Counter    # noqa #402
+from lsst.ctrl.bps import generic_workflow as gw    # noqa #402
+
+# from rucio.client.client import Client as Rucio_Client
+# from rucio.common.exception import CannotAuthenticate
+
+# from idds.client.client import Client
+from idds.client.clientmanager import ClientManager    # noqa E402
+# from idds.common.constants import RequestType, RequestStatus
+# from idds.common.utils import get_rest_host
+# from idds.tests.common import get_example_real_tape_stagein_request
+# from idds.tests.common import get_example_prodsys2_tape_stagein_request
+
+# from idds.workflowv2.work import Work, Parameter, WorkStatus
+# from idds.workflowv2.workflow import Condition, Workflow
+from idds.workflowv2.workflow import Workflow, Condition    # noqa E402
+# from idds.atlas.workflowv2.atlasstageinwork import ATLASStageinWork
+from idds.doma.workflowv2.domapandawork import DomaPanDAWork    # noqa E402
+from idds.doma.workflowv2.domatree import DomaTree    # noqa E402
+from idds.doma.workflowv2.domaeventmap import DomaEventMap    # noqa E402
+
+
+def setup_gw_workflow():
+    gwf = gw.GenericWorkflow("mytest")
+
+    exec1 = gw.GenericWorkflowExec(
+        name="test.py", src_uri="${CTRL_BPS_DIR}/bin/test1.py", transfer_executable=False
+    )
+    job1 = gw.GenericWorkflowJob("job1", label="label1")
+    job1.quanta_counts = Counter({"pt1": 1, "pt2": 2})
+    job1.executable = exec1
+
+    job2 = gw.GenericWorkflowJob("job2", label="label2")
+    job2.quanta_counts = Counter({"pt1": 1, "pt2": 2})
+    job2.executable = exec1
+
+    job3 = gw.GenericWorkflowJob("job3")
+    job3.label = "label2"
+    job3.quanta_counts = Counter({"pt1": 1, "pt2": 2})
+    job3.executable = exec1
+
+    gwf.add_job(job1)
+    gwf.add_job(job2)
+    gwf.add_job(job3)
+
gwf.add_job_relationships("job1", ["job2", "job3"]) + # gwf.add_job_relationships("job1", "job2") + + srcjob1 = gw.GenericWorkflowJob("srcjob1") + srcjob1.label = "srclabel1" + srcjob1.executable = exec1 + srcjob2 = gw.GenericWorkflowJob("srcjob2") + srcjob2.label = "srclabel1" + srcjob2.executable = exec1 + srcjob3 = gw.GenericWorkflowJob("srcjob3") + srcjob3.label = "srclabel2" + srcjob3.executable = exec1 + srcjob4 = gw.GenericWorkflowJob("srcjob4") + srcjob4.label = "srclabel2" + srcjob4.executable = exec1 + gwf2 = gw.GenericWorkflow("mytest2") + gwf2.add_job(srcjob1) + gwf2.add_job(srcjob2) + gwf2.add_job(srcjob3) + gwf2.add_job(srcjob4) + gwf2.add_job_relationships("srcjob1", "srcjob3") + gwf2.add_job_relationships("srcjob2", "srcjob4") + + gwf.add_workflow_source(gwf2) + return gwf + + +def setup_gw_workflow2(): + gwf = gw.GenericWorkflow("mytest") + + exec1 = gw.GenericWorkflowExec( + name="test1.py", src_uri="${CTRL_BPS_DIR}/bin/test1.py", transfer_executable=False + ) + + exec2 = gw.GenericWorkflowExec( + name="test2.py", src_uri="${CTRL_BPS_DIR}/bin/test2.py", transfer_executable=False + ) + + exec3 = gw.GenericWorkflowExec( + name="test3.py", src_uri="${CTRL_BPS_DIR}/bin/test3.py", transfer_executable=False + ) + + exec4 = gw.GenericWorkflowExec( + name="test4.py", src_uri="${CTRL_BPS_DIR}/bin/test4.py", transfer_executable=False + ) + + exec5 = gw.GenericWorkflowExec( + name="test5.py", src_uri="${CTRL_BPS_DIR}/bin/test5.py", transfer_executable=False + ) + + job1 = gw.GenericWorkflowJob("init1", label="init") + job1.quanta_counts = Counter({"pt1": 1, "pt2": 2}) + job1.executable = exec1 + gwf.add_job(job1) + + for i in range(1000): + job2 = gw.GenericWorkflowJob("isr_%s" % i, label="isr") + job2.quanta_counts = Counter({"pt1": 1, "pt2": 2}) + job2.executable = exec2 + job2.attrs['grouping'] = True + job2.attrs['grouping_max_jobs'] = 160 + gwf.add_job(job2) + gwf.add_job_relationships("init1", "isr_%s" % i) + + for i in range(1000): + job3 = gw.GenericWorkflowJob("characterizeImage_%s" % i, label="characterizeImage") + job3.quanta_counts = Counter({"pt1": 1, "pt2": 2}) + job3.executable = exec3 + job3.attrs['grouping'] = True + job3.attrs['grouping_max_jobs'] = 160 + gwf.add_job(job3) + gwf.add_job_relationships("isr_%s" % i, "characterizeImage_%s" % i) + + for i in range(1000): + job4 = gw.GenericWorkflowJob("calibrate_%s" % i, label="calibrate") + job4.quanta_counts = Counter({"pt1": 1, "pt2": 2}) + job4.executable = exec4 + job4.attrs['grouping'] = True + job4.attrs['grouping_max_jobs'] = 100 + gwf.add_job(job4) + gwf.add_job_relationships("characterizeImage_%s" % i, "calibrate_%s" % i) + + for i in range(10): + job5 = gw.GenericWorkflowJob("writePreSourceTable_%s" % i, label="writePreSourceTable") + job5.quanta_counts = Counter({"pt1": 1, "pt2": 2}) + job5.executable = exec5 + job5.attrs['grouping'] = True + job5.attrs['grouping_max_jobs'] = 8 + gwf.add_job(job5) + + for i in range(1000): + gwf.add_job_relationships("calibrate_%s" % i, "writePreSourceTable_%s" % int(i / 100)) + + return gwf + + +def test_show_jobs(generic_workflow): + for job_label in generic_workflow.labels: + jobs_by_label = generic_workflow.get_jobs_by_label(job_label) + for gwjob in jobs_by_label: + # pseudo_filename = _make_pseudo_filename(config, gwjob) + # job_to_pseudo_filename[gwjob.name] = pseudo_filename + # job_to_task[gwjob.name] = work.get_work_name() + + # deps = [] + for parent_job_name in generic_workflow.predecessors(gwjob.name): + # deps.append({"task": job_to_task[parent_job_name], + # 
"inputname": job_to_pseudo_filename[parent_job_name], + # "available": False}) + pass + + # job = {"name": pseudo_filename, "dependencies": deps} + + +def construct_doma_jobs(generic_workflow): + tree = DomaTree('test_tree') + grouped_jobs = tree.from_generic_workflow(generic_workflow) + # print(grouped_jobs) + for grouped_label in grouped_jobs: + print(grouped_label) + for eventservice in grouped_jobs[grouped_label]: + print(" %s" % eventservice['name']) + print(" %s" % eventservice['events']) + + event_map = tree.construct_event_map(grouped_jobs) + print(event_map) + + event_map1 = DomaEventMap() + event_map1.load() + print(event_map1) + + +def test(): + gw_workflow = setup_gw_workflow() + # print(json.dumps(gw_workflow)) + # gw_workflow = setup_gw_workflow2() + test_show_jobs(gw_workflow) + construct_doma_jobs(gw_workflow) + + +def test_load(): + event_map = DomaEventMap() + event_map.load() + print(event_map) + + # event file name + task_name = "srclabel1_srclabel2_label1_label2" + job_name = "eventservice_srclabel1_srclabel2_label1_label2_0_0" + event_id = [0, 1, 2, 3] + event_task = event_map.get_task(task_name) + event_job = event_task.get_job(job_name) + + # sync event status from panda + # get event status from panda + event_job.set_event_finished(event_id[0], reported=True) # reported True means that we don't need to update this event to panda + event_job.set_event_failed(event_id[1], reported=True) + event_job.set_event_missing(event_id[2], reported=True) + + # check the forth event + is_ok = event_job.is_ok_to_process_event(event_id[3]) + print("is_ok: %s" % is_ok) + if is_ok: + event = event_job.get_event(event_id[3]) + # process event + print(event) + # ctrl bps job + print(event.gwjob) + event_job.set_event_finished(event.event_index) + + to_report = event_job.get_events_to_report() + print(to_report) + # report event status to panda + event_job.acknowledge_event_report(to_report) # update the report status, to avoid reporting it again + to_report = event_job.get_events_to_report() + print(to_report) + +def test1(): + # gw_workflow = setup_gw_workflow() + # print(json.dumps(gw_workflow)) + gw_workflow = setup_gw_workflow2() + test_show_jobs(gw_workflow) + construct_doma_jobs(gw_workflow) + + +def test_load1(): + event_map = DomaEventMap() + event_map.load() + print(event_map) + + # event file name + # task_name = "srclabel1_srclabel2_label1_label2" + # job_name = "eventservice_srclabel1_srclabel2_label1_label2_0_0" + + +if __name__ == '__main__': + test() + test_load() + # test1() + # test_load1() diff --git a/workflow/lib/idds/workflowv2/tree.py b/workflow/lib/idds/workflowv2/tree.py new file mode 100644 index 00000000..004d4871 --- /dev/null +++ b/workflow/lib/idds/workflowv2/tree.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2023 + + +""" +Construct tree from a workflow +""" + +from idds.common.utils import setup_logging + +from .base import Base + + +setup_logging(__name__) + + +class Node(Base): + def __init__(self, name, children=None, parents=None, index=None, level=None, groups=None, disable_grouping=False, **kwargs): + self.__dict__.update(kwargs) + + if children is None: + children = [] + if parents is None: + parents = [] + if groups is None: + groups = [] + self.name = name + self.index = index + self.level = level + self.children = children + self.parents = parents + self.groups = [] + self.group_id = None + self.disable_grouping = disable_grouping + + self.setup_logger() + + @property + def index(self): + return str(self.__index) + + @index.setter + def index(self, value): + if value: + self.__index = str(value) + else: + self.__index = value + + @property + def level(self): + return str(self.__level) + + @level.setter + def level(self, value): + if value is not None: + if self.__level is None or (self.__level is not None and value > self.__level): + self.__level = value + for child in self.children: + child.level = value + 1 + else: + self.__level = value + + def add_child(self, obj): + self.children.append(obj) + obj.parents.append(self) + + def add_group(self, group): + if group and group not in self.groups: + self.groups.append(group) + for child in self.children: + if not child.group_id: + child.add_group(group) + for parent in self.parents: + if not parent.group_id: + parent.add_group(group) + + def get_potential_group_id(self): + if not self.groups: + return "None" + return "_".join([str(i) for i in sorted(self.groups)]) + + @property + def group_id(self): + return self.__group_id + + @group_id.setter + def group_id(self, group_id): + self.__group_id = group_id + if group_id: + for child in self.children: + if not child.group_id: + child.add_group(group_id) + for parent in self.parents: + if not parent.group_id: + parent.add_group(group_id) + + def get_node_name(self): + return get_node_name(self.index, self.name) + + def __repr__(self): + # if self.__level: + # return "_" * self.__level + "Node(name: %s, level: %s)" % (self.name, self.__level) + # return "Node(name: %s, level: %s, group_id: %s, groups: %s, parents: %s, children: %s)" % (self.name, self.__level, self.group_id, self.groups, self.parents, self.children) + return "Node(name: %s, level: %s)" % (self.name, self.__level) + + +class WorkNode(Node): + def __init__(self, name, work=None, children=None, parents=None, index=None, level=None, groups=None, + disable_grouping=False, **kwargs): + super(WorkNode, self).__init__(name=name, work=work, children=children, parents=parents, + index=index, level=level, groups=groups, + disable_grouping=disable_grouping, **kwargs) + + def __repr__(self): + return "WorkNode(name: %s, level: %s)" % (self.name, self.level) + + +class LabelNode(Node): + def __init__(self, name, jobs=None, children=None, parents=None, index=None, level=None, groups=None, + disable_grouping=False, **kwargs): + super(LabelNode, self).__init__(name=name, jobs=jobs, children=children, parents=parents, + index=index, level=level, groups=groups, + disable_grouping=disable_grouping, **kwargs) + + def __repr__(self): + return "LabelNode(name: %s, level: %s)" % (self.name, self.level) + + +class JobNode(Node): + def __init__(self, name, work_node=None, children=None, parents=None, index=None, level=None, + 
groups=None, disable_grouping=False, **kwargs): + super(JobNode, self).__init__(name=name, work=work_node, children=children, parents=parents, + index=index, level=level, groups=groups, + disable_grouping=disable_grouping, **kwargs) + + def __repr__(self): + return "JobNode(name: %s, level: %s)" % (self.name, self.level) + + def get_node_name(self): + index = self.index + if self.work_node: + index = self.work_node.index + return get_node_name(index, self.name) + + +def get_node_name(index=None, name=None): + if index: + return "%s:%s" % (index, name) + return name + + +class Tree(Base): + def __init__(self, name, **kwargs): + self.__dict__.update(kwargs) + self.name = name + self.roots = [] + + self.setup_logger() From 94740715f76faf8baec73b555607b61a1f664737 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 13 Jun 2023 23:14:25 +0200 Subject: [PATCH 03/19] fix duplication during abort contents --- doma/lib/idds/doma/workflowv2/domapandawork.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/doma/lib/idds/doma/workflowv2/domapandawork.py b/doma/lib/idds/doma/workflowv2/domapandawork.py index 42a6a005..7852daba 100644 --- a/doma/lib/idds/doma/workflowv2/domapandawork.py +++ b/doma/lib/idds/doma/workflowv2/domapandawork.py @@ -1126,17 +1126,20 @@ def get_contents_ext(self, input_output_maps, contents_ext, contents_ext_full, j self.logger.debug("get_contents_ext, left_contents[:1]: %s" % (str(left_contents[:3]))) return new_contents_ext_d, update_contents_ext_d, left_contents - def abort_contents(self, input_output_maps, updated_contents, contents_ext): + def abort_contents(self, input_output_maps, updated_contents, contents_ext, to_update_new_contents_ext): contents_ext_dict = {content['content_id']: content for content in contents_ext} new_contents_ext = [] + updated_contents_ids = [c['content_id'] for c in updated_contents] + new_contents_ext_ids = [c['content_id'] for c in to_update_new_contents_ext] for map_id in input_output_maps: outputs = input_output_maps[map_id]['outputs'] for content in outputs: - update_content = {'content_id': content['content_id'], - 'substatus': ContentStatus.Missing} - updated_contents.append(update_content) - if content['content_id'] not in contents_ext_dict: + if content['content_id'] not in updated_contents_ids: + update_content = {'content_id': content['content_id'], + 'substatus': ContentStatus.Missing} + updated_contents.append(update_content) + if content['content_id'] not in contents_ext_dict and content['content_id'] not in new_contents_ext_ids: new_content_ext = {'content_id': content['content_id'], 'request_id': content['request_id'], 'transform_id': content['transform_id'], @@ -1193,7 +1196,7 @@ def poll_panda_task(self, processing=None, input_output_maps=None, contents_ext= contents_ext_full, job_info_maps) # if left_jobs: if processing_status in [ProcessingStatus.Cancelled]: - updated_contents, new_contents_ext1 = self.abort_contents(input_output_maps, updated_contents, contents_ext) + updated_contents, new_contents_ext1 = self.abort_contents(input_output_maps, updated_contents, contents_ext, new_contents_ext) new_contents_ext = new_contents_ext + new_contents_ext1 return processing_status, updated_contents, update_contents_full, new_contents_ext, update_contents_ext From ace59f08feb779b9e5f36b95dd0bb1b8fc8bb15e Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 22 Jun 2023 10:47:43 +0200 Subject: [PATCH 04/19] fix abort requests without activated tasks --- main/lib/idds/agents/clerk/clerk.py | 7 +++++++ 1 file 
changed, 7 insertions(+)

diff --git a/main/lib/idds/agents/clerk/clerk.py b/main/lib/idds/agents/clerk/clerk.py
index 6296b2c3..6727ea95 100644
--- a/main/lib/idds/agents/clerk/clerk.py
+++ b/main/lib/idds/agents/clerk/clerk.py
@@ -1121,6 +1121,7 @@ def process_abort_request(self, event):
                     wf = req['request_metadata']['build_workflow']
                     works = wf.get_all_works()
                     if works:
+                        has_abort_work = False
                         for work in works:
                             if (work.is_started() or work.is_starting()) and not work.is_terminated():
                                 if not to_abort_transform_id or to_abort_transform_id == work.get_work_id():
@@ -1129,6 +1130,12 @@ def process_abort_request(self, event):
                                                                  transform_id=work.get_work_id(),
                                                                  content=event._content)
                                     self.event_bus.send(event)
+                                    has_abort_work = True
+                        if not has_abort_work:
+                            self.logger.info(log_pre + "not has abort work")
+                            self.logger.info(log_pre + "UpdateRequestEvent(request_id: %s)" % str(req['request_id']))
+                            event = UpdateRequestEvent(publisher_id=self.id, request_id=req['request_id'], content=event._content)
+                            self.event_bus.send(event)
                     else:
                         # no works. should trigger update request
                         self.logger.info(log_pre + "UpdateRequestEvent(request_id: %s)" % str(req['request_id']))

From 69f8d9b0206c99c1761ddd543131bd77b8a7f20a Mon Sep 17 00:00:00 2001
From: Wen Guan
Date: Tue, 4 Jul 2023 15:38:17 +0200
Subject: [PATCH 05/19] refactor domapandawork

---
 .../lib/idds/doma/workflowv2/domapandawork.py | 524 ++++++------------
 1 file changed, 169 insertions(+), 355 deletions(-)

diff --git a/doma/lib/idds/doma/workflowv2/domapandawork.py b/doma/lib/idds/doma/workflowv2/domapandawork.py
index 7852daba..f6bbb266 100644
--- a/doma/lib/idds/doma/workflowv2/domapandawork.py
+++ b/doma/lib/idds/doma/workflowv2/domapandawork.py
@@ -84,6 +84,7 @@ def __init__(self, executable=None, arguments=None, parameters=None, setup=None,
         # self.logger.setLevel(logging.DEBUG)

         self.task_name = task_name
+        self.orig_task_name = self.task_name
         self.real_task_name = None
         self.set_work_name(task_name)
         self.task_queue = task_queue
@@ -210,7 +211,7 @@ def load_panda_urls(self):
         os.environ['PANDA_CONFIG_ROOT'] = self.panda_config_root

     def set_agent_attributes(self, attrs, req_attributes=None):
-        if 'life_time' not in attrs[self.class_name] or int(attrs[self.class_name]['life_time']) <= 0:
+        if self.class_name in attrs and ('life_time' not in attrs[self.class_name] or int(attrs[self.class_name]['life_time']) <= 0):
             attrs['life_time'] = None
         super(DomaPanDAWork, self).set_agent_attributes(attrs)
         if 'num_retries' in self.agent_attributes and self.agent_attributes['num_retries']:
@@ -533,7 +534,7 @@ def create_processing(self, input_output_maps=[]):
         task_param_map['cloud'] = self.task_cloud
         task_param_map['PandaSite'] = self.task_site
         if self.task_rss and self.task_rss > 0:
-            task_param_map['ramCount'] = self.task_rss
+            task_param_map['ramCount'] = self.task_rss / self.core_count if self.core_count else self.task_rss
             # task_param_map['ramUnit'] = 'MB'
             task_param_map['ramUnit'] = 'MBPerCoreFixed'

@@ -733,51 +734,6 @@ def reactive_contents(self, input_output_maps):
                 updated_contents.append(update_content)
         return updated_contents

-    def sort_panda_jobids(self, input_output_maps):
-        panda_job_ids = {}
-        panda_id_to_map_ids = {}
-        map_id_without_panda_ids = []
-        for map_id in input_output_maps:
-            outputs = input_output_maps[map_id]['outputs']
-            for content in outputs:
-                if content['status'] not in panda_job_ids:
-                    panda_job_ids[content['status']] = []
-
-                if 'panda_id' in content['content_metadata']:
-
panda_job_ids[content['status']].append(content['content_metadata']['panda_id']) - panda_id_to_map_ids[content['content_metadata']['panda_id']] = map_id - else: - map_id_without_panda_ids.append(map_id) - - return panda_job_ids, map_id_without_panda_ids, panda_id_to_map_ids - - def get_registered_panda_jobids(self, input_output_maps): - panda_job_ids, map_id_without_panda_ids, panda_id_to_map_ids = self.sort_panda_jobids(input_output_maps) - unterminated_panda_ids = [] - finished_panda_ids = [] - failed_panda_ids = [] - for key in panda_job_ids: - if key in [ContentStatus.Available]: - finished_panda_ids += panda_job_ids[key] - elif key in [ContentStatus.Failed, ContentStatus.FinalFailed, - ContentStatus.Lost, ContentStatus.Deleted, - ContentStatus.Missing]: - failed_panda_ids += panda_job_ids[key] - else: - unterminated_panda_ids += panda_job_ids[key] - return finished_panda_ids + failed_panda_ids, unterminated_panda_ids, map_id_without_panda_ids, panda_id_to_map_ids - - def get_map_id_from_input(self, input_output_maps, input_file): - map_keys = list(input_output_maps.keys()) - map_keys.reverse() - for map_id in map_keys: - inputs = input_output_maps[map_id]['inputs'] - # outputs = input_output_maps[map_id]['outputs'] - for content in inputs: - if content['name'] == input_file: - return map_id - return None - def get_content_status_from_panda_status(self, job_info): if job_info is None: return ContentStatus.Processing @@ -798,29 +754,48 @@ def get_content_status_from_panda_status(self, job_info): else: return ContentStatus.Processing - def get_update_contents_from_map_id(self, map_id, input_output_maps, job_info): - outputs = input_output_maps[map_id]['outputs'] - update_contents = [] - for content in outputs: - status = self.get_content_status_from_panda_status(job_info) - content['substatus'] = status - - if 'panda_id' in content['content_metadata'] and content['content_metadata']['panda_id']: - # if content['content_metadata']['panda_id'] != job_info.PandaID: - if content['content_metadata']['panda_id'] < job_info.PandaID: - # new panda id is the bigger one. 
- if 'old_panda_id' not in content['content_metadata']: - content['content_metadata']['old_panda_id'] = [] - if content['content_metadata']['panda_id'] not in content['content_metadata']['old_panda_id']: - content['content_metadata']['old_panda_id'].append(content['content_metadata']['panda_id']) - content['content_metadata']['panda_id'] = job_info.PandaID - - update_contents.append(content) - return update_contents - - def get_panda_job_status(self, jobids): - jobids = list(jobids) - self.logger.debug("get_panda_job_status, jobids[:10]: %s" % str(jobids[:10])) + def get_unterminated_jobs(self, all_jobs_ids, input_output_maps, contents_ext): + finished_jobs, failed_jobs = [], [] + + contents_ext_dict = {content['content_id']: content for content in contents_ext} + + for map_id in input_output_maps: + outputs = input_output_maps[map_id]['outputs'] + for content in outputs: + if content['substatus'] in [ContentStatus.Available]: + if 'panda_id' in content['content_metadata']: + panda_id = content['content_metadata']['panda_id'] + if content['content_id'] not in contents_ext_dict: + continue + + content_ext = contents_ext_dict[content['content_id']] + if content['substatus'] != content_ext['status'] or panda_id != content_ext['panda_id']: + continue + + if panda_id not in finished_jobs: + finished_jobs.append(panda_id) + elif content['substatus'] in [ContentStatus.FinalFailed, + ContentStatus.Lost, ContentStatus.Deleted, + ContentStatus.Missing]: + if 'panda_id' in content['content_metadata']: + panda_id = content['content_metadata']['panda_id'] + if content['content_id'] not in contents_ext_dict: + continue + + content_ext = contents_ext_dict[content['content_id']] + if content['substatus'] != content_ext['status'] or panda_id != content_ext['panda_id']: + continue + + if panda_id not in failed_jobs: + failed_jobs.append(panda_id) + + all_jobs_ids = set(all_jobs_ids) + terminated_jobs = set(finished_jobs + failed_jobs) + unterminated_jobs = all_jobs_ids - terminated_jobs + return list(unterminated_jobs) + + def get_panda_job_status(self, jobids, log_prefix=''): + self.logger.debug(log_prefix + "get_panda_job_status, jobids[:10]: %s" % str(jobids[:10])) try: from pandaclient import Client ret = Client.getJobStatus(jobids, verbose=0) @@ -843,80 +818,24 @@ def get_panda_job_status(self, jobids): self.logger.error(str(ex)) self.logger.error(traceback.format_exc()) return ret_jobs + else: + self.logger.warn(log_prefix + "get_panda_job_status failed: %s" % str(ret)) + return [] except Exception as ex: self.logger.error(str(ex)) self.logger.error(traceback.format_exc()) return [] - def map_panda_ids(self, unregistered_job_ids, input_output_maps): - self.logger.debug("map_panda_ids, unregistered_job_ids[:10]: %s" % str(unregistered_job_ids[:10])) - - # updated_map_ids = [] - full_update_contents = [] - chunksize = 2000 - chunks = [unregistered_job_ids[i:i + chunksize] for i in range(0, len(unregistered_job_ids), chunksize)] - for chunk in chunks: - # jobs_list = Client.getJobStatus(chunk, verbose=0)[1] - jobs_list = self.get_panda_job_status(chunk) - for job_info in jobs_list: - if job_info and job_info.Files and len(job_info.Files) > 0: - for job_file in job_info.Files: - # if job_file.type in ['log']: - if job_file.type not in ['pseudo_input']: - continue - if ':' in job_file.lfn: - pos = job_file.lfn.find(":") - input_file = job_file.lfn[pos + 1:] - # input_file = job_file.lfn.split(':')[1] - else: - input_file = job_file.lfn - map_id = self.get_map_id_from_input(input_output_maps, input_file) - 
if map_id: - update_contents = self.get_update_contents_from_map_id(map_id, input_output_maps, job_info) - full_update_contents += update_contents - return full_update_contents - - def get_status_changed_contents(self, unterminated_job_ids, input_output_maps, panda_id_to_map_ids): - self.logger.debug("get_status_changed_contents, unterminated_job_ids[:10]: %s" % str(unterminated_job_ids[:10])) - - full_update_contents = [] - chunksize = 2000 - chunks = [unterminated_job_ids[i:i + chunksize] for i in range(0, len(unterminated_job_ids), chunksize)] - for chunk in chunks: - # jobs_list = Client.getJobStatus(chunk, verbose=0)[1] - jobs_list = self.get_panda_job_status(chunk) - for job_info in jobs_list: - panda_id = job_info.PandaID - map_id = panda_id_to_map_ids[panda_id] - update_contents = self.get_update_contents_from_map_id(map_id, input_output_maps, job_info) - full_update_contents += update_contents - return full_update_contents - - def get_final_update_contents(self, input_output_maps): - update_contents = [] - for map_id in input_output_maps: - outputs = input_output_maps[map_id]['outputs'] if 'outputs' in input_output_maps[map_id] else [] - for content in outputs: - if (content['substatus'] not in [ContentStatus.Available, ContentStatus.FakeAvailable, ContentStatus.FinalFailed]): - content['content_metadata']['old_final_status'] = content['substatus'] - content['substatus'] = ContentStatus.FinalFailed - update_contents.append(content) - - return update_contents - - def poll_panda_jobs(self, job_ids): - job_ids = list(job_ids) - self.logger.debug("poll_panda_jobs, poll_panda_jobs_chunk_size: %s, job_ids[:10]: %s" % (self.poll_panda_jobs_chunk_size, str(job_ids[:10]))) - - # updated_map_ids = [] - inputname_jobid_map = {} + def poll_panda_jobs(self, job_ids, log_prefix=''): + job_status_info = {} + self.logger.debug(log_prefix + "poll_panda_jobs, poll_panda_jobs_chunk_size: %s, job_ids[:10]: %s" % (self.poll_panda_jobs_chunk_size, str(job_ids[:10]))) chunksize = self.poll_panda_jobs_chunk_size chunks = [job_ids[i:i + chunksize] for i in range(0, len(job_ids), chunksize)] for chunk in chunks: # jobs_list = Client.getJobStatus(chunk, verbose=0)[1] - jobs_list = self.get_panda_job_status(chunk) + jobs_list = self.get_panda_job_status(chunk, log_prefix=log_prefix) if jobs_list: - self.logger.debug("poll_panda_jobs, input jobs: %s, output_jobs: %s" % (len(chunk), len(jobs_list))) + self.logger.debug(log_prefix + "poll_panda_jobs, input jobs: %s, output_jobs: %s" % (len(chunk), len(jobs_list))) for job_info in jobs_list: job_status = self.get_content_status_from_panda_status(job_info) if job_info and job_info.Files and len(job_info.Files) > 0: @@ -930,226 +849,119 @@ def poll_panda_jobs(self, job_ids): # input_file = job_file.lfn.split(':')[1] else: input_file = job_file.lfn - inputname_jobid_map[input_file] = {'panda_id': job_info.PandaID, 'status': job_status, 'job_info': job_info} + job_status_info[input_file] = {'panda_id': job_info.PandaID, 'status': job_status, 'job_info': job_info} else: - self.logger.warn("poll_panda_jobs, input jobs: %s, output_jobs: %s" % (len(chunk), jobs_list)) - return inputname_jobid_map + self.logger.warn(log_prefix + "poll_panda_jobs, input jobs: %s, output_jobs: %s" % (len(chunk), jobs_list)) + return job_status_info - def get_job_maps(self, input_output_maps): - inputname_mapid_map = {} - finished_jobs, failed_jobs = [], [] + def get_update_contents(self, unterminated_jobs_status, input_output_maps, contents_ext, job_info_maps, abort=False, log_prefix=''): 
+ inputname_to_map_id_outputs = {} for map_id in input_output_maps: inputs = input_output_maps[map_id]['inputs'] outputs = input_output_maps[map_id]['outputs'] - for content in outputs: - if content['substatus'] in [ContentStatus.Available]: - if 'panda_id' in content['content_metadata']: - finished_jobs.append(content['content_metadata']['panda_id']) - elif content['substatus'] in [ContentStatus.FinalFailed, - ContentStatus.Lost, ContentStatus.Deleted, - ContentStatus.Missing]: - if 'panda_id' in content['content_metadata']: - failed_jobs.append(content['content_metadata']['panda_id']) for content in inputs: - inputname_mapid_map[content['name']] = {'map_id': map_id, - 'outputs': outputs} - return finished_jobs + failed_jobs, inputname_mapid_map + inputname_to_map_id_outputs[content['name']] = {'map_id': map_id, 'outputs': outputs} - def get_update_contents(self, inputnames, inputname_mapid_map, inputname_jobid_map): - self.logger.debug("get_update_contents, inputnames[:5]: %s" % str(inputnames[:5])) - # self.logger.debug("get_update_contents, inputname_mapid_map[:5]: %s" % str({k: inputname_mapid_map[k] for k in inputnames[:5]})) - self.logger.debug("get_update_contents, inputname_jobid_map[:3]: %s" % str({k: inputname_jobid_map[k] for k in inputnames[:3]})) + contents_ext_dict = {content['content_id']: content for content in contents_ext} - update_contents = [] - update_contents_full = [] - contents_ext_full = {} - num_updated_contents, num_unupdated_contents = 0, 0 - for inputname in inputnames: - panda_id_status = inputname_jobid_map[inputname] - panda_id = panda_id_status['panda_id'] - panda_status = panda_id_status['status'] - job_info = panda_id_status['job_info'] - map_id_contents = inputname_mapid_map[inputname] - contents = map_id_contents['outputs'] - for content in contents: - if content['substatus'] != panda_status: - # content['status'] = panda_status - content['substatus'] = panda_status - update_contents_full.append(content) - update_content = {'content_id': content['content_id'], - # 'status': panda_status, - 'substatus': panda_status} - # 'content_metadata': content['content_metadata'] - if 'panda_id' in content['content_metadata'] and content['content_metadata']['panda_id']: - # if content['content_metadata']['panda_id'] != job_info.PandaID: - if content['content_metadata']['panda_id'] < panda_id: - # new panda id is the bigger one. 
- if 'old_panda_id' not in content['content_metadata']: - content['content_metadata']['old_panda_id'] = [] - if content['content_metadata']['panda_id'] not in content['content_metadata']['old_panda_id']: - content['content_metadata']['old_panda_id'].append(content['content_metadata']['panda_id']) - content['content_metadata']['panda_id'] = panda_id - # content['status'] = panda_status - content['substatus'] = panda_status - update_content['content_metadata'] = content['content_metadata'] - elif content['content_metadata']['panda_id'] > panda_id: - if 'old_panda_id' not in content['content_metadata']: - content['content_metadata']['old_panda_id'] = [] - if panda_id not in content['content_metadata']['old_panda_id']: - content['content_metadata']['old_panda_id'].append(panda_id) - # content['content_metadata']['panda_id'] = content['content_metadata']['panda_id'] - # content['substatus'] = panda_status - update_content['content_metadata'] = content['content_metadata'] - else: - pass - # content['content_metadata']['panda_id'] = panda_id - content['substatus'] = panda_status - else: + update_contents, update_contents_full = [], [] + new_contents_ext, update_contents_ext = [], [] + update_contents_dict, new_contents_ext_dict = {}, {} + for input_file in unterminated_jobs_status: + panda_job_status = unterminated_jobs_status[input_file] + panda_id = panda_job_status['panda_id'] + panda_status = panda_job_status['status'] + job_info = panda_job_status['job_info'] + + output_contents = inputname_to_map_id_outputs[input_file]['outputs'] + for content in output_contents: + content['substatus'] = panda_status + update_contents_full.append(content) + update_content = {'content_id': content['content_id'], + # 'status': panda_status, + 'substatus': panda_status} + + if 'panda_id' in content['content_metadata'] and content['content_metadata']['panda_id']: + if content['content_metadata']['panda_id'] < panda_id: + # new panda id is the bigger one. 
+ if 'old_panda_id' not in content['content_metadata']: + content['content_metadata']['old_panda_id'] = [] + if content['content_metadata']['panda_id'] not in content['content_metadata']['old_panda_id']: + content['content_metadata']['old_panda_id'].append(content['content_metadata']['panda_id']) content['content_metadata']['panda_id'] = panda_id - content['substatus'] = panda_status update_content['content_metadata'] = content['content_metadata'] - - update_contents.append(update_content) - num_updated_contents += 1 + elif content['content_metadata']['panda_id'] > panda_id: + if 'old_panda_id' not in content['content_metadata']: + content['content_metadata']['old_panda_id'] = [] + if panda_id not in content['content_metadata']['old_panda_id']: + content['content_metadata']['old_panda_id'].append(panda_id) + # content['content_metadata']['panda_id'] = content['content_metadata']['panda_id'] + # content['substatus'] = panda_status + update_content['content_metadata'] = content['content_metadata'] + else: + pass else: - # num_unupdated_contents += 1 - pass + content['content_metadata']['panda_id'] = panda_id + update_content['content_metadata'] = content['content_metadata'] + + update_contents.append(update_content) + update_contents_dict[update_content['content_id']] = update_content if panda_status in [ContentStatus.Available, ContentStatus.Failed, ContentStatus.FinalFailed, ContentStatus.Lost, ContentStatus.Deleted, ContentStatus.Missing]: - contents_ext_full[content['content_id']] = {'content': content, 'job_info': job_info} - - self.logger.debug("get_update_contents, num_updated_contents: %s, num_unupdated_contents: %s" % (num_updated_contents, num_unupdated_contents)) - self.logger.debug("get_update_contents, update_contents[:3]: %s" % (str(update_contents[:3]))) - self.logger.debug("get_update_contents, contents_ext_full[:3]: %s" % (str({k: contents_ext_full[k] for k in list(contents_ext_full.keys())[:3]}))) - return update_contents, update_contents_full, contents_ext_full - - def get_contents_ext_detail(self, new_contents_ext, update_contents_ext, job_info_maps={}): - new_contents_ext_d, update_contents_ext_d = [], [] - - for content_id in new_contents_ext: - content = new_contents_ext[content_id]['content'] - job_info = new_contents_ext[content_id]['job_info'] - new_content_ext_d = {'content_id': content['content_id'], - 'request_id': content['request_id'], - 'transform_id': content['transform_id'], - 'workload_id': content['workload_id'], - 'coll_id': content['coll_id'], - 'map_id': content['map_id'], - 'status': content['status']} - for job_info_item in job_info_maps: - new_content_ext_d[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) - if new_content_ext_d[job_info_item] == 'NULL': - new_content_ext_d[job_info_item] = None - if new_content_ext_d[job_info_item] is None: - del new_content_ext_d[job_info_item] - - new_contents_ext_d.append(new_content_ext_d) - - for content_id in update_contents_ext: - content = update_contents_ext[content_id]['content'] - job_info = update_contents_ext[content_id]['job_info'] - update_content_ext_d = {'content_id': content['content_id'], - 'status': content['status']} - for job_info_item in job_info_maps: - update_content_ext_d[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) - if update_content_ext_d[job_info_item] == 'NULL': - update_content_ext_d[job_info_item] = None - if update_content_ext_d[job_info_item] is None: - del update_content_ext_d[job_info_item] - - update_contents_ext_d.append(update_content_ext_d) 
- - return new_contents_ext_d, update_contents_ext_d - - def get_contents_ext(self, input_output_maps, contents_ext, contents_ext_full, job_info_maps={}): - self.logger.debug("get_contents_ext, len(contents_ext): %s" % (str(len(contents_ext)))) - self.logger.debug("get_contents_ext, contents_ext[:3]: %s" % (str(contents_ext[:3]))) - - contents_ext_dict = {content['content_id']: content for content in contents_ext} - - left_contents = [] - to_check_panda_ids = {} - new_contents_ext, update_contents_ext = {}, {} - new_need_poll_contents_ext, update_need_poll_contents_ext = {}, {} - - for map_id in input_output_maps: - # inputs = input_output_maps[map_id]['inputs'] - outputs = input_output_maps[map_id]['outputs'] - - for content in outputs: - if content['substatus'] in [ContentStatus.Available, ContentStatus.Failed, ContentStatus.FinalFailed, - ContentStatus.Lost, ContentStatus.Deleted, ContentStatus.Missing]: if content['content_id'] not in contents_ext_dict: - if content['content_id'] in contents_ext_full: - new_contents_ext[content['content_id']] = contents_ext_full[content['content_id']] - else: - new_need_poll_contents_ext[content['content_id']] = content - if content['content_metadata'] and 'panda_id' in content['content_metadata']: - to_check_panda_ids[content['content_metadata']['panda_id']] = content['content_id'] + new_content_ext = {'content_id': content['content_id'], + 'request_id': content['request_id'], + 'transform_id': content['transform_id'], + 'workload_id': content['workload_id'], + 'coll_id': content['coll_id'], + 'map_id': content['map_id'], + 'status': panda_status} + for job_info_item in job_info_maps: + new_content_ext[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) + if new_content_ext[job_info_item] == 'NULL': + new_content_ext[job_info_item] = None + if new_content_ext[job_info_item] is None: + del new_content_ext[job_info_item] + new_contents_ext.append(new_content_ext) + new_contents_ext_dict[new_content_ext['content_id']] = new_content_ext else: - content_ext = contents_ext_dict[content['content_id']] - panda_id = None - if content['content_metadata'] and 'panda_id' in content['content_metadata']: - panda_id = content['content_metadata']['panda_id'] - if content['substatus'] != content_ext['status'] or panda_id != content_ext['panda_id']: - if content['content_id'] in contents_ext_full: - update_contents_ext[content['content_id']] = contents_ext_full[content['content_id']] - else: - update_need_poll_contents_ext[content['content_id']] = content - if panda_id: - to_check_panda_ids[panda_id] = content['content_id'] - else: - left_contents.append(content) - - if to_check_panda_ids: - to_check_panda_ids_list = list(to_check_panda_ids.keys()) - ret_job_infos = self.get_panda_job_status(to_check_panda_ids_list) - for job_info in ret_job_infos: - content_id = to_check_panda_ids[job_info.PandaID] - del to_check_panda_ids[job_info.PandaID] - if content_id in new_need_poll_contents_ext: - new_contents_ext[content_id] = {'content': new_need_poll_contents_ext[content_id], 'job_info': job_info} - del new_need_poll_contents_ext[content_id] - else: - update_contents_ext[content_id] = {'content': update_need_poll_contents_ext[content_id], 'job_info': job_info} - del update_need_poll_contents_ext[content_id] - for content_id in new_need_poll_contents_ext: - left_contents.append(new_need_poll_contents_ext[content_id]) - for content_id in update_need_poll_contents_ext: - left_contents.append(update_need_poll_contents_ext[content_id]) - - new_contents_ext_d, 
update_contents_ext_d = self.get_contents_ext_detail(new_contents_ext, update_contents_ext, job_info_maps) - - self.logger.debug("get_contents_ext, new_contents_ext_d[:1]: %s" % (str(new_contents_ext_d[:1]))) - self.logger.debug("get_contents_ext, update_contents_ext_d[:1]: %s" % (str(update_contents_ext_d[:1]))) - self.logger.debug("get_contents_ext, left_contents[:1]: %s" % (str(left_contents[:3]))) - return new_contents_ext_d, update_contents_ext_d, left_contents + update_content_ext = {'content_id': content['content_id'], + 'status': panda_status} + for job_info_item in job_info_maps: + update_content_ext[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) + if update_content_ext[job_info_item] == 'NULL': + update_content_ext[job_info_item] = None + if update_content_ext[job_info_item] is None: + del update_content_ext[job_info_item] + update_contents_ext.append(update_content_ext) + + if abort: + for map_id in input_output_maps: + outputs = input_output_maps[map_id]['outputs'] + for content in outputs: + if content['substatus'] not in [ContentStatus.Available, ContentStatus.Failed, ContentStatus.FinalFailed, + ContentStatus.Lost, ContentStatus.Deleted, ContentStatus.Missing]: + if content['content_id'] not in update_contents_dict: + update_content = {'content_id': content['content_id'], + 'substatus': ContentStatus.Missing} + update_contents.append(update_content) + if content['content_id'] not in contents_ext_dict and content['content_id'] not in new_contents_ext_dict: + new_content_ext = {'content_id': content['content_id'], + 'request_id': content['request_id'], + 'transform_id': content['transform_id'], + 'workload_id': content['workload_id'], + 'coll_id': content['coll_id'], + 'map_id': content['map_id'], + 'status': ContentStatus.Missing} + new_contents_ext.append(new_content_ext) + + self.logger.debug("get_update_contents, num_update_contents: %s" % (len(update_contents))) + self.logger.debug("get_update_contents, update_contents[:3]: %s" % (str(update_contents[:3]))) + self.logger.debug("get_update_contents, new_contents_ext[:1]: %s" % (str(new_contents_ext[:1]))) + self.logger.debug("get_update_contents, update_contents_ext[:1]: %s" % (str(update_contents_ext[:1]))) - def abort_contents(self, input_output_maps, updated_contents, contents_ext, to_update_new_contents_ext): - contents_ext_dict = {content['content_id']: content for content in contents_ext} - new_contents_ext = [] - updated_contents_ids = [c['content_id'] for c in updated_contents] - new_contents_ext_ids = [c['content_id'] for c in to_update_new_contents_ext] - - for map_id in input_output_maps: - outputs = input_output_maps[map_id]['outputs'] - for content in outputs: - if content['content_id'] not in updated_contents_ids: - update_content = {'content_id': content['content_id'], - 'substatus': ContentStatus.Missing} - updated_contents.append(update_content) - if content['content_id'] not in contents_ext_dict and content['content_id'] not in new_contents_ext_ids: - new_content_ext = {'content_id': content['content_id'], - 'request_id': content['request_id'], - 'transform_id': content['transform_id'], - 'workload_id': content['workload_id'], - 'coll_id': content['coll_id'], - 'map_id': content['map_id'], - 'status': ContentStatus.Missing} - new_contents_ext.append(new_content_ext) - - return updated_contents, new_contents_ext + return update_contents, update_contents_full, new_contents_ext, update_contents_ext def poll_panda_task(self, processing=None, input_output_maps=None, contents_ext=None, 
job_info_maps={}, log_prefix=''): task_id = None @@ -1178,26 +990,15 @@ def poll_panda_task(self, processing=None, input_output_maps=None, contents_ext= all_jobs_ids = task_info['PandaID'] - terminated_jobs, inputname_mapid_map = self.get_job_maps(input_output_maps) - self.logger.debug(log_prefix + "poll_panda_task, task_id: %s, all jobs: %s, terminated_jobs: %s" % (str(task_id), len(all_jobs_ids), len(terminated_jobs))) - - all_jobs_ids = set(all_jobs_ids) - terminated_jobs = set(terminated_jobs) - unterminated_jobs = all_jobs_ids - terminated_jobs - - inputname_jobid_map = self.poll_panda_jobs(unterminated_jobs) - intersection_keys = set(inputname_mapid_map.keys()) & set(inputname_jobid_map.keys()) + unterminated_jobs = self.get_unterminated_jobs(all_jobs_ids, input_output_maps, contents_ext) + self.logger.debug(log_prefix + "poll_panda_task, task_id: %s, all jobs: %s, unterminated_jobs: %s" % (str(task_id), len(all_jobs_ids), len(unterminated_jobs))) - updated_contents, update_contents_full, contents_ext_full = self.get_update_contents(list(intersection_keys), - inputname_mapid_map, - inputname_jobid_map) - - new_contents_ext, update_contents_ext, left_contents = self.get_contents_ext(input_output_maps, contents_ext, - contents_ext_full, job_info_maps) - # if left_jobs: + unterminated_jobs_status = self.poll_panda_jobs(unterminated_jobs, log_prefix=log_prefix) + abort_status = False if processing_status in [ProcessingStatus.Cancelled]: - updated_contents, new_contents_ext1 = self.abort_contents(input_output_maps, updated_contents, contents_ext, new_contents_ext) - new_contents_ext = new_contents_ext + new_contents_ext1 + abort_status = True + ret_contents = self.get_update_contents(unterminated_jobs_status, input_output_maps, contents_ext, job_info_maps, abort=abort_status, log_prefix=log_prefix) + updated_contents, update_contents_full, new_contents_ext, update_contents_ext = ret_contents return processing_status, updated_contents, update_contents_full, new_contents_ext, update_contents_ext else: @@ -1282,6 +1083,19 @@ def resume_processing(self, processing, log_prefix=''): def require_ext_contents(self): return True + def has_external_content_id(self): + return True + + def get_external_content_ids(self, processing, log_prefix=''): + if processing: + from pandaclient import Client + proc = processing['processing_metadata']['processing'] + task_id = proc.workload_id + status, output = Client.get_files_in_datasets(task_id, verbose=False) + if status == 0: + return output + return [] + def poll_processing_updates(self, processing, input_output_maps, contents_ext=None, job_info_maps={}, log_prefix=''): """ *** Function called by Carrier agent. 
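Note: in the retry handling above, a content keeps the largest PanDA id as its current panda_id and records superseded ids in old_panda_id. A minimal standalone sketch of that bookkeeping (the helper name record_panda_id is illustrative only, not part of the patch):

    def record_panda_id(content_metadata, panda_id):
        # Keep the largest panda_id as the current one; remember the others in old_panda_id.
        current = content_metadata.get('panda_id')
        if current is None:
            content_metadata['panda_id'] = panda_id
        elif current != panda_id:
            content_metadata.setdefault('old_panda_id', [])
            older = min(current, panda_id)
            if older not in content_metadata['old_panda_id']:
                content_metadata['old_panda_id'].append(older)
            content_metadata['panda_id'] = max(current, panda_id)
        return content_metadata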
From 786e374e18be8bd1ecdf7c1651edafbb54b4ab03 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 4 Jul 2023 15:39:06 +0200 Subject: [PATCH 06/19] improve doma event map --- doma/lib/idds/doma/workflowv2/domaeventmap.py | 56 ++++++++++++++++--- 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/doma/lib/idds/doma/workflowv2/domaeventmap.py b/doma/lib/idds/doma/workflowv2/domaeventmap.py index e5a7d1ce..e5635a09 100644 --- a/doma/lib/idds/doma/workflowv2/domaeventmap.py +++ b/doma/lib/idds/doma/workflowv2/domaeventmap.py @@ -13,6 +13,7 @@ """ +import datetime import os import pickle @@ -31,10 +32,27 @@ def construct_event_dependencies(self, job_event_map): self.event_deps[event_index] = [] job = self.events[event_index] deps = job.deps + dep_names = [] for dep in deps: # dep is gwjob - event_dep = job_event_map[dep.name] - self.event_deps[event_index].append(event_dep) + if dep.name not in dep_names: + event_dep = job_event_map[dep.name] + self.event_deps[event_index].append(event_dep) + else: + raise Exception("duplicated dependencies %s in job %s of task %s" % (dep.name, self.name, self.task_name)) + + def get_dependency_map(self): + return self.event_deps + + def dict(self): + ret = {} + ret['events'] = {} + for event_index in self.events: + job = self.events[event_index] + ret['events'][event_index] = {'name': job.name, 'deps': []} + ret['events'][event_index]['deps'] = self.event_deps[event_index] + + return ret def set_event_status(self, event_index, status, reported): self.event_status[str(event_index)] = {'status': status, 'reported': reported} @@ -96,14 +114,30 @@ def __init__(self, name): def add_job(self, job): self.jobs[job.name] = job + def dict(self): + ret = {} + for job_name in self.jobs: + ret[job_name] = self.jobs[job_name].dict() + return ret + def get_job(self, job_name): return self.jobs.get(job_name, None) + def get_dependency_map(self): + dep_map = {} + for job_name in self.jobs: + job = self.jobs[job_name] + dep_map[job_name] = job.get_dependency_map() + return dep_map + class DomaEventMap(object): - def __init__(self, name='doma_event_map.pickle', base_dir='./'): + def __init__(self, name=None, file_name='doma_event_map.pickle', base_dir='./'): + if not file_name: + file_name = 'doma_event_map.pickle' + self.file_name = file_name if not name: - name = 'doma_event_map.pickle' + name = "idds_event_" + datetime.datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S_%f") self.name = name self.base_dir = base_dir self.tasks = {} @@ -114,14 +148,20 @@ def add_task(self, task): def get_task(self, task_name): return self.tasks.get(task_name, None) + def dict(self): + ret = {} + for task_name in self.tasks: + ret[task_name] = self.tasks[task_name].dict() + return ret + def get_path(self): - if os.path.isabs(self.name): - path = self.name + if os.path.isabs(self.file_name): + path = self.file_name else: if self.base_dir: - path = os.path.join(self.base_dir, self.name) + path = os.path.join(self.base_dir, self.file_name) else: - path = self.name + path = self.file_name return path def save(self): From f692c2818af1eb281f29ebd50be3988caa28e846 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 4 Jul 2023 15:39:30 +0200 Subject: [PATCH 07/19] add new doma panda eventservice work --- .../idds/doma/workflowv2/domapandaeswork.py | 738 ++++++++++++++++++ 1 file changed, 738 insertions(+) create mode 100644 doma/lib/idds/doma/workflowv2/domapandaeswork.py diff --git a/doma/lib/idds/doma/workflowv2/domapandaeswork.py b/doma/lib/idds/doma/workflowv2/domapandaeswork.py new file mode 
100644 index 00000000..30777ed9 --- /dev/null +++ b/doma/lib/idds/doma/workflowv2/domapandaeswork.py @@ -0,0 +1,738 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2023 + + +import traceback + +from idds.common import exceptions +from idds.common.constants import (ContentStatus, ContentType, ProcessingStatus) +from idds.workflowv2.work import Processing + +from .domapandawork import DomaPanDAWork + + +class DomaPanDAESWork(DomaPanDAWork): + def __init__(self, executable=None, arguments=None, parameters=None, setup=None, + work_tag='lsst', exec_type='panda', sandbox=None, work_id=None, + primary_input_collection=None, other_input_collections=None, + input_collections=None, + primary_output_collection=None, other_output_collections=None, + output_collections=None, log_collections=None, + logger=None, dependency_map=None, task_name="", + task_queue=None, queue=None, processing_type=None, + prodSourceLabel='test', task_type='lsst', + maxwalltime=90000, maxattempt=5, core_count=1, + encode_command_line=False, + num_retries=5, + task_priority=900, + task_log=None, + task_cloud=None, + task_site=None, + task_rss=1000, + vo='wlcg', + working_group='lsst', + es_dependency_map=None): + + super(DomaPanDAESWork, self).__init__(executable=executable, arguments=arguments, + parameters=parameters, setup=setup, + work_tag=work_tag, exec_type=exec_type, sandbox=sandbox, work_id=work_id, + primary_input_collection=primary_input_collection, + other_input_collections=other_input_collections, + primary_output_collection=primary_output_collection, + other_output_collections=other_output_collections, + input_collections=input_collections, + output_collections=output_collections, + log_collections=log_collections, + logger=logger, + dependency_map=dependency_map, task_name=task_name, + task_queue=task_queue, queue=queue, processing_type=processing_type, + prodSourceLabel=prodSourceLabel, task_type=task_type, + maxwalltime=maxwalltime, maxattempt=maxattempt, core_count=core_count, + encode_command_line=encode_command_line, + num_retries=num_retries, + task_priority=task_priority, task_log=task_log, + task_cloud=task_cloud, task_site=task_site, task_rss=task_rss, + vo=vo, working_group=working_group) + self.es_dependency_map = es_dependency_map + + def with_sub_map_id(self): + return False + + @property + def es_dependency_map(self): + return self._es_dependency_map + + @es_dependency_map.setter + def es_dependency_map(self, value): + if value: + if type(value) not in [dict]: + raise exceptions.IDDSException("ES dependency_map should be a dict") + # the dumplication is already verified in DomaEventMap, not do it again here + + self._es_dependency_map = value + + def depend_on(self, work): + self.logger.debug("checking depending on") + if self.dependency_tasks is None: + self.logger.debug("constructing dependency_tasks set") + dependency_tasks = set([]) + for job_name in self.es_dependency_map: + es_dep_map = self.es_dependency_map[job_name] + for event_index in es_dep_map: + input_dependency = es_dep_map[event_index] + + for input_d in input_dependency: + task_name = input_d['group_label'] + if task_name not in dependency_tasks: + dependency_tasks.add(task_name) + self.dependency_tasks = list(dependency_tasks) + + if work.task_name in self.dependency_tasks: + 
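+ # the other work produces output events that this work's events depend on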
self.logger.debug("finished checking depending on") + return True + else: + self.logger.debug("finished checking depending on") + return False + + def get_ancestry_works(self): + tasks = set([]) + for job_name in self.es_dependency_map: + es_dep_map = self.es_dependency_map[job_name] + for event_index in es_dep_map: + input_dependency = es_dep_map[event_index] + + for input_d in input_dependency: + task_name = input_d['group_label'] + if task_name not in tasks: + tasks.add(task_name) + return list(tasks) + + def get_mapped_inputs(self, mapped_input_output_maps): + ret = [] + for map_id in mapped_input_output_maps: + inputs = mapped_input_output_maps[map_id]['inputs'] + + # if 'primary' is not set, the first one is the primary input. + primary_input = inputs[0] + for ip in inputs: + if 'primary' in ip['content_metadata'] and ip['content_metadata']['primary']: + primary_input = ip + ret.append(primary_input) + return ret + + def get_mapped_outputs(self, mapped_input_output_maps): + ret = [] + for map_id in mapped_input_output_maps: + outputs = mapped_input_output_maps[map_id]['outputs'] + + # if 'primary' is not set, the first one is the primary input. + primary_output = outputs[0] + for ip in outputs: + if 'primary' in ip['content_metadata'] and ip['content_metadata']['primary']: + primary_output = ip + ret.append(primary_output) + return ret + + def map_file_to_content(self, coll_id, scope, name, event_index, dep_event_index=None): + content = {'coll_id': coll_id, + 'scope': scope, + 'name': name, # or a different file name from the dataset name + 'sub_map_id': int(event_index), + 'bytes': 1, + 'adler32': '12345678', + 'min_id': int(event_index), + 'max_id': int(event_index) + 1, + 'content_type': ContentType.Event, + # 'content_relation_type': content_relation_type, + # here events is all events for eventservice, not used here. 
+ 'content_metadata': {'events': 1}} + if dep_event_index is not None: + content['content_metadata'] = {'events': 1, 'dep_sub_map_id': int(dep_event_index)} + content['dep_sub_map_id'] = int(dep_event_index) + return content + + def is_all_dependency_tasks_available(self, inputs_dependency, task_name_to_coll_map): + for input_d in inputs_dependency: + task_name = input_d['group_label'] + if (task_name not in task_name_to_coll_map # noqa: W503 + or 'outputs' not in task_name_to_coll_map[task_name] # noqa: W503 + or not task_name_to_coll_map[task_name]['outputs']): # noqa: W503 + return False + return True + + def get_unmapped_jobs(self, mapped_input_output_maps={}): + mapped_outputs = self.get_mapped_outputs(mapped_input_output_maps) + mapped_outputs_name = [ip['name'] for ip in mapped_outputs] + unmapped_jobs = [] + for job_name in self.es_dependency_map: + if job_name not in mapped_outputs_name: + unmapped_jobs.append(job_name) + return unmapped_jobs + + def has_external_dependency(self): + if self.es_dependency_map: + for job_name in self.es_dependency_map: + es_dep_map = self.es_dependency_map[job_name] + for event_index in es_dep_map: + inputs_dependency = es_dep_map[event_index] + if inputs_dependency: + for input_dep in inputs_dependency: + dep_task_name = input_dep['group_label'] + dep_job_name = input_dep['event_job'] + if dep_task_name != self.orig_task_name or dep_job_name != job_name: + return True + return False + + def get_parent_work_names(self): + parent_work_names = [] + for job_name in self.es_dependency_map: + es_dep_map = self.es_dependency_map[job_name] + for event_index in es_dep_map: + input_dependency = es_dep_map[event_index] + + for input_d in input_dependency: + task_name = input_d['group_label'] + if task_name not in parent_work_names: + parent_work_names.append(task_name) + return parent_work_names + + def get_parent_workload_ids(self): + parent_workload_ids = [] + parent_work_names = self.get_parent_work_names() + work_name_to_coll_map = self.get_work_name_to_coll_map() + for work_name in parent_work_names: + if work_name in work_name_to_coll_map: + input_d_coll = work_name_to_coll_map[work_name]['outputs'][0] + if input_d_coll and 'workload_id' in input_d_coll: + parent_workload_ids.append(input_d_coll['workload_id']) + return parent_workload_ids + + def get_new_input_output_maps(self, mapped_input_output_maps={}): + """ + *** Function called by Transformer agent. + New inputs which are not yet mapped to outputs. + + :param mapped_input_output_maps: Inputs that are already mapped. 
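+ :returns: a dict {map_id: {'inputs': [], 'outputs': [], 'inputs_dependency': [], 'logs': []}} for the jobs whose dependency tasks all have registered output collections.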
+ """ + new_input_output_maps = {} + + unmapped_jobs = self.get_unmapped_jobs(mapped_input_output_maps) + if not unmapped_jobs: + self.set_has_new_inputs(False) + return new_input_output_maps + + if unmapped_jobs: + mapped_keys = mapped_input_output_maps.keys() + if mapped_keys: + next_key = max(mapped_keys) + 1 + else: + next_key = 1 + + input_coll = self.get_input_collections()[0] + input_coll_id = input_coll.coll_id + output_coll = self.get_output_collections()[0] + output_coll_id = output_coll.coll_id + + task_name_to_coll_map = self.get_work_name_to_coll_map() + + for job_name in unmapped_jobs: + es_dep_map = self.es_dependency_map[job_name] + job_input_dependency = [] + for event_index in es_dep_map: + input_dependency = es_dep_map[event_index] + job_input_dependency += input_dependency + + if self.is_all_dependency_tasks_available(job_input_dependency, task_name_to_coll_map): + new_input_output_maps[next_key] = {'inputs_dependency': [], + 'logs': [], + 'inputs': [], + 'outputs': []} + for event_index in sorted(list(es_dep_map.keys())): + input_content = self.map_file_to_content(input_coll_id, input_coll.scope, job_name, event_index) + output_content = self.map_file_to_content(output_coll_id, output_coll.scope, job_name, event_index) + + new_input_output_maps[next_key]['inputs'].append(input_content) + new_input_output_maps[next_key]['outputs'].append(output_content) + + uni_input_name = {} + input_dependency = es_dep_map[event_index] + for input_d in input_dependency: + task_name = input_d['group_label'] + job_name = input_d['event_job'] + dep_event_index = input_d['event_index'] + task_name_input_name = task_name + job_name + str(dep_event_index) + if task_name_input_name not in uni_input_name: + uni_input_name[task_name_input_name] = None + input_d_coll = task_name_to_coll_map[task_name]['outputs'][0] + input_d_content = self.map_file_to_content(input_d_coll['coll_id'], input_d_coll['scope'], job_name, event_index, dep_event_index) + new_input_output_maps[next_key]['inputs_dependency'].append(input_d_content) + else: + self.logger.debug("get_new_input_output_maps, duplicated input dependency for job %s event_index %s: %s" % (job_name, event_index, str(input_dependency))) + + # all inputs are parsed. move it to dependency_map_deleted + # self.dependency_map_deleted.append(job) + next_key += 1 + else: + # not all inputs for this job can be parsed. + # self.dependency_map.append(job) + pass + + # self.logger.debug("get_new_input_output_maps, new_input_output_maps: %s" % str(new_input_output_maps)) + self.logger.debug("get_new_input_output_maps, new_input_output_maps len: %s" % len(new_input_output_maps)) + return new_input_output_maps + + def create_processing(self, input_output_maps=[]): + """ + *** Function called by Transformer agent. + + :param input_output_maps: new maps from inputs to outputs. 
+ """ + # avoid duplicated task name + self.task_name = self.task_name + "_" + str(self.get_request_id()) + "_" + str(self.get_work_id()) + + in_files = [] + # has_dependencies = False + if self.es_dependency_map is None: + self.es_dependency_map = {} + for job_name in self.es_dependency_map: + es_dep_map = self.es_dependency_map[job_name] + event_indexes = list(es_dep_map.keys()) + # min_event_index = min(event_indexes) + # max_event_index = max(event_indexes) + # event_file_name = job_name + "%s^%s" % (min_event_index, max_event_index) + event_file_name = job_name + "^%s" % (len(event_indexes)) + + in_files.append(event_file_name) + + task_param_map = {} + task_param_map['vo'] = self.vo + if self.task_queue and len(self.task_queue) > 0: + task_param_map['site'] = self.task_queue + elif self.queue and len(self.queue) > 0: + task_param_map['site'] = self.queue + task_param_map['workingGroup'] = self.working_group + task_param_map['nFilesPerJob'] = 1 + if in_files: + if self.has_external_dependency(): + task_param_map['inputPreStaging'] = True + task_param_map['nFiles'] = len(in_files) + task_param_map['noInput'] = True + task_param_map['pfnList'] = in_files + else: + # task_param_map['inputPreStaging'] = True + in_files = ['pseudo_file'] + task_param_map['nFiles'] = len(in_files) + task_param_map['noInput'] = True + task_param_map['pfnList'] = in_files + + # enabling eventservice + task_param_map['fineGrainedProc'] = True + # task_param_map['eventService'] = 3 + + task_param_map['taskName'] = self.task_name + task_param_map['userName'] = self.username if self.username else 'iDDS' + task_param_map['taskPriority'] = self.task_priority + task_param_map['architecture'] = '' + task_param_map['transUses'] = '' + task_param_map['transHome'] = None + + executable = self.executable + executable = "export IDDS_BUILD_REQUEST_ID=" + str(self.get_request_id()) + ";" + executable += "export IDDS_BUIL_SIGNATURE=" + str(self.signature) + "; " + self.executable + + if self.encode_command_line: + # task_param_map['transPath'] = 'https://atlpan.web.cern.ch/atlpan/bash-c-enc' + task_param_map['transPath'] = 'https://storage.googleapis.com/drp-us-central1-containers/bash-c-enc' + task_param_map['encJobParams'] = True + else: + # task_param_map['transPath'] = 'https://atlpan.web.cern.ch/atlpan/bash-c' + task_param_map['transPath'] = 'https://storage.googleapis.com/drp-us-central1-containers/bash-c' + task_param_map['processingType'] = self.processingType + task_param_map['prodSourceLabel'] = self.prodSourceLabel + task_param_map['noWaitParent'] = True + task_param_map['taskType'] = self.task_type + task_param_map['coreCount'] = self.core_count + task_param_map['skipScout'] = True + task_param_map['cloud'] = self.task_cloud + task_param_map['PandaSite'] = self.task_site + if self.task_rss and self.task_rss > 0: + task_param_map['ramCount'] = self.task_rss / self.core_count if self.core_count else self.task_rss + # task_param_map['ramUnit'] = 'MB' + task_param_map['ramUnit'] = 'MBPerCoreFixed' + + # task_param_map['inputPreStaging'] = True + task_param_map['prestagingRuleID'] = 123 + task_param_map['nChunksToWait'] = 1 + task_param_map['maxCpuCount'] = self.core_count + task_param_map['maxWalltime'] = self.maxWalltime + task_param_map['maxFailure'] = self.maxAttempt if self.maxAttempt else 5 + task_param_map['maxAttempt'] = self.maxAttempt if self.maxAttempt else 5 + if task_param_map['maxAttempt'] < self.num_retries: + task_param_map['maxAttempt'] = self.num_retries + if task_param_map['maxFailure'] < 
self.num_retries: + task_param_map['maxFailure'] = self.num_retries + task_param_map['log'] = self.task_log + task_param_map['jobParameters'] = [ + {'type': 'constant', + 'value': executable, # noqa: E501 + }, + ] + + task_param_map['reqID'] = self.get_request_id() + + processing_metadata = {'task_param': task_param_map} + proc = Processing(processing_metadata=processing_metadata) + proc.workload_id = None + self.add_processing_to_processings(proc) + self.active_processings.append(proc.internal_id) + return proc + + def get_unterminated_jobs(self, all_jobs_ids, input_output_maps, contents_ext): + finished_jobs, failed_jobs = [], [] + + contents_ext_dict = {content['content_id']: content for content in contents_ext} + + for map_id in input_output_maps: + outputs = input_output_maps[map_id]['outputs'] + for content in outputs: + if content['substatus'] in [ContentStatus.Available]: + if 'panda_id' in content['content_metadata']: + panda_id = content['content_metadata']['panda_id'] + if content['content_id'] not in contents_ext_dict: + continue + + content_ext = contents_ext_dict[content['content_id']] + if content['substatus'] != content_ext['status'] or panda_id != content_ext['panda_id']: + continue + + if panda_id not in finished_jobs: + finished_jobs.append(panda_id) + elif content['substatus'] in [ContentStatus.FinalFailed, + ContentStatus.Lost, ContentStatus.Deleted, + ContentStatus.Missing]: + if 'panda_id' in content['content_metadata']: + panda_id = content['content_metadata']['panda_id'] + if content['content_id'] not in contents_ext_dict: + continue + + content_ext = contents_ext_dict[content['content_id']] + if content['substatus'] != content_ext['status'] or panda_id != content_ext['panda_id']: + continue + + if panda_id not in failed_jobs: + failed_jobs.append(panda_id) + + all_jobs_ids = set(all_jobs_ids) + terminated_jobs = set(finished_jobs + failed_jobs) + unterminated_jobs = all_jobs_ids - terminated_jobs + return list(unterminated_jobs) + + def get_panda_job_status(self, jobids, log_prefix=''): + self.logger.debug(log_prefix + "get_panda_job_status, jobids[:10]: %s" % str(jobids[:10])) + try: + from pandaclient import Client + ret = Client.getJobStatus(jobids, verbose=0) + if ret[0] == 0: + left_jobids = [] + ret_jobs = [] + jobs_list = ret[1] + for jobid, jobinfo in zip(jobids, jobs_list): + if jobinfo is None: + left_jobids.append(jobid) + else: + ret_jobs.append(jobinfo) + if left_jobids: + try: + ret1 = Client.getFullJobStatus(ids=left_jobids, verbose=False) + if ret1[0] == 0: + left_jobs_list = ret1[1] + ret_jobs = ret_jobs + left_jobs_list + except Exception as ex: + self.logger.error(str(ex)) + self.logger.error(traceback.format_exc()) + return ret_jobs + else: + self.logger.warn(log_prefix + "get_panda_job_status failed: %s" % str(ret)) + return [] + except Exception as ex: + self.logger.error(str(ex)) + self.logger.error(traceback.format_exc()) + return [] + + def get_panda_event_status(self, eventids, log_prefix=''): + self.logger.debug(log_prefix + "get_panda_event_status, eventids[:10]: %s" % str(eventids[:10])) + try: + from pandaclient import Client + ret_status, events_status = Client.get_events_status(eventids) + self.logger.info(log_prefix + "get_panda_events_status: status: %s" % ret_status) + # self.logger.debug(log_prefix + "poll_panda_jobs, get_events_status: event status: %s" % event_status) + if ret_status != 0: + self.logger.error(log_prefix + "get_panda_events_status: event status: %s" % events_status) + else: + # self.logger.error(log_prefix + 
"get_panda_events_status: event status: %s" % events_status) + return events_status + except Exception as ex: + self.logger.error(str(ex)) + self.logger.error(traceback.format_exc()) + return None + + def get_content_status_from_panda_status(self, job_info): + if job_info is None: + return ContentStatus.Processing + + jobstatus = job_info.jobStatus + if jobstatus in ['finished', 'merging']: + return ContentStatus.Available + elif jobstatus in ['failed', 'closed', 'cancelled', 'lost', 'broken', 'missing']: + attempt_nr = int(job_info.attemptNr) if job_info.attemptNr else 0 + max_attempt = int(job_info.maxAttempt) if job_info.maxAttempt else 0 + self_maxAttempt = int(self.maxAttempt) if self.maxAttempt else 0 + if (attempt_nr >= max_attempt) and (attempt_nr >= self_maxAttempt): + return ContentStatus.FinalFailed + else: + return ContentStatus.Failed + elif jobstatus in ['activated']: + return ContentStatus.Activated + else: + return ContentStatus.Processing + + def poll_panda_jobs(self, task_id, job_ids, log_prefix=''): + terminated_jobs = {} + job_status_info = {} + self.logger.debug(log_prefix + "poll_panda_jobs, poll_panda_jobs_chunk_size: %s, job_ids[:10]: %s" % (self.poll_panda_jobs_chunk_size, str(job_ids[:10]))) + chunksize = self.poll_panda_jobs_chunk_size + chunks = [job_ids[i:i + chunksize] for i in range(0, len(job_ids), chunksize)] + for chunk in chunks: + # jobs_list = Client.getJobStatus(chunk, verbose=0)[1] + jobs_list = self.get_panda_job_status(chunk, log_prefix=log_prefix) + if jobs_list: + self.logger.debug(log_prefix + "poll_panda_jobs, input jobs: %s, output_jobs: %s" % (len(chunk), len(jobs_list))) + for job_info in jobs_list: + job_status = self.get_content_status_from_panda_status(job_info) + if job_info and job_info.Files and len(job_info.Files) > 0: + for job_file in job_info.Files: + # if job_file.type in ['log']: + if job_file.type not in ['pseudo_input']: + continue + if ':' in job_file.lfn: + pos = job_file.lfn.find(":") + input_file = job_file.lfn[pos + 1:] + # input_file = job_file.lfn.split(':')[1] + else: + input_file = job_file.lfn + job_status_info[input_file] = {'panda_id': job_info.PandaID, 'status': job_status, 'job_info': job_info} + + if job_status in [ContentStatus.Available, ContentStatus.Failed, ContentStatus.FinalFailed, + ContentStatus.Lost, ContentStatus.Deleted, ContentStatus.Missing]: + if job_info.PandaID not in terminated_jobs: + terminated_jobs[job_info.PandaID] = [] + terminated_jobs[job_info.PandaID].append(input_file) + else: + self.logger.warn(log_prefix + "poll_panda_jobs, input jobs: %s, output_jobs: %s" % (len(chunk), jobs_list)) + + # poll event status + terminated_job_ids = list(terminated_jobs.keys()) + chunks = [terminated_job_ids[i:i + chunksize] for i in range(0, len(terminated_job_ids), chunksize)] + for chunk in chunks: + chunk_ids = [{'task_id': task_id, 'panda_id': panda_id} for panda_id in chunk] + events_status = self.get_panda_event_status(chunk_ids) + if events_status is None: + pass + else: + for panda_id in events_status: + input_files = terminated_jobs[int(panda_id)] + for input_file in input_files: + job_status_info[input_file]['events_status'] = events_status[str(panda_id)] + return job_status_info + + def get_update_contents(self, unterminated_jobs_status, input_output_maps, contents_ext, job_info_maps, abort=False, log_prefix=''): + inputname_to_map_id_outputs = {} + for map_id in input_output_maps: + inputs = input_output_maps[map_id]['inputs'] + outputs = input_output_maps[map_id]['outputs'] + for content in 
inputs: + inputname_to_map_id_outputs[content['name']] = {'map_id': map_id, 'outputs': outputs} + + contents_ext_dict = {content['content_id']: content for content in contents_ext} + + update_contents, update_contents_full = [], [] + new_contents_ext, update_contents_ext = [], [] + for input_file in unterminated_jobs_status: + panda_job_status = unterminated_jobs_status[input_file] + panda_id = panda_job_status['panda_id'] + job_status = panda_job_status['status'] + job_info = panda_job_status['job_info'] + events_status = None + if 'events_status' in panda_job_status: + events_status = panda_job_status['events_status'] + + if events_status is None or job_status not in [ContentStatus.Failed, ContentStatus.FinalFailed, + ContentStatus.Lost, ContentStatus.Deleted, ContentStatus.Missing]: + continue + + output_contents = inputname_to_map_id_outputs[input_file]['outputs'] + output_contents_sub_map = {} + for content in output_contents: + if content['sub_map_id'] not in output_contents_sub_map: + output_contents_sub_map[content['sub_map_id']] = [] + output_contents_sub_map[content['sub_map_id']].append(content) + + for sub_map_id in output_contents_sub_map: + for content in output_contents_sub_map[sub_map_id]: + event_status = events_status.get(str(sub_map_id), None) + update_content = None + if not event_status: + if job_status in [ContentStatus.Failed, ContentStatus.FinalFailed, + ContentStatus.Lost, ContentStatus.Deleted, ContentStatus.Missing]: + event_status = job_status + if event_status: + # content['substatus'] = panda_status + content['substatus'] = event_status + update_contents_full.append(content) + update_content = {'content_id': content['content_id'], + # 'status': panda_status, + # 'substatus': panda_status, + 'substatus': event_status, + 'external_event_status': event_status} + + if 'panda_id' in content['content_metadata'] and content['content_metadata']['panda_id']: + if content['content_metadata']['panda_id'] < panda_id: + # new panda id is the bigger one. 
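+ # keep a history of superseded PanDA ids in old_panda_id so re-dispatched events can be traced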
+ if 'old_panda_id' not in content['content_metadata']: + content['content_metadata']['old_panda_id'] = [] + if content['content_metadata']['panda_id'] not in content['content_metadata']['old_panda_id']: + content['content_metadata']['old_panda_id'].append(content['content_metadata']['panda_id']) + content['content_metadata']['panda_id'] = panda_id + update_content['content_metadata'] = content['content_metadata'] + elif content['content_metadata']['panda_id'] > panda_id: + if 'old_panda_id' not in content['content_metadata']: + content['content_metadata']['old_panda_id'] = [] + if panda_id not in content['content_metadata']['old_panda_id']: + content['content_metadata']['old_panda_id'].append(panda_id) + # content['content_metadata']['panda_id'] = content['content_metadata']['panda_id'] + # content['substatus'] = panda_status + update_content['content_metadata'] = content['content_metadata'] + else: + pass + else: + content['content_metadata']['panda_id'] = panda_id + update_content['content_metadata'] = content['content_metadata'] + + if update_content: + update_contents.append(update_content) + + if job_status in [ContentStatus.Available, ContentStatus.Failed, ContentStatus.FinalFailed, + ContentStatus.Lost, ContentStatus.Deleted, ContentStatus.Missing]: + if content['content_id'] not in contents_ext_dict: + new_content_ext = {'content_id': content['content_id'], + 'request_id': content['request_id'], + 'transform_id': content['transform_id'], + 'workload_id': content['workload_id'], + 'coll_id': content['coll_id'], + 'map_id': content['map_id'], + 'status': event_status} + for job_info_item in job_info_maps: + new_content_ext[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) + if new_content_ext[job_info_item] == 'NULL': + new_content_ext[job_info_item] = None + if new_content_ext[job_info_item] is None: + del new_content_ext[job_info_item] + new_contents_ext.append(new_content_ext) + else: + update_content_ext = {'content_id': content['content_id'], + 'status': event_status} + for job_info_item in job_info_maps: + update_content_ext[job_info_item] = getattr(job_info, job_info_maps[job_info_item]) + if update_content_ext[job_info_item] == 'NULL': + update_content_ext[job_info_item] = None + if update_content_ext[job_info_item] is None: + del update_content_ext[job_info_item] + update_contents_ext.append(update_content_ext) + + self.logger.debug("get_update_contents, num_update_contents: %s" % (len(update_contents))) + self.logger.debug("get_update_contents, update_contents[:3]: %s" % (str(update_contents[:3]))) + self.logger.debug("get_update_contents, new_contents_ext[:1]: %s" % (str(new_contents_ext[:1]))) + self.logger.debug("get_update_contents, update_contents_ext[:1]: %s" % (str(update_contents_ext[:1]))) + + return update_contents, update_contents_full, new_contents_ext, update_contents_ext + + def poll_panda_task(self, processing=None, input_output_maps=None, contents_ext=None, job_info_maps={}, log_prefix=''): + task_id = None + try: + from pandaclient import Client + + if processing: + proc = processing['processing_metadata']['processing'] + task_id = proc.workload_id + if task_id is None: + task_id = self.get_panda_task_id(processing) + + if task_id: + # ret_ids = Client.getPandaIDsWithTaskID(task_id, verbose=False) + self.logger.debug(log_prefix + "poll_panda_task, task_id: %s" % str(task_id)) + task_info = Client.getJediTaskDetails({'jediTaskID': task_id}, True, True, verbose=True) + self.logger.debug(log_prefix + "poll_panda_task, task_info[0]: %s" % 
str(task_info[0])) + if task_info[0] != 0: + self.logger.warn(log_prefix + "poll_panda_task %s, error getting task status, task_info: %s" % (task_id, str(task_info))) + return ProcessingStatus.Running, [], [], [], [] + + task_info = task_info[1] + + processing_status = self.get_processing_status_from_panda_status(task_info["status"]) + self.logger.info(log_prefix + "poll_panda_task processing_status: %s" % processing_status) + + all_jobs_ids = task_info['PandaID'] + + unterminated_jobs = self.get_unterminated_jobs(all_jobs_ids, input_output_maps, contents_ext) + self.logger.debug(log_prefix + "poll_panda_task, task_id: %s, all jobs: %s, unterminated_jobs: %s" % (str(task_id), len(all_jobs_ids), len(unterminated_jobs))) + + unterminated_jobs_status = self.poll_panda_jobs(task_id, unterminated_jobs, log_prefix=log_prefix) + abort_status = False + if processing_status in [ProcessingStatus.Cancelled]: + abort_status = True + ret_contents = self.get_update_contents(unterminated_jobs_status, input_output_maps, contents_ext, job_info_maps, abort=abort_status, log_prefix=log_prefix) + updated_contents, update_contents_full, new_contents_ext, update_contents_ext = ret_contents + + return processing_status, updated_contents, update_contents_full, new_contents_ext, update_contents_ext + else: + self.logger.error("poll_panda_task, task_id (%s) cannot be found" % task_id) + return ProcessingStatus.Failed, [], [], [], [] + except Exception as ex: + msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex)) + self.logger.error(log_prefix + msg) + self.logger.error(log_prefix + str(ex)) + self.logger.error(traceback.format_exc()) + # raise exceptions.IDDSException(msg) + return ProcessingStatus.Running, [], [], [], [] + + def poll_processing_updates(self, processing, input_output_maps, contents_ext=None, job_info_maps={}, log_prefix=''): + """ + *** Function called by Carrier agent. 
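+ Returns the processing status together with the content updates, the full updated contents and the new/updated contents_ext records produced by poll_panda_task.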
+ """ + update_contents = [] + update_contents_full = [] + self.logger.debug(log_prefix + "poll_processing_updates, input_output_maps.keys[:3]: %s" % str(list(input_output_maps.keys())[:3])) + + if processing: + proc = processing['processing_metadata']['processing'] + + ret_poll_panda_task = self.poll_panda_task(processing=processing, + input_output_maps=input_output_maps, + contents_ext=contents_ext, + job_info_maps=job_info_maps, + log_prefix=log_prefix) + + processing_status, update_contents, update_contents_full, new_contents_ext, update_contents_ext = ret_poll_panda_task + self.logger.debug(log_prefix + "poll_processing_updates, processing_status: %s" % str(processing_status)) + self.logger.debug(log_prefix + "poll_processing_updates, update_contents[:3]: %s" % str(update_contents[:3])) + + if update_contents: + proc.has_new_updates() + return processing_status, update_contents, {}, update_contents_full, {}, new_contents_ext, update_contents_ext From d39300190a0da89ed1b52b3ae4d128fb6201989f Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 4 Jul 2023 15:41:18 +0200 Subject: [PATCH 08/19] new poll period for new tasks --- main/lib/idds/agents/carrier/poller.py | 20 ++++++++++++++------ main/lib/idds/agents/carrier/submitter.py | 2 +- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/main/lib/idds/agents/carrier/poller.py b/main/lib/idds/agents/carrier/poller.py index de55efe0..f78d6148 100644 --- a/main/lib/idds/agents/carrier/poller.py +++ b/main/lib/idds/agents/carrier/poller.py @@ -55,6 +55,11 @@ def __init__(self, num_threads=1, poll_period=10, retries=3, retrieve_bulk_size= else: self.update_poll_period = int(self.update_poll_period) + if not hasattr(self, 'update_poll_period_for_new_task') or not self.update_poll_period_for_new_task: + self.update_poll_period_for_new_task = 180 + else: + self.update_poll_period_for_new_task = int(self.update_poll_period_for_new_task) + if hasattr(self, 'poll_period_increase_rate'): self.poll_period_increase_rate = float(self.poll_period_increase_rate) else: @@ -154,7 +159,7 @@ def get_work_tag_attribute(self, work_tag, attribute): work_tag_attribute_value = int(getattr(self, work_tag_attribute)) return work_tag_attribute_value - def load_poll_period(self, processing, parameters): + def load_poll_period(self, processing, parameters, new=False): proc = processing['processing_metadata']['processing'] work = proc.work work_tag = work.get_work_tag() @@ -165,11 +170,14 @@ def load_poll_period(self, processing, parameters): elif self.new_poll_period and processing['new_poll_period'] != self.new_poll_period: parameters['new_poll_period'] = self.new_poll_period - work_tag_update_poll_period = self.get_work_tag_attribute(work_tag, "update_poll_period") - if work_tag_update_poll_period: - parameters['update_poll_period'] = work_tag_update_poll_period - elif self.update_poll_period and processing['update_poll_period'] != self.update_poll_period: - parameters['update_poll_period'] = self.update_poll_period + if new: + parameters['update_poll_period'] = self.update_poll_period_for_new_task + else: + work_tag_update_poll_period = self.get_work_tag_attribute(work_tag, "update_poll_period") + if work_tag_update_poll_period: + parameters['update_poll_period'] = work_tag_update_poll_period + elif self.update_poll_period and processing['update_poll_period'] != self.update_poll_period: + parameters['update_poll_period'] = self.update_poll_period return parameters def get_log_prefix(self, processing): diff --git 
a/main/lib/idds/agents/carrier/submitter.py b/main/lib/idds/agents/carrier/submitter.py index 3419830e..456943cf 100644 --- a/main/lib/idds/agents/carrier/submitter.py +++ b/main/lib/idds/agents/carrier/submitter.py @@ -92,7 +92,7 @@ def handle_new_processing(self, processing): 'substatus': ProcessingStatus.Submitting, 'locking': ProcessingLocking.Idle, 'processing_metadata': processing['processing_metadata']} - parameters = self.load_poll_period(processing, parameters) + parameters = self.load_poll_period(processing, parameters, new=True) proc = processing['processing_metadata']['processing'] if proc.submitted_at: From 6a514b05d47dd1e13d389e0ddeeb7f8ee7841c09 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 4 Jul 2023 15:42:33 +0200 Subject: [PATCH 09/19] add sub_map_id to manage events --- main/lib/idds/agents/carrier/utils.py | 450 +++++++++++++++----------- 1 file changed, 269 insertions(+), 181 deletions(-) diff --git a/main/lib/idds/agents/carrier/utils.py b/main/lib/idds/agents/carrier/utils.py index 42bf35c6..522261ff 100644 --- a/main/lib/idds/agents/carrier/utils.py +++ b/main/lib/idds/agents/carrier/utils.py @@ -62,6 +62,10 @@ def get_new_content(request_id, transform_id, workload_id, map_id, input_content content['min_id'] = 0 if content['max_id'] is None: content['max_id'] = 0 + if 'sub_map_id' in input_content: + content['sub_map_id'] = input_content['sub_map_id'] + if 'dep_sub_map_id' in input_content: + content['dep_sub_map_id'] = input_content['dep_sub_map_id'] return content @@ -177,7 +181,8 @@ def get_input_output_maps(transform_id, work): mapped_input_output_maps = core_transforms.get_transform_input_output_maps(transform_id, input_coll_ids=input_coll_ids, output_coll_ids=output_coll_ids, - log_coll_ids=log_coll_ids) + log_coll_ids=log_coll_ids, + with_sub_map_id=work.with_sub_map_id()) # work_name_to_coll_map = core_transforms.get_work_name_to_coll_map(request_id=transform['request_id']) # work.set_work_name_to_coll_map(work_name_to_coll_map) @@ -191,26 +196,6 @@ def get_ext_contents(transform_id, work): return contents_ids -def resolve_input_dependency_id(request_id, new_input_dep_coll_ids, new_input_dependency_contents, logger=None, log_prefix=''): - logger = get_logger(logger) - - logger.info(log_prefix + "resolve_input_dependency_id: new_input_dep_coll_ids: %s" % (str(new_input_dep_coll_ids))) - logger.info(log_prefix + "resolve_input_dependency_id: len(new_input_dependency_contents): %s" % len(new_input_dependency_contents)) - - if new_input_dep_coll_ids: - contents = core_catalog.get_contents_by_coll_id_status(coll_id=new_input_dep_coll_ids) - content_name_id_map = {} - for content in contents: - if content['coll_id'] not in content_name_id_map: - content_name_id_map[content['coll_id']] = {} - content_name_id_map[content['coll_id']][content['name']] = content['content_id'] - for content in new_input_dependency_contents: - if content['coll_id'] in content_name_id_map and content['name'] in content_name_id_map[content['coll_id']]: - content_dep_id = content_name_id_map[content['coll_id']][content['name']] - content['content_dep_id'] = content_dep_id - return new_input_dependency_contents - - def get_new_contents(request_id, transform_id, workload_id, new_input_output_maps, logger=None, log_prefix=''): logger = get_logger(logger) @@ -239,7 +224,6 @@ def get_new_contents(request_id, transform_id, workload_id, new_input_output_map content = get_new_content(request_id, transform_id, workload_id, map_id, log_content, content_relation_type=ContentRelationType.Log) 
new_log_contents.append(content) - # new_input_dependency_contents = resolve_input_dependency_id(request_id, new_input_dep_coll_ids, new_input_dependency_contents, logger=logger, log_prefix=log_prefix) return new_input_contents, new_output_contents, new_log_contents, new_input_dependency_contents @@ -261,34 +245,40 @@ def get_update_contents(request_id, transform_id, workload_id, input_output_maps outputs = input_output_maps[map_id]['outputs'] if 'outputs' in input_output_maps[map_id] else [] # logs = input_output_maps[map_id]['logs'] if 'logs' in input_output_maps[map_id] else [] - content_update_status = None - if is_all_contents_available(inputs_dependency): - # logger.debug("all input dependency available: %s, inputs: %s" % (str(inputs_dependency), str(inputs))) - content_update_status = ContentStatus.Available - elif is_all_contents_terminated(inputs_dependency): - # logger.debug("all input dependency terminated: %s, inputs: %s, outputs: %s" % (str(inputs_dependency), str(inputs), str(outputs))) - content_update_status = ContentStatus.Missing - - if content_update_status: - for content in inputs: - content['substatus'] = content_update_status - if content['status'] != content['substatus']: - updated_content, content = get_update_content(content) - updated_contents.append(updated_content) - updated_input_contents_full.append(content) - if content_update_status in [ContentStatus.Missing]: - for content in outputs: - content['substatus'] = content_update_status + input_output_sub_maps = get_input_output_sub_maps(inputs, outputs, inputs_dependency) + for sub_map_id in input_output_sub_maps: + inputs_sub = input_output_sub_maps[sub_map_id]['inputs'] + outputs_sub = input_output_sub_maps[sub_map_id]['inputs'] + inputs_dependency_sub = input_output_sub_maps[sub_map_id]['inputs_dependency'] + + content_update_status = None + if is_all_contents_available(inputs_dependency_sub): + # logger.debug("all input dependency available: %s, inputs: %s" % (str(inputs_dependency), str(inputs))) + content_update_status = ContentStatus.Available + elif is_all_contents_terminated(inputs_dependency_sub): + # logger.debug("all input dependency terminated: %s, inputs: %s, outputs: %s" % (str(inputs_dependency), str(inputs), str(outputs))) + content_update_status = ContentStatus.Missing + + if content_update_status: + for content in inputs_sub: + content['substatus'] = content_update_status + if content['status'] != content['substatus']: + updated_content, content = get_update_content(content) + updated_contents.append(updated_content) + updated_input_contents_full.append(content) + if content_update_status in [ContentStatus.Missing]: + for content in outputs_sub: + content['substatus'] = content_update_status + if content['status'] != content['substatus']: + updated_content, content = get_update_content(content) + updated_contents.append(updated_content) + updated_output_contents_full.append(content) + + for content in outputs_sub: if content['status'] != content['substatus']: updated_content, content = get_update_content(content) updated_contents.append(updated_content) updated_output_contents_full.append(content) - - for content in outputs: - if content['status'] != content['substatus']: - updated_content, content = get_update_content(content) - updated_contents.append(updated_content) - updated_output_contents_full.append(content) return updated_contents, updated_input_contents_full, updated_output_contents_full @@ -316,7 +306,9 @@ def generate_file_messages(request_id, transform_id, workload_id, work, 
files, r 'name': file['name'], 'path': file['path'], 'map_id': file['map_id'], - 'content_id': file['content_id'], + 'content_id': file['content_id'] if 'content_id' in file else None, + 'external_coll_id': file['external_coll_id'] if 'external_coll_id' in file else None, + 'external_content_id': file['external_content_id'] if 'external_content_id' in file else None, 'status': file_status} files_message.append(file_message) msg_content = {'msg_type': i_msg_type_str.value, @@ -540,6 +532,31 @@ def get_updated_contents_by_request(request_id, transform_id, workload_id, work, return updated_contents, updated_contents_full_input, updated_contents_full_output, updated_contents_full_input_deps +def get_input_output_sub_maps(inputs, outputs, inputs_dependency, logs=[]): + input_output_sub_maps = {} + for content in inputs: + sub_map_id = content['sub_map_id'] + if sub_map_id not in input_output_sub_maps: + input_output_sub_maps[sub_map_id] = {'inputs': [], 'outputs': [], 'logs': [], 'inputs_dependency': []} + input_output_sub_maps[sub_map_id]['inputs'].append(content) + for content in inputs_dependency: + sub_map_id = content['sub_map_id'] + if sub_map_id not in input_output_sub_maps: + input_output_sub_maps[sub_map_id] = {'inputs': [], 'outputs': [], 'logs': [], 'inputs_dependency': []} + input_output_sub_maps[sub_map_id]['inputs_dependency'].append(content) + for content in outputs: + sub_map_id = content['sub_map_id'] + if sub_map_id not in input_output_sub_maps: + input_output_sub_maps[sub_map_id] = {'inputs': [], 'outputs': [], 'logs': [], 'inputs_dependency': []} + input_output_sub_maps[sub_map_id]['outputs'].append(content) + for content in logs: + sub_map_id = content['sub_map_id'] + if sub_map_id not in input_output_sub_maps: + input_output_sub_maps[sub_map_id] = {'inputs': [], 'outputs': [], 'logs': [], 'inputs_dependency': []} + input_output_sub_maps[sub_map_id]['logs'].append(content) + return input_output_sub_maps + + def get_updated_contents_by_input_output_maps(input_output_maps=None, logger=None, log_prefix=''): updated_contents, updated_contents_full_input, updated_contents_full_output = [], [], [] updated_contents_full_input_deps = [] @@ -555,6 +572,8 @@ def get_updated_contents_by_input_output_maps(input_output_maps=None, logger=Non outputs = input_output_maps[map_id]['outputs'] if 'outputs' in input_output_maps[map_id] else [] # logs = input_output_maps[map_id]['logs'] if 'logs' in input_output_maps[map_id] else [] + input_output_sub_maps = get_input_output_sub_maps(inputs, outputs, inputs_dependency) + for content in inputs: if (content['status'] != content['substatus']) and content['substatus'] in status_to_check: u_content = {'content_id': content['content_id'], @@ -588,52 +607,57 @@ def get_updated_contents_by_input_output_maps(input_output_maps=None, logger=Non updated_contents.append(u_content) updated_contents_full_input_deps.append(content) - input_content_update_status = None - if is_all_contents_available(inputs_dependency): - input_content_update_status = ContentStatus.Available - elif is_all_contents_terminated(inputs_dependency): - input_content_update_status = ContentStatus.Missing - if input_content_update_status: - for content in inputs: - if content['substatus'] != input_content_update_status: - u_content = {'content_id': content['content_id'], - 'status': input_content_update_status, - 'substatus': input_content_update_status} - updated_contents.append(u_content) - content['status'] = input_content_update_status - content['substatus'] = 
input_content_update_status - updated_contents_full_input.append(content) - u_content_substatus = {'content_id': content['content_id'], - 'substatus': content['substatus'], - 'request_id': content['request_id'], - 'transform_id': content['transform_id'], - 'workload_id': content['workload_id'], - 'coll_id': content['coll_id']} - new_update_contents.append(u_content_substatus) - - output_content_update_status = None - if is_all_contents_available(inputs): - # wait for the job to finish - pass - elif is_all_contents_terminated_but_not_available(inputs): - output_content_update_status = ContentStatus.Missing - if output_content_update_status: - for content in outputs: - if content['substatus'] != output_content_update_status: - u_content = {'content_id': content['content_id'], - 'status': output_content_update_status, - 'substatus': output_content_update_status} - updated_contents.append(u_content) - content['status'] = output_content_update_status - content['substatus'] = output_content_update_status - updated_contents_full_output.append(content) - u_content_substatus = {'content_id': content['content_id'], - 'substatus': content['substatus'], - 'request_id': content['request_id'], - 'transform_id': content['transform_id'], - 'workload_id': content['workload_id'], - 'coll_id': content['coll_id']} - new_update_contents.append(u_content_substatus) + for sub_map_id in input_output_sub_maps: + inputs_sub = input_output_sub_maps[sub_map_id]['inputs'] + outputs_sub = input_output_sub_maps[sub_map_id]['inputs'] + inputs_dependency_sub = input_output_sub_maps[sub_map_id]['inputs_dependency'] + + input_content_update_status = None + if is_all_contents_available(inputs_dependency_sub): + input_content_update_status = ContentStatus.Available + elif is_all_contents_terminated(inputs_dependency_sub): + input_content_update_status = ContentStatus.Missing + if input_content_update_status: + for content in inputs_sub: + if content['substatus'] != input_content_update_status: + u_content = {'content_id': content['content_id'], + 'status': input_content_update_status, + 'substatus': input_content_update_status} + updated_contents.append(u_content) + content['status'] = input_content_update_status + content['substatus'] = input_content_update_status + updated_contents_full_input.append(content) + u_content_substatus = {'content_id': content['content_id'], + 'substatus': content['substatus'], + 'request_id': content['request_id'], + 'transform_id': content['transform_id'], + 'workload_id': content['workload_id'], + 'coll_id': content['coll_id']} + new_update_contents.append(u_content_substatus) + + output_content_update_status = None + if is_all_contents_available(inputs_sub): + # wait for the job to finish + pass + elif is_all_contents_terminated_but_not_available(inputs_sub): + output_content_update_status = ContentStatus.Missing + if output_content_update_status: + for content in outputs_sub: + if content['substatus'] != output_content_update_status: + u_content = {'content_id': content['content_id'], + 'status': output_content_update_status, + 'substatus': output_content_update_status} + updated_contents.append(u_content) + content['status'] = output_content_update_status + content['substatus'] = output_content_update_status + updated_contents_full_output.append(content) + u_content_substatus = {'content_id': content['content_id'], + 'substatus': content['substatus'], + 'request_id': content['request_id'], + 'transform_id': content['transform_id'], + 'workload_id': content['workload_id'], + 'coll_id': 
content['coll_id']} + new_update_contents.append(u_content_substatus) return updated_contents, updated_contents_full_input, updated_contents_full_output, updated_contents_full_input_deps, new_update_contents @@ -800,19 +824,25 @@ def trigger_release_inputs_no_deps(request_id, transform_id, workload_id, work, for map_id in input_output_maps: inputs = input_output_maps[map_id]['inputs'] if 'inputs' in input_output_maps[map_id] else [] inputs_dependency = input_output_maps[map_id]['inputs_dependency'] if 'inputs_dependency' in input_output_maps[map_id] else [] - # outputs = input_output_maps[map_id]['outputs'] if 'outputs' in input_output_maps[map_id] else [] + outputs = input_output_maps[map_id]['outputs'] if 'outputs' in input_output_maps[map_id] else [] # logs = input_output_maps[map_id]['logs'] if 'logs' in input_output_maps[map_id] else [] - if not inputs_dependency: - for content in inputs: - if content['substatus'] != ContentStatus.Available: - u_content = {'content_id': content['content_id'], - # 'status': ContentStatus.Available, - 'substatus': ContentStatus.Available} - update_contents.append(u_content) - content['status'] = ContentStatus.Available - content['substatus'] = ContentStatus.Available - update_input_contents_full[transform_id].append(content) + input_output_sub_maps = get_input_output_sub_maps(inputs, outputs, inputs_dependency) + for sub_map_id in input_output_sub_maps: + inputs_sub = input_output_sub_maps[sub_map_id]['inputs'] + # outputs_sub = input_output_sub_maps[sub_map_id]['inputs'] + inputs_dependency_sub = input_output_sub_maps[sub_map_id]['inputs_dependency'] + + if not inputs_dependency_sub: + for content in inputs_sub: + if content['substatus'] != ContentStatus.Available: + u_content = {'content_id': content['content_id'], + # 'status': ContentStatus.Available, + 'substatus': ContentStatus.Available} + update_contents.append(u_content) + content['status'] = ContentStatus.Available + content['substatus'] = ContentStatus.Available + update_input_contents_full[transform_id].append(content) return update_contents, update_input_contents_full @@ -847,42 +877,48 @@ def trigger_release_inputs(request_id, transform_id, workload_id, work, updated_ outputs = input_output_maps[map_id]['outputs'] if 'outputs' in input_output_maps[map_id] else [] # logs = input_output_maps[map_id]['logs'] if 'logs' in input_output_maps[map_id] else [] - input_content_update_status = None - if is_all_contents_available(inputs_dependency): - input_content_update_status = ContentStatus.Available - elif is_all_contents_terminated(inputs_dependency): - input_content_update_status = ContentStatus.Missing - if input_content_update_status: - for content in inputs_dependency: - # u_content = {'content_id': content['content_id'], 'status': content['substatus']) - # update_contents.append(u_content) + input_output_sub_maps = get_input_output_sub_maps(inputs, outputs, inputs_dependency) + for sub_map_id in input_output_sub_maps: + inputs_sub = input_output_sub_maps[sub_map_id]['inputs'] + outputs_sub = input_output_sub_maps[sub_map_id]['inputs'] + inputs_dependency_sub = input_output_sub_maps[sub_map_id]['inputs_dependency'] + + input_content_update_status = None + if is_all_contents_available(inputs_dependency_sub): + input_content_update_status = ContentStatus.Available + elif is_all_contents_terminated(inputs_dependency_sub): + input_content_update_status = ContentStatus.Missing + if input_content_update_status: + for content in inputs_dependency_sub: + # u_content = {'content_id': 
content['content_id'], 'status': content['substatus']) + # update_contents.append(u_content) + pass + for content in inputs_sub: + u_content = {'content_id': content['content_id'], + 'substatus': input_content_update_status} + update_contents.append(u_content) + content['status'] = input_content_update_status + content['substatus'] = input_content_update_status + update_input_contents_full[transform_id].append(content) + + output_content_update_status = None + if is_all_contents_available(inputs_sub): + # wait for the job to finish + # for content in inputs: + # u_content = {'content_id': content['content_id'], 'status': content['substatus']) + # update_contents.append(u_content) pass - for content in inputs: - u_content = {'content_id': content['content_id'], - 'substatus': input_content_update_status} - update_contents.append(u_content) - content['status'] = input_content_update_status - content['substatus'] = input_content_update_status - update_input_contents_full[transform_id].append(content) - - output_content_update_status = None - if is_all_contents_available(inputs): - # wait for the job to finish - # for content in inputs: - # u_content = {'content_id': content['content_id'], 'status': content['substatus']) - # update_contents.append(u_content) - pass - elif is_all_contents_terminated_but_not_available(inputs): - # for content in inputs: - # u_content = {'content_id': content['content_id'], 'status': content['substatus']) - # update_contents.append(u_content) - pass - output_content_update_status = ContentStatus.Missing - if output_content_update_status: - for content in outputs: - u_content = {'content_id': content['content_id'], - 'substatus': output_content_update_status} - update_contents.append(u_content) + elif is_all_contents_terminated_but_not_available(inputs_sub): + # for content in inputs: + # u_content = {'content_id': content['content_id'], 'status': content['substatus']) + # update_contents.append(u_content) + pass + output_content_update_status = ContentStatus.Missing + if output_content_update_status: + for content in outputs_sub: + u_content = {'content_id': content['content_id'], + 'substatus': output_content_update_status} + update_contents.append(u_content) return update_contents, update_input_contents_full, update_contents_status_name, update_contents_status @@ -892,26 +928,70 @@ def poll_missing_outputs(input_output_maps): for map_id in input_output_maps: inputs = input_output_maps[map_id]['inputs'] if 'inputs' in input_output_maps[map_id] else [] - # inputs_dependency = input_output_maps[map_id]['inputs_dependency'] if 'inputs_dependency' in input_output_maps[map_id] else [] + inputs_dependency = input_output_maps[map_id]['inputs_dependency'] if 'inputs_dependency' in input_output_maps[map_id] else [] outputs = input_output_maps[map_id]['outputs'] if 'outputs' in input_output_maps[map_id] else [] # logs = input_output_maps[map_id]['logs'] if 'logs' in input_output_maps[map_id] else [] - content_update_status = None - if is_all_contents_terminated_but_not_available(inputs): - content_update_status = ContentStatus.Missing + input_output_sub_maps = get_input_output_sub_maps(inputs, outputs, inputs_dependency) + for sub_map_id in input_output_sub_maps: + inputs_sub = input_output_sub_maps[sub_map_id]['inputs'] + outputs_sub = input_output_sub_maps[sub_map_id]['inputs'] + # inputs_dependency_sub = input_output_sub_maps[sub_map_id]['inputs_dependency'] - for content in outputs: - content['substatus'] = content_update_status - if content['status'] != 
content['substatus']: - u_content = {'content_id': content['content_id'], - 'substatus': content['substatus']} + content_update_status = None + if is_all_contents_terminated_but_not_available(inputs_sub): + content_update_status = ContentStatus.Missing + + for content in outputs_sub: + content['substatus'] = content_update_status + if content['status'] != content['substatus']: + u_content = {'content_id': content['content_id'], + 'substatus': content['substatus']} - content_updates_missing.append(u_content) - updated_contents_full_missing.append(content) + content_updates_missing.append(u_content) + updated_contents_full_missing.append(content) return content_updates_missing, updated_contents_full_missing +def has_external_content_id(input_output_maps): + for map_id in input_output_maps: + inputs = input_output_maps[map_id]['inputs'] if 'inputs' in input_output_maps[map_id] else [] + for content in inputs: + if not content['external_content_id']: + return False + return True + + +def get_update_external_content_ids(input_output_maps, external_content_ids): + name_to_id_map = {} + update_contents = [] + for map_id in input_output_maps: + inputs = input_output_maps[map_id]['inputs'] if 'inputs' in input_output_maps[map_id] else [] + outputs = input_output_maps[map_id]['outputs'] if 'outputs' in input_output_maps[map_id] else [] + for content in inputs + outputs: + if content['name'] not in name_to_id_map: + name_to_id_map[content['name']] = [] + name_to_id_map[content['name']].append(content['content_id']) + for dataset in external_content_ids: + dataset_id = dataset['dataset']['id'] + files = dataset['files'] + for file_item in files: + lfn = file_item['lfn'] + # remove scope '00000:' + pos = lfn.find(":") + if pos >= 0: + lfn = lfn[pos + 1:] + file_id = file_item['id'] + content_ids = name_to_id_map.get(lfn, []) + for content_id in content_ids: + update_content = {'content_id': content_id, + 'external_coll_id': dataset_id, + 'external_content_id': file_id} + update_contents.append(update_content) + return update_contents + + def handle_update_processing(processing, agent_attributes, logger=None, log_prefix=''): logger = get_logger(logger) @@ -930,6 +1010,11 @@ def handle_update_processing(processing, agent_attributes, logger=None, log_pref logger.debug(log_prefix + "get_input_output_maps: len: %s" % len(input_output_maps)) logger.debug(log_prefix + "get_input_output_maps.keys[:3]: %s" % str(list(input_output_maps.keys())[:3])) + if work.has_external_content_id() and not has_external_content_id(input_output_maps): + external_content_ids = work.get_external_content_ids(processing, log_prefix=log_prefix) + update_external_content_ids = get_update_external_content_ids(input_output_maps, external_content_ids) + core_catalog.update_contents(update_external_content_ids) + new_input_output_maps = work.get_new_input_output_maps(input_output_maps) logger.debug(log_prefix + "get_new_input_output_maps: len: %s" % len(new_input_output_maps)) logger.debug(log_prefix + "get_new_input_output_maps.keys[:3]: %s" % str(list(new_input_output_maps.keys())[:3])) @@ -1018,7 +1103,7 @@ def handle_trigger_processing(processing, agent_attributes, trigger_new_updates= work.set_agent_attributes(agent_attributes, processing) if (not work.use_dependency_to_release_jobs()) or workload_id is None: - return processing['substatus'], [], [], {}, {}, {}, [] + return processing['substatus'], [], [], {}, {}, {}, [], [] else: if trigger_new_updates: # delete information in the contents_update table, to invoke the trigger. 
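The hunks above regroup each map_id's contents with get_input_output_sub_maps(inputs, outputs, inputs_dependency) and then evaluate is_all_contents_available() / is_all_contents_terminated() per sub map. The helper itself is not part of this excerpt; a minimal sketch of the grouping it is assumed to perform, keyed only on the new sub_map_id column and the 'inputs'/'outputs'/'inputs_dependency' keys the callers read, is:

def get_input_output_sub_maps(inputs, outputs, inputs_dependency):
    # Assumed grouping: split one map_id's contents by sub_map_id so that
    # dependency checks run per sub map instead of over the whole map.
    sub_maps = {}
    for content in inputs + outputs + inputs_dependency:
        sub_map_id = content.get('sub_map_id') or 0
        if sub_map_id not in sub_maps:
            sub_maps[sub_map_id] = {'inputs': [], 'outputs': [], 'inputs_dependency': []}
    for content in inputs:
        sub_maps[content.get('sub_map_id') or 0]['inputs'].append(content)
    for content in outputs:
        sub_maps[content.get('sub_map_id') or 0]['outputs'].append(content)
    for content in inputs_dependency:
        sub_maps[content.get('sub_map_id') or 0]['inputs_dependency'].append(content)
    return sub_maps

Contents that predate the sub_map_id column fall into sub map 0, which matches the default of 0 given to the new column later in this series.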
@@ -1221,7 +1306,9 @@ def get_input_name_content_id_map(request_id, workload_id, transform_id): input_name_content_id_map = {} for content in contents: if content['content_relation_type'] == ContentRelationType.Output: - input_name_content_id_map[content['name']] = content['content_id'] + if content['name'] not in input_name_content_id_map: + input_name_content_id_map[content['name']] = [] + input_name_content_id_map[content['name']].append(content['content_id']) cache.set(input_name_content_id_map_key, input_name_content_id_map) @@ -1244,8 +1331,8 @@ def get_jobid_content_id_map(request_id, workload_id, transform_id, job_id, inpu pos = ip.find(":") ip = ip[pos + 1:] if ip in input_name_content_id_map: - content_id = input_name_content_id_map[ip] - jobid_content_id_map[job_id] = content_id + content_ids = input_name_content_id_map[ip] + jobid_content_id_map[job_id] = content_ids break cache.set(jobid_content_id_map_key, jobid_content_id_map) @@ -1256,10 +1343,10 @@ def get_content_id_from_job_id(request_id, workload_id, transform_id, job_id, in jobid_content_id_map, to_update_jobid = get_jobid_content_id_map(request_id, workload_id, transform_id, job_id, inputs) if str(job_id) in jobid_content_id_map: - content_id = jobid_content_id_map[str(job_id)] + content_ids = jobid_content_id_map[str(job_id)] else: - content_id = None - return content_id, to_update_jobid + content_ids = None + return content_ids, to_update_jobid pending_lock = threading.Lock() @@ -1391,30 +1478,31 @@ def handle_messages_processing(messages, logger=None, log_prefix='', update_proc logger.debug(log_prefix + "(request_id, transform_id, processing_id, status, substatus): %s" % str(ret_req_tf_pr_id)) req_id, tf_id, processing_id, r_status, r_substatus = ret_req_tf_pr_id - content_id, to_update_jobid = get_content_id_from_job_id(req_id, workload_id, tf_id, job_id, inputs) - if content_id: - if to_update_jobid: - u_content = {'content_id': content_id, - 'request_id': req_id, - 'transform_id': tf_id, - 'workload_id': workload_id, - # 'status': get_content_status_from_panda_msg_status(status), - 'substatus': get_content_status_from_panda_msg_status(status), - 'content_metadata': {'panda_id': job_id}} - else: - u_content = {'content_id': content_id, - 'request_id': req_id, - 'transform_id': tf_id, - 'workload_id': workload_id, - 'substatus': get_content_status_from_panda_msg_status(status)} - # # 'status': get_content_status_from_panda_msg_status(status)} - - update_contents.append(u_content) - # if processing_id not in update_processings: - # if processing_id not in update_processings and whether_to_update_processing(processing_id, update_processing_interval): - if processing_id not in update_processings_by_job: - update_processings_by_job.append(processing_id) - logger.debug(log_prefix + "Add to update processing by job: %s" % str(processing_id)) + content_ids, to_update_jobid = get_content_id_from_job_id(req_id, workload_id, tf_id, job_id, inputs) + if content_ids: + for content_id in content_ids: + if to_update_jobid: + u_content = {'content_id': content_id, + 'request_id': req_id, + 'transform_id': tf_id, + 'workload_id': workload_id, + # 'status': get_content_status_from_panda_msg_status(status), + 'substatus': get_content_status_from_panda_msg_status(status), + 'content_metadata': {'panda_id': job_id}} + else: + u_content = {'content_id': content_id, + 'request_id': req_id, + 'transform_id': tf_id, + 'workload_id': workload_id, + 'substatus': get_content_status_from_panda_msg_status(status)} + # # 'status': 
get_content_status_from_panda_msg_status(status)} + + update_contents.append(u_content) + # if processing_id not in update_processings: + # if processing_id not in update_processings and whether_to_update_processing(processing_id, update_processing_interval): + if processing_id not in update_processings_by_job: + update_processings_by_job.append(processing_id) + logger.debug(log_prefix + "Add to update processing by job: %s" % str(processing_id)) return update_processings, update_processings_by_job, terminated_processings, update_contents, [] From b9e61e7429939e9b466c7d81ed0b7187150acfe5 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 4 Jul 2023 15:43:24 +0200 Subject: [PATCH 10/19] fix throttler --- main/lib/idds/agents/clerk/clerk.py | 51 +++++++++++++++-------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/main/lib/idds/agents/clerk/clerk.py b/main/lib/idds/agents/clerk/clerk.py index 6727ea95..fc751ce8 100644 --- a/main/lib/idds/agents/clerk/clerk.py +++ b/main/lib/idds/agents/clerk/clerk.py @@ -359,7 +359,7 @@ def generate_transform(self, req, work, build=False): def get_num_active_requests(self, site_name): cache = get_redis_cache() - num_requests = cache.get("num_requests", default=None) + num_requests = cache.get("num_requests", default={}) if num_requests is None: num_requests = {} active_status = [RequestStatus.New, RequestStatus.Ready, RequestStatus.Throttling] @@ -381,7 +381,7 @@ def get_num_active_requests(self, site_name): def get_num_active_transforms(self, site_name): cache = get_redis_cache() - num_transforms = cache.get("num_transforms", default=None) + num_transforms = cache.get("num_transforms", default={}) if num_transforms is None: num_transforms = {} active_status = [TransformStatus.New, TransformStatus.Ready] @@ -403,7 +403,7 @@ def get_num_active_transforms(self, site_name): def get_num_active_processings(self, site_name): cache = get_redis_cache() - num_processings = cache.get("num_processings", default=None) + num_processings = cache.get("num_processings", default={}) active_transforms = cache.get("active_transforms", default={}) if num_processings is None: num_processings = {} @@ -444,28 +444,29 @@ def get_num_active_contents(self, site_name, active_transform_ids): num_output_contents = cache.get("num_output_contents", default=None) if num_input_contents is None or num_output_contents is None: num_input_contents, num_output_contents = {}, {} - ret = core_catalog.get_content_status_statistics_by_relation_type(all_tf_ids) - for item in ret: - status, relation_type, transform_id, count = item - site = tf_id_site_map[transform_id] - if site not in num_input_contents: - num_input_contents[site] = {'new': 0, 'activated': 0, 'processed': 0} - num_output_contents[site] = {'new': 0, 'activated': 0, 'processed': 0} - if status in [ContentStatus.New]: - if relation_type == ContentRelationType.Input: - num_input_contents[site]['new'] += count - elif relation_type == ContentRelationType.Output: - num_output_contents[site]['new'] += count - if status in [ContentStatus.Activated]: - if relation_type == ContentRelationType.Input: - num_input_contents[site]['activated'] += count - elif relation_type == ContentRelationType.Output: - num_output_contents[site]['activated'] += count - else: - if relation_type == ContentRelationType.Input: - num_input_contents[site]['processed'] += count - elif relation_type == ContentRelationType.Output: - num_output_contents[site]['processed'] += count + if all_tf_ids: + ret = 
core_catalog.get_content_status_statistics_by_relation_type(all_tf_ids) + for item in ret: + status, relation_type, transform_id, count = item + site = tf_id_site_map[transform_id] + if site not in num_input_contents: + num_input_contents[site] = {'new': 0, 'activated': 0, 'processed': 0} + num_output_contents[site] = {'new': 0, 'activated': 0, 'processed': 0} + if status in [ContentStatus.New]: + if relation_type == ContentRelationType.Input: + num_input_contents[site]['new'] += count + elif relation_type == ContentRelationType.Output: + num_output_contents[site]['new'] += count + if status in [ContentStatus.Activated]: + if relation_type == ContentRelationType.Input: + num_input_contents[site]['activated'] += count + elif relation_type == ContentRelationType.Output: + num_output_contents[site]['activated'] += count + else: + if relation_type == ContentRelationType.Input: + num_input_contents[site]['processed'] += count + elif relation_type == ContentRelationType.Output: + num_output_contents[site]['processed'] += count cache.set("num_input_contents", num_input_contents, expire_seconds=self.cache_expire_seconds) cache.set("num_output_contents", num_output_contents, expire_seconds=self.cache_expire_seconds) From d2bd4bcec8690ac4e8e0acf0a59f336a98b4a74b Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 4 Jul 2023 15:44:20 +0200 Subject: [PATCH 11/19] add sub_map_id in core functions --- main/lib/idds/core/processings.py | 15 +++++++++-- main/lib/idds/core/transforms.py | 44 ++++++++++++++++++++++--------- 2 files changed, 44 insertions(+), 15 deletions(-) diff --git a/main/lib/idds/core/processings.py b/main/lib/idds/core/processings.py index c3e8df9d..2b08ef53 100644 --- a/main/lib/idds/core/processings.py +++ b/main/lib/idds/core/processings.py @@ -294,9 +294,20 @@ def resolve_input_dependency_id(new_input_dependency_contents, session=None): for content in contents: if content['coll_id'] not in content_name_id_map: content_name_id_map[content['coll_id']] = {} - content_name_id_map[content['coll_id']][content['name']] = content['content_id'] + if content['name'] not in content_name_id_map[content['coll_id']]: + content_name_id_map[content['coll_id']][content['name']] = {} + # if content['map_id'] not in content_name_id_map[content['coll_id']][content['name']]: + # content_name_id_map[content['coll_id']][content['name']][content['map_id']] = {} + content_name_id_map[content['coll_id']][content['name']][content['sub_map_id']] = content['content_id'] + # content_name_id_map[content['coll_id']][content['name']] = content['content_id'] + for content in new_input_dependency_contents: - content_dep_id = content_name_id_map[content['coll_id']][content['name']] + if 'sub_map_id' not in content or content['sub_map_id'] is None: + content['sub_map_id'] = 0 + dep_sub_map_id = content.get("dep_sub_map_id", 0) + if dep_sub_map_id is None: + dep_sub_map_id = 0 + content_dep_id = content_name_id_map[content['coll_id']][content['name']][dep_sub_map_id] content['content_dep_id'] = content_dep_id return new_input_dependency_contents diff --git a/main/lib/idds/core/transforms.py b/main/lib/idds/core/transforms.py index 581103ee..f6db3fd8 100644 --- a/main/lib/idds/core/transforms.py +++ b/main/lib/idds/core/transforms.py @@ -381,7 +381,7 @@ def clean_next_poll_at(status, session=None): @read_session -def get_transform_input_output_maps(transform_id, input_coll_ids, output_coll_ids, log_coll_ids=[], session=None): +def get_transform_input_output_maps(transform_id, input_coll_ids, output_coll_ids, 
log_coll_ids=[], with_sub_map_id=False, session=None): """ Get transform input output maps. @@ -391,9 +391,15 @@ def get_transform_input_output_maps(transform_id, input_coll_ids, output_coll_id ret = {} for content in contents: map_id = content['map_id'] - if map_id not in ret: - ret[map_id] = {'inputs_dependency': [], 'inputs': [], 'outputs': [], 'logs': [], 'others': []} - + if not with_sub_map_id: + if map_id not in ret: + ret[map_id] = {'inputs_dependency': [], 'inputs': [], 'outputs': [], 'logs': [], 'others': []} + else: + sub_map_id = content['sub_map_id'] + if map_id not in ret: + ret[map_id] = {} + if sub_map_id not in ret[map_id]: + ret[map_id][sub_map_id] = {'inputs_dependency': [], 'inputs': [], 'outputs': [], 'logs': [], 'others': []} """ if content['coll_id'] in input_coll_ids: ret[map_id]['inputs'].append(content) @@ -404,16 +410,28 @@ def get_transform_input_output_maps(transform_id, input_coll_ids, output_coll_id else: ret[map_id]['others'].append(content) """ - if content['content_relation_type'] == ContentRelationType.Input: - ret[map_id]['inputs'].append(content) - elif content['content_relation_type'] == ContentRelationType.InputDependency: - ret[map_id]['inputs_dependency'].append(content) - elif content['content_relation_type'] == ContentRelationType.Output: - ret[map_id]['outputs'].append(content) - elif content['content_relation_type'] == ContentRelationType.Log: - ret[map_id]['logs'].append(content) + if not with_sub_map_id: + if content['content_relation_type'] == ContentRelationType.Input: + ret[map_id]['inputs'].append(content) + elif content['content_relation_type'] == ContentRelationType.InputDependency: + ret[map_id]['inputs_dependency'].append(content) + elif content['content_relation_type'] == ContentRelationType.Output: + ret[map_id]['outputs'].append(content) + elif content['content_relation_type'] == ContentRelationType.Log: + ret[map_id]['logs'].append(content) + else: + ret[map_id]['others'].append(content) else: - ret[map_id]['others'].append(content) + if content['content_relation_type'] == ContentRelationType.Input: + ret[map_id][sub_map_id]['inputs'].append(content) + elif content['content_relation_type'] == ContentRelationType.InputDependency: + ret[map_id][sub_map_id]['inputs_dependency'].append(content) + elif content['content_relation_type'] == ContentRelationType.Output: + ret[map_id][sub_map_id]['outputs'].append(content) + elif content['content_relation_type'] == ContentRelationType.Log: + ret[map_id][sub_map_id]['logs'].append(content) + else: + ret[map_id][sub_map_id]['others'].append(content) return ret From 1819af1a6155c2ee20f755b0f1130a888917c9c2 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 4 Jul 2023 15:44:54 +0200 Subject: [PATCH 12/19] add sub_map_id in db models --- main/lib/idds/orm/base/models.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/main/lib/idds/orm/base/models.py b/main/lib/idds/orm/base/models.py index 0e8d01e0..b94dc7cd 100644 --- a/main/lib/idds/orm/base/models.py +++ b/main/lib/idds/orm/base/models.py @@ -538,6 +538,8 @@ class Content(BASE, ModelBase): request_id = Column(BigInteger().with_variant(Integer, "sqlite"), nullable=False) workload_id = Column(Integer()) map_id = Column(BigInteger().with_variant(Integer, "sqlite"), default=0, nullable=False) + sub_map_id = Column(BigInteger().with_variant(Integer, "sqlite"), default=0) + dep_sub_map_id = Column(BigInteger().with_variant(Integer, "sqlite"), default=0) content_dep_id = Column(BigInteger()) scope = 
Column(String(SCOPE_LENGTH)) name = Column(String(LONG_NAME_LENGTH)) @@ -554,6 +556,10 @@ class Content(BASE, ModelBase): processing_id = Column(Integer()) storage_id = Column(Integer()) retries = Column(Integer(), default=0) + external_coll_id = Column(BigInteger()) + external_content_id = Column(BigInteger()) + external_event_id = Column(BigInteger()) + external_event_status = Column(EnumWithValue(ContentStatus)) path = Column(String(4000)) created_at = Column("created_at", DateTime, default=datetime.datetime.utcnow) updated_at = Column("updated_at", DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow) @@ -565,7 +571,8 @@ class Content(BASE, ModelBase): # UniqueConstraint('name', 'scope', 'coll_id', 'content_type', 'min_id', 'max_id', name='CONTENT_SCOPE_NAME_UQ'), # UniqueConstraint('name', 'scope', 'coll_id', 'min_id', 'max_id', name='CONTENT_SCOPE_NAME_UQ'), # UniqueConstraint('content_id', 'coll_id', name='CONTENTS_UQ'), - UniqueConstraint('transform_id', 'coll_id', 'map_id', 'name', 'min_id', 'max_id', name='CONTENT_ID_UQ'), + # UniqueConstraint('transform_id', 'coll_id', 'map_id', 'name', 'min_id', 'max_id', name='CONTENT_ID_UQ'), + UniqueConstraint('transform_id', 'coll_id', 'map_id', 'sub_map_id', 'dep_sub_map_id', 'content_relation_type', 'name', 'min_id', 'max_id', name='CONTENT_ID_UQ'), ForeignKeyConstraint(['transform_id'], ['transforms.transform_id'], name='CONTENTS_TRANSFORM_ID_FK'), ForeignKeyConstraint(['coll_id'], ['collections.coll_id'], name='CONTENTS_COLL_ID_FK'), CheckConstraint('status IS NOT NULL', name='CONTENTS_STATUS_ID_NN'), From 3b3448716730e41aba05a329233c5baa9add1393 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 4 Jul 2023 15:46:00 +0200 Subject: [PATCH 13/19] add sub_map_id in work and workflow --- workflow/lib/idds/workflow/work.py | 9 +++++++++ workflow/lib/idds/workflowv2/work.py | 15 ++++++++++++--- workflow/lib/idds/workflowv2/workflow.py | 2 +- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/workflow/lib/idds/workflow/work.py b/workflow/lib/idds/workflow/work.py index b6dafe0c..f5cbbe08 100644 --- a/workflow/lib/idds/workflow/work.py +++ b/workflow/lib/idds/workflow/work.py @@ -821,6 +821,9 @@ def collections(self, value): coll_metadata[k] = {'coll_id': coll.coll_id} self.add_metadata_item('collections', coll_metadata) + def with_sub_map_id(self): + return False + @property def processings(self): return self._processings @@ -1828,6 +1831,9 @@ def use_dependency_to_release_jobs(self): def require_ext_contents(self): return False + def has_external_content_id(self): + return False + def set_work_name_to_coll_map(self, work_name_to_coll_map): self.work_name_to_coll_map = work_name_to_coll_map @@ -2260,3 +2266,6 @@ def unset_user_proxy(self): os.environ['X509_USER_PROXY'] = self.original_proxy else: del os.environ['X509_USER_PROXY'] + + def get_external_content_ids(self, processing, log_prefix=''): + return [] diff --git a/workflow/lib/idds/workflowv2/work.py b/workflow/lib/idds/workflowv2/work.py index 3c3d758e..6595544f 100644 --- a/workflow/lib/idds/workflowv2/work.py +++ b/workflow/lib/idds/workflowv2/work.py @@ -6,7 +6,7 @@ # http://www.apache.org/licenses/LICENSE-2.0OA # # Authors: -# - Wen Guan, , 2020 - 2021 +# - Wen Guan, , 2020 - 2023 import copy import datetime @@ -850,6 +850,9 @@ def collections(self, value): coll_metadata[k] = {'coll_id': coll.coll_id} self.add_metadata_item('collections', coll_metadata) + def with_sub_map_id(self): + return False + @property def processings(self): return 
self._processings @@ -1285,9 +1288,9 @@ def clean_work(self): self.last_updated_at = datetime.datetime.utcnow() def set_agent_attributes(self, attrs, req_attributes=None): + if self.agent_attributes is None: + self.agent_attributes = {} if attrs and self.class_name in attrs: - if self.agent_attributes is None: - self.agent_attributes = {} for key, value in attrs[self.class_name].items(): self.agent_attributes[key] = value self.logger.info("agent_attributes: %s" % self.agent_attributes) @@ -1863,6 +1866,9 @@ def use_dependency_to_release_jobs(self): def require_ext_contents(self): return False + def has_external_content_id(self): + return False + def set_work_name_to_coll_map(self, work_name_to_coll_map): self.work_name_to_coll_map = work_name_to_coll_map @@ -2295,3 +2301,6 @@ def unset_user_proxy(self): os.environ['X509_USER_PROXY'] = self.original_proxy else: del os.environ['X509_USER_PROXY'] + + def get_external_content_ids(self, processing, log_prefix=''): + return [] diff --git a/workflow/lib/idds/workflowv2/workflow.py b/workflow/lib/idds/workflowv2/workflow.py index 103c2d53..b2781819 100644 --- a/workflow/lib/idds/workflowv2/workflow.py +++ b/workflow/lib/idds/workflowv2/workflow.py @@ -1893,7 +1893,7 @@ def is_finished(self, synchronize=True): """ *** Function called by Marshaller agent. """ - return self.is_terminated(synchronize=synchronize) and self.num_finished_works == self.num_total_works + return self.is_terminated(synchronize=synchronize) and self.num_finished_works == self.num_total_works and (self.num_total_works > 0) def is_subfinished(self, synchronize=True): """ From db67ea0f6694146af2d075476eb9f2dd795bb00e Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 4 Jul 2023 15:47:31 +0200 Subject: [PATCH 14/19] add alembic versions --- .../versions/b0ec813021d6_add_sub_map_id.py | 45 +++++++++++++++++++ .../f79663a7e94e_add_external_content_id.py | 45 +++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 main/lib/idds/orm/base/alembic/versions/b0ec813021d6_add_sub_map_id.py create mode 100644 main/lib/idds/orm/base/alembic/versions/f79663a7e94e_add_external_content_id.py diff --git a/main/lib/idds/orm/base/alembic/versions/b0ec813021d6_add_sub_map_id.py b/main/lib/idds/orm/base/alembic/versions/b0ec813021d6_add_sub_map_id.py new file mode 100644 index 00000000..470b571e --- /dev/null +++ b/main/lib/idds/orm/base/alembic/versions/b0ec813021d6_add_sub_map_id.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2023 + +"""add sub_map_id + +Revision ID: b0ec813021d6 +Revises: f79663a7e94e +Create Date: 2023-06-22 11:46:41.634551+00:00 + +""" +from alembic import op +from alembic import context +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision = 'b0ec813021d6' +down_revision = 'f79663a7e94e' +branch_labels = None +depends_on = None + + +def upgrade() -> None: + if context.get_context().dialect.name in ['oracle', 'mysql', 'postgresql']: + schema = context.get_context().version_table_schema if context.get_context().version_table_schema else '' + op.add_column('contents', sa.Column('sub_map_id', sa.BigInteger()), schema=schema) + op.add_column('contents', sa.Column('dep_sub_map_id', sa.BigInteger()), schema=schema) + op.drop_constraint(constraint_name="CONTENT_ID_UQ", table_name="contents", schema=schema) + op.create_unique_constraint('CONTENT_ID_UQ', 'contents', ['transform_id', 'coll_id', 'map_id', 'sub_map_id', 'dep_sub_map_id', 'content_relation_type', 'name', 'min_id', 'max_id'], schema=schema) + + +def downgrade() -> None: + if context.get_context().dialect.name in ['oracle', 'mysql', 'postgresql']: + schema = context.get_context().version_table_schema if context.get_context().version_table_schema else '' + op.drop_constraint(constraint_name="CONTENT_ID_UQ", table_name="contents", schema=schema) + op.drop_column('contents', 'sub_map_id', schema=schema) + op.drop_column('contents', 'dep_sub_map_id', schema=schema) + op.create_unique_constraint('CONTENT_ID_UQ', 'contents', ['transform_id', 'coll_id', 'map_id', 'name', 'min_id', 'max_id'], schema=schema) diff --git a/main/lib/idds/orm/base/alembic/versions/f79663a7e94e_add_external_content_id.py b/main/lib/idds/orm/base/alembic/versions/f79663a7e94e_add_external_content_id.py new file mode 100644 index 00000000..44d7b724 --- /dev/null +++ b/main/lib/idds/orm/base/alembic/versions/f79663a7e94e_add_external_content_id.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0OA +# +# Authors: +# - Wen Guan, , 2023 + +"""add external content_id + +Revision ID: f79663a7e94e +Revises: 0204f391c32d +Create Date: 2023-06-22 11:36:20.664961+00:00 + +""" +from alembic import op +from alembic import context +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision = 'f79663a7e94e' +down_revision = '0204f391c32d' +branch_labels = None +depends_on = None + + +def upgrade() -> None: + if context.get_context().dialect.name in ['oracle', 'mysql', 'postgresql']: + schema = context.get_context().version_table_schema if context.get_context().version_table_schema else '' + op.add_column('contents', sa.Column('external_coll_id', sa.BigInteger()), schema=schema) + op.add_column('contents', sa.Column('external_content_id', sa.BigInteger()), schema=schema) + op.add_column('contents', sa.Column('external_event_id', sa.BigInteger()), schema=schema) + op.add_column('contents', sa.Column('external_event_status', sa.Integer()), schema=schema) + + +def downgrade() -> None: + if context.get_context().dialect.name in ['oracle', 'mysql', 'postgresql']: + schema = context.get_context().version_table_schema if context.get_context().version_table_schema else '' + op.drop_column('contents', 'external_coll_id', schema=schema) + op.drop_column('contents', 'external_content_id', schema=schema) + op.drop_column('contents', 'external_event_id', schema=schema) + op.drop_column('contents', 'external_event_status', schema=schema) From bd07998706710c12de6412c161fc9cc114c45d4b Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 4 Jul 2023 15:48:00 +0200 Subject: [PATCH 15/19] add sql --- main/etc/sql/oracle_update.sql | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/main/etc/sql/oracle_update.sql b/main/etc/sql/oracle_update.sql index 1290c58a..26a200f9 100644 --- a/main/etc/sql/oracle_update.sql +++ b/main/etc/sql/oracle_update.sql @@ -433,3 +433,12 @@ CREATE TABLE Throttlers ); alter table Messages add (poll_period INTERVAL DAY TO SECOND DEFAULT '00 00:05:00'); + + +--- 20230626 +alter table contents add (external_coll_id NUMBER(12), external_content_id NUMBER(12), external_event_id NUMBER(12), external_event_status NUMBER(2)); + +alter table contents add (sub_map_id NUMBER(12) default 0); +alter table contents add (dep_sub_map_id NUMBER(12) default 0); +alter table contents drop constraint CONTENT_ID_UQ; +alter table contents add constraint CONTENT_ID_UQ UNIQUE (transform_id, coll_id, map_id, sub_map_id, dep_sub_map_id, content_relation_type, name, min_id, max_id) USING INDEX LOCAL; From 30a774af7dcf3346f4bdad4b44e23ab496f491be Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 4 Jul 2023 15:48:11 +0200 Subject: [PATCH 16/19] add tests --- main/lib/idds/tests/core_tests.py | 3 +- main/lib/idds/tests/panda_test.py | 26 ++++++- main/lib/idds/tests/set_throttlers.py | 2 +- main/lib/idds/tests/test_domapanda.py | 5 +- .../tests/test_domapanda_lsst_workflow.py | 77 +++++++++++++++++-- main/tools/env/environment.yml | 1 + monitor/data/conf.js | 12 +-- 7 files changed, 106 insertions(+), 20 deletions(-) diff --git a/main/lib/idds/tests/core_tests.py b/main/lib/idds/tests/core_tests.py index d9dd89c2..5321cc5a 100644 --- a/main/lib/idds/tests/core_tests.py +++ b/main/lib/idds/tests/core_tests.py @@ -172,11 +172,12 @@ def print_workflow_template(workflow, layers=0): # reqs = get_requests(request_id=28182323, with_request=True, with_detail=False, with_metadata=True) # reqs = get_requests(request_id=385554, with_request=True, with_detail=False, with_metadata=True) reqs = get_requests(request_id=479187, with_request=True, with_detail=False, with_metadata=True) +reqs = get_requests(request_id=4498, with_request=True, with_detail=False, with_metadata=True) for req in reqs: # print(req['request_id']) # print(req) # print(rets) - # print(json_dumps(req, sort_keys=True, 
indent=4)) + print(json_dumps(req, sort_keys=True, indent=4)) # show_works(req) pass if 'build_workflow' in req['request_metadata']: diff --git a/main/lib/idds/tests/panda_test.py b/main/lib/idds/tests/panda_test.py index 9008354a..4ecaad29 100644 --- a/main/lib/idds/tests/panda_test.py +++ b/main/lib/idds/tests/panda_test.py @@ -7,12 +7,18 @@ os.environ['PANDA_URL_SSL'] = 'https://pandaserver-doma.cern.ch:25443/server/panda' os.environ['PANDA_BEHIND_REAL_LB'] = "1" -os.environ['PANDA_URL'] = 'http://rubin-panda-server-dev.slac.stanford.edu:80/server/panda' -os.environ['PANDA_URL_SSL'] = 'https://rubin-panda-server-dev.slac.stanford.edu:8443/server/panda' +# os.environ['PANDA_URL'] = 'http://rubin-panda-server-dev.slac.stanford.edu:80/server/panda' +# os.environ['PANDA_URL_SSL'] = 'https://rubin-panda-server-dev.slac.stanford.edu:8443/server/panda' from pandaclient import Client # noqa E402 +task_ids = [i for i in range(157023, 157050)] +for task_id in task_ids: + print("Killing %s" % task_id) + ret = Client.killTask(task_id) + print(ret) + # jobids = [52690679] jobids = [9] """ @@ -35,9 +41,19 @@ print(f.type) """ -ret = Client.getFullJobStatus(ids=jobids, verbose=False) +# ret = Client.getFullJobStatus(ids=jobids, verbose=False) +# print(ret) + +jediTaskID = 156668 +ret = Client.get_files_in_datasets(jediTaskID, verbose=False) print(ret) +panda_ids = [{'task_id': 157016, 'panda_id': 53943290}] +ret = Client.get_events_status(panda_ids, verbose=True) +print(ret) + +# sys.exit(0) + """ jediTaskID = 10517 # 10607 jediTaskID = 146329 @@ -97,7 +113,7 @@ ret = Client.getJediTaskDetails({'jediTaskID': jediTaskID}, True, True, verbose=False) print(ret) -sys.exit(0) +# sys.exit(0) task_ids = [] # task_ids = [1565, 1566, 1567, 1568, 1570, 1572, 1575, 1576, 1579, 1580, 1581, 1582, 1584, 1585, 1586, 1587, 1588, 1589, 1590, 1591, 1592, 1593, 1597, 1598, 1599, 1601, 1602, 1603, 1604, 1607, 1608, 1609, 1610, 1611, 1612, 1613, 1617] @@ -126,6 +142,8 @@ # task_ids = [i for i in range(151444, 151453)] task_ids = [i for i in range(45, 53)] # task_ids = [] +task_ids = [i for i in range(156974, 156981)] +task_ids = [i for i in range(157023, 157050)] for task_id in task_ids: print("Killing %s" % task_id) Client.killTask(task_id) diff --git a/main/lib/idds/tests/set_throttlers.py b/main/lib/idds/tests/set_throttlers.py index 0f260056..91b604b0 100644 --- a/main/lib/idds/tests/set_throttlers.py +++ b/main/lib/idds/tests/set_throttlers.py @@ -3,6 +3,6 @@ throttler = {'site': 'Default', 'status': ThrottlerStatus.Active, - 'new_contents': 100000, + 'new_contents': 200000, 'queue_contents': 50000} core_throttlers.add_throttler(**throttler) diff --git a/main/lib/idds/tests/test_domapanda.py b/main/lib/idds/tests/test_domapanda.py index c1683bba..e3d85579 100644 --- a/main/lib/idds/tests/test_domapanda.py +++ b/main/lib/idds/tests/test_domapanda.py @@ -44,9 +44,10 @@ # task_queue = 'SLAC_TEST' # task_queue = 'DOMA_LSST_SLAC_TEST' task_queue = 'SLAC_Rubin' +task_queue = 'SLAC_Rubin_Extra_Himem_32Cores' -# task_cloud = 'EU' -# task_queue = 'CC-IN2P3_TEST' +task_cloud = 'EU' +task_queue = 'CC-IN2P3_TEST' # task_cloud = 'EU' # task_queue = 'LANCS_TEST' diff --git a/main/lib/idds/tests/test_domapanda_lsst_workflow.py b/main/lib/idds/tests/test_domapanda_lsst_workflow.py index 4292b683..426c412c 100644 --- a/main/lib/idds/tests/test_domapanda_lsst_workflow.py +++ b/main/lib/idds/tests/test_domapanda_lsst_workflow.py @@ -13,8 +13,9 @@ Test lsst generic workflow. 
""" -# import json +import json import logging +import sys # noqa E402 F401 logging.basicConfig(level=logging.DEBUG) @@ -29,7 +30,7 @@ # from idds.client.client import Client from idds.client.clientmanager import ClientManager # noqa E402 # from idds.common.constants import RequestType, RequestStatus -# from idds.common.utils import get_rest_host +from idds.common.utils import get_rest_host # noqa E402 # from idds.tests.common import get_example_real_tape_stagein_request # from idds.tests.common import get_example_prodsys2_tape_stagein_request @@ -37,9 +38,22 @@ # from idds.workflowv2.workflow import Condition, Workflow from idds.workflowv2.workflow import Workflow, Condition # noqa E402 # from idds.atlas.workflowv2.atlasstageinwork import ATLASStageinWork -from idds.doma.workflowv2.domapandawork import DomaPanDAWork # noqa E402 -from idds.doma.workflowv2.domatree import DomaTree # noqa E402 -from idds.doma.workflowv2.domaeventmap import DomaEventMap # noqa E402 +from idds.doma.workflowv2.domapandawork import DomaPanDAWork # noqa E402 +from idds.doma.workflowv2.domapandaeswork import DomaPanDAESWork # noqa E402 +from idds.doma.workflowv2.domatree import DomaTree # noqa E402 +from idds.doma.workflowv2.domaeventmap import DomaEventMap # noqa E402 + + +task_cloud = 'US' +task_queue = 'SLAC_Rubin' +task_queue = 'SLAC_Rubin_Extra_Himem_32Cores' + + +task_cloud = 'EU' +task_queue = 'CC-IN2P3_TEST' + +# task_cloud = 'EU' +# task_queue = 'LANCS_TEST' def setup_gw_workflow(): @@ -195,13 +209,63 @@ def construct_doma_jobs(generic_workflow): event_map1.load() print(event_map1) + return event_map + + +def setup_workflow(event_map): + pending_time = 12 + # pending_time = None + workflow = Workflow(pending_time=pending_time) + workflow.name = event_map.name + + for task_name in event_map.tasks: + task = event_map.get_task(task_name) + executable = "cmd_line_es_decoder.py" + dependency_map = task.get_dependency_map() + work = DomaPanDAESWork(executable=executable, + primary_input_collection={'scope': 'pseudo_dataset', 'name': 'pseudo_input_collection#1'}, + output_collections=[{'scope': 'pseudo_dataset', 'name': 'pseudo_output_collection#1'}], + log_collections=[], es_dependency_map=dependency_map, + task_name=task_name, task_queue=task_queue, + encode_command_line=True, + prodSourceLabel='managed', + task_log={"dataset": "PandaJob_#{pandaid}/", + "destination": "local", + "param_type": "log", + "token": "local", + "type": "template", + "value": "log.tgz"}, + task_cloud=task_cloud) + workflow.add_work(work) + return workflow + def test(): gw_workflow = setup_gw_workflow() # print(json.dumps(gw_workflow)) # gw_workflow = setup_gw_workflow2() test_show_jobs(gw_workflow) - construct_doma_jobs(gw_workflow) + event_map = construct_doma_jobs(gw_workflow) + + print("event_map") + print(json.dumps(event_map.dict(), sort_keys=True, indent=4)) + # sys.exit(0) + + print("task_dep_map") + + for task_name in event_map.tasks: + task = event_map.get_task(task_name) + print("task_name :%s" % task_name) + print(json.dumps(task.get_dependency_map(), sort_keys=True, indent=4)) + workflow = setup_workflow(event_map) + + # sys.exit(0) + + host = get_rest_host() + wm = ClientManager(host=host) + # wm.set_original_user(user_name="wguandev") + request_id = wm.submit(workflow, use_dataset_name=False) + print(request_id) def test_load(): @@ -240,6 +304,7 @@ def test_load(): to_report = event_job.get_events_to_report() print(to_report) + def test1(): # gw_workflow = setup_gw_workflow() # print(json.dumps(gw_workflow)) diff --git 
a/main/tools/env/environment.yml b/main/tools/env/environment.yml index 53f88041..9602d5db 100644 --- a/main/tools/env/environment.yml +++ b/main/tools/env/environment.yml @@ -31,6 +31,7 @@ dependencies: - deepdiff - pyzmq - oic + - lsst-ctrl-bps - idds-common==0.11.5 - idds-workflow==0.11.5 - idds-client==0.11.5 diff --git a/monitor/data/conf.js b/monitor/data/conf.js index 99b0cb99..014a6e97 100644 --- a/monitor/data/conf.js +++ b/monitor/data/conf.js @@ -1,9 +1,9 @@ var appConfig = { - 'iddsAPI_request': "https://lxplus803.cern.ch:443/idds/monitor_request/null/null", - 'iddsAPI_transform': "https://lxplus803.cern.ch:443/idds/monitor_transform/null/null", - 'iddsAPI_processing': "https://lxplus803.cern.ch:443/idds/monitor_processing/null/null", - 'iddsAPI_request_detail': "https://lxplus803.cern.ch:443/idds/monitor/null/null/true/false/false", - 'iddsAPI_transform_detail': "https://lxplus803.cern.ch:443/idds/monitor/null/null/false/true/false", - 'iddsAPI_processing_detail': "https://lxplus803.cern.ch:443/idds/monitor/null/null/false/false/true" + 'iddsAPI_request': "https://lxplus807.cern.ch:443/idds/monitor_request/null/null", + 'iddsAPI_transform': "https://lxplus807.cern.ch:443/idds/monitor_transform/null/null", + 'iddsAPI_processing': "https://lxplus807.cern.ch:443/idds/monitor_processing/null/null", + 'iddsAPI_request_detail': "https://lxplus807.cern.ch:443/idds/monitor/null/null/true/false/false", + 'iddsAPI_transform_detail': "https://lxplus807.cern.ch:443/idds/monitor/null/null/false/true/false", + 'iddsAPI_processing_detail': "https://lxplus807.cern.ch:443/idds/monitor/null/null/false/false/true" } From f6e3720ad17cb76947740a1fb90ba41fc25cb7c6 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Tue, 11 Jul 2023 21:19:28 +0200 Subject: [PATCH 17/19] fix trigger release jobs for submap --- main/lib/idds/agents/carrier/utils.py | 10 +++++----- main/lib/idds/tests/core_tests.py | 10 +++++++--- main/lib/idds/tests/panda_test.py | 3 +++ main/lib/idds/tests/test_domapanda.py | 9 ++++++--- main/lib/idds/tests/test_domapanda_lsst_workflow.py | 6 +++--- 5 files changed, 24 insertions(+), 14 deletions(-) diff --git a/main/lib/idds/agents/carrier/utils.py b/main/lib/idds/agents/carrier/utils.py index 522261ff..7d866230 100644 --- a/main/lib/idds/agents/carrier/utils.py +++ b/main/lib/idds/agents/carrier/utils.py @@ -248,7 +248,7 @@ def get_update_contents(request_id, transform_id, workload_id, input_output_maps input_output_sub_maps = get_input_output_sub_maps(inputs, outputs, inputs_dependency) for sub_map_id in input_output_sub_maps: inputs_sub = input_output_sub_maps[sub_map_id]['inputs'] - outputs_sub = input_output_sub_maps[sub_map_id]['inputs'] + outputs_sub = input_output_sub_maps[sub_map_id]['outputs'] inputs_dependency_sub = input_output_sub_maps[sub_map_id]['inputs_dependency'] content_update_status = None @@ -609,7 +609,7 @@ def get_updated_contents_by_input_output_maps(input_output_maps=None, logger=Non for sub_map_id in input_output_sub_maps: inputs_sub = input_output_sub_maps[sub_map_id]['inputs'] - outputs_sub = input_output_sub_maps[sub_map_id]['inputs'] + outputs_sub = input_output_sub_maps[sub_map_id]['outputs'] inputs_dependency_sub = input_output_sub_maps[sub_map_id]['inputs_dependency'] input_content_update_status = None @@ -830,7 +830,7 @@ def trigger_release_inputs_no_deps(request_id, transform_id, workload_id, work, input_output_sub_maps = get_input_output_sub_maps(inputs, outputs, inputs_dependency) for sub_map_id in input_output_sub_maps: inputs_sub = 
input_output_sub_maps[sub_map_id]['inputs'] - # outputs_sub = input_output_sub_maps[sub_map_id]['inputs'] + # outputs_sub = input_output_sub_maps[sub_map_id]['outputs'] inputs_dependency_sub = input_output_sub_maps[sub_map_id]['inputs_dependency'] if not inputs_dependency_sub: @@ -880,7 +880,7 @@ def trigger_release_inputs(request_id, transform_id, workload_id, work, updated_ input_output_sub_maps = get_input_output_sub_maps(inputs, outputs, inputs_dependency) for sub_map_id in input_output_sub_maps: inputs_sub = input_output_sub_maps[sub_map_id]['inputs'] - outputs_sub = input_output_sub_maps[sub_map_id]['inputs'] + outputs_sub = input_output_sub_maps[sub_map_id]['outputs'] inputs_dependency_sub = input_output_sub_maps[sub_map_id]['inputs_dependency'] input_content_update_status = None @@ -935,7 +935,7 @@ def poll_missing_outputs(input_output_maps): input_output_sub_maps = get_input_output_sub_maps(inputs, outputs, inputs_dependency) for sub_map_id in input_output_sub_maps: inputs_sub = input_output_sub_maps[sub_map_id]['inputs'] - outputs_sub = input_output_sub_maps[sub_map_id]['inputs'] + outputs_sub = input_output_sub_maps[sub_map_id]['outputs'] # inputs_dependency_sub = input_output_sub_maps[sub_map_id]['inputs_dependency'] content_update_status = None diff --git a/main/lib/idds/tests/core_tests.py b/main/lib/idds/tests/core_tests.py index 5321cc5a..7c750ce4 100644 --- a/main/lib/idds/tests/core_tests.py +++ b/main/lib/idds/tests/core_tests.py @@ -173,6 +173,7 @@ def print_workflow_template(workflow, layers=0): # reqs = get_requests(request_id=385554, with_request=True, with_detail=False, with_metadata=True) reqs = get_requests(request_id=479187, with_request=True, with_detail=False, with_metadata=True) reqs = get_requests(request_id=4498, with_request=True, with_detail=False, with_metadata=True) +reqs = get_requests(request_id=4615, with_request=True, with_detail=False, with_metadata=True) for req in reqs: # print(req['request_id']) # print(req) @@ -223,6 +224,8 @@ def print_workflow_template(workflow, layers=0): print("workflow template") print(json_dumps(workflow.template, sort_keys=True, indent=4)) + +""" sys.exit(0) reqs = get_requests(request_id=28182323, with_request=False, with_detail=True, with_metadata=False) @@ -230,6 +233,7 @@ def print_workflow_template(workflow, layers=0): print(json_dumps(req, sort_keys=True, indent=4)) # sys.exit(0) +""" """ # reqs = get_requests() @@ -245,7 +249,7 @@ def print_workflow_template(workflow, layers=0): """ - +""" tfs = get_transforms(request_id=470) # tfs = get_transforms(transform_id=350723) for tf in tfs: @@ -261,7 +265,7 @@ def print_workflow_template(workflow, layers=0): pass sys.exit(0) - +""" """ msgs = retrieve_messages(workload_id=25972557) @@ -282,7 +286,7 @@ def print_workflow_template(workflow, layers=0): sys.exit(0) """ -prs = get_processings(request_id=373602) +prs = get_processings(request_id=4615) # prs = get_processings(transform_id=350723) i = 0 for pr in prs: diff --git a/main/lib/idds/tests/panda_test.py b/main/lib/idds/tests/panda_test.py index 4ecaad29..ca72b7b3 100644 --- a/main/lib/idds/tests/panda_test.py +++ b/main/lib/idds/tests/panda_test.py @@ -14,6 +14,7 @@ task_ids = [i for i in range(157023, 157050)] +task_ids = [] for task_id in task_ids: print("Killing %s" % task_id) ret = Client.killTask(task_id) @@ -48,7 +49,9 @@ ret = Client.get_files_in_datasets(jediTaskID, verbose=False) print(ret) +print("get events") panda_ids = [{'task_id': 157016, 'panda_id': 53943290}] +panda_ids = [{'task_id': 157076, 
'panda_id': 53943504}] ret = Client.get_events_status(panda_ids, verbose=True) print(ret) diff --git a/main/lib/idds/tests/test_domapanda.py b/main/lib/idds/tests/test_domapanda.py index e3d85579..af964c17 100644 --- a/main/lib/idds/tests/test_domapanda.py +++ b/main/lib/idds/tests/test_domapanda.py @@ -44,10 +44,10 @@ # task_queue = 'SLAC_TEST' # task_queue = 'DOMA_LSST_SLAC_TEST' task_queue = 'SLAC_Rubin' -task_queue = 'SLAC_Rubin_Extra_Himem_32Cores' +# task_queue = 'SLAC_Rubin_Extra_Himem_32Cores' -task_cloud = 'EU' -task_queue = 'CC-IN2P3_TEST' +# task_cloud = 'EU' +# task_queue = 'CC-IN2P3_TEST' # task_cloud = 'EU' # task_queue = 'LANCS_TEST' @@ -138,6 +138,7 @@ def setup_workflow(): log_collections=[], dependency_map=taskN1.dependencies, task_name=taskN1.name, task_queue=task_queue, encode_command_line=True, + task_priority=981, prodSourceLabel='managed', task_log={"dataset": "PandaJob_#{pandaid}/", "destination": "local", @@ -152,6 +153,7 @@ def setup_workflow(): log_collections=[], dependency_map=taskN2.dependencies, task_name=taskN2.name, task_queue=task_queue, encode_command_line=True, + task_priority=881, prodSourceLabel='managed', task_log={"dataset": "PandaJob_#{pandaid}/", "destination": "local", @@ -166,6 +168,7 @@ def setup_workflow(): log_collections=[], dependency_map=taskN3.dependencies, task_name=taskN3.name, task_queue=task_queue, encode_command_line=True, + task_priority=781, prodSourceLabel='managed', task_log={"dataset": "PandaJob_#{pandaid}/", "destination": "local", diff --git a/main/lib/idds/tests/test_domapanda_lsst_workflow.py b/main/lib/idds/tests/test_domapanda_lsst_workflow.py index 426c412c..82017bf8 100644 --- a/main/lib/idds/tests/test_domapanda_lsst_workflow.py +++ b/main/lib/idds/tests/test_domapanda_lsst_workflow.py @@ -46,11 +46,11 @@ task_cloud = 'US' task_queue = 'SLAC_Rubin' -task_queue = 'SLAC_Rubin_Extra_Himem_32Cores' +# task_queue = 'SLAC_Rubin_Extra_Himem_32Cores' -task_cloud = 'EU' -task_queue = 'CC-IN2P3_TEST' +# task_cloud = 'EU' +# task_queue = 'CC-IN2P3_TEST' # task_cloud = 'EU' # task_queue = 'LANCS_TEST' From 3bf40aba4d914daf0f2c3963495c63bb1dcfe03c Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Wed, 12 Jul 2023 17:25:37 +0200 Subject: [PATCH 18/19] fix messaging receiver to recover from a failure --- main/lib/idds/agents/carrier/receiver.py | 4 ++-- main/lib/idds/agents/common/plugins/messaging.py | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/main/lib/idds/agents/carrier/receiver.py b/main/lib/idds/agents/carrier/receiver.py index 069471fc..a94998fd 100644 --- a/main/lib/idds/agents/carrier/receiver.py +++ b/main/lib/idds/agents/carrier/receiver.py @@ -87,11 +87,11 @@ def suspend_receiver(self): def resume_receiver(self): if hasattr(self, 'receiver') and self.receiver: - self.logger.info("Stopping receiver: %s" % self.receiver) + self.logger.info("Resuming receiver: %s" % self.receiver) self.receiver.resume() def is_receiver_started(self): - if hasattr(self, 'receiver') and self.receiver: + if hasattr(self, 'receiver') and self.receiver and self.receiver.is_processing(): return True return False diff --git a/main/lib/idds/agents/common/plugins/messaging.py b/main/lib/idds/agents/common/plugins/messaging.py index 6b181b92..063167de 100644 --- a/main/lib/idds/agents/common/plugins/messaging.py +++ b/main/lib/idds/agents/common/plugins/messaging.py @@ -102,6 +102,9 @@ def suspend(self): def resume(self): self.graceful_suspend.clear() + def is_processing(self): + return (not self.graceful_stop.is_set()) and 
(not self.graceful_suspend.is_set()) + def set_request_queue(self, request_queue): self.request_queue = request_queue @@ -269,6 +272,7 @@ def execute_subscribe(self): except Exception as error: self.logger.error("Messaging receiver throws an exception: %s, %s" % (error, traceback.format_exc())) + sleep_count = 0 while not self.graceful_stop.is_set(): if self.graceful_suspend.is_set(): try: @@ -276,7 +280,12 @@ def execute_subscribe(self): except Exception as error: self.logger.error("Messaging receiver throws an exception: %s, %s" % (error, traceback.format_exc())) time.sleep(1) + sleep_count += 1 + if sleep_count > 300: + self.logger.info("graceful_suspend is set. sleeping") + sleep_count = 0 else: + sleep_count = 0 has_failed_connection = False try: for name in self.receiver_conns: From 8a416ccf7668a2a27c1678e700e3139e691bf9e2 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Wed, 12 Jul 2023 17:28:19 +0200 Subject: [PATCH 19/19] remove libs dependencies not needed for idds client --- client/tools/env/environment.yml | 9 +-------- common/tools/env/environment.yml | 7 +------ workflow/tools/env/environment.yml | 5 ----- 3 files changed, 2 insertions(+), 19 deletions(-) diff --git a/client/tools/env/environment.yml b/client/tools/env/environment.yml index 45756d68..bb4a0e95 100644 --- a/client/tools/env/environment.yml +++ b/client/tools/env/environment.yml @@ -5,14 +5,7 @@ dependencies: - pip: - requests # requests - urllib3 # url connections - - flask # web service - - stomp.py # Messaging broker client - - unittest2 # unit test tool - - pep8 # checks for PEP8 code style compliance - - flake8 # Wrapper around PyFlakes&pep8 - - pytest # python testing tool - - nose # nose test tools - tabulate - argcomplete - idds-common==0.11.5 - - idds-workflow==0.11.5 \ No newline at end of file + - idds-workflow==0.11.5 diff --git a/common/tools/env/environment.yml b/common/tools/env/environment.yml index 3ccd9865..b1a479ef 100644 --- a/common/tools/env/environment.yml +++ b/common/tools/env/environment.yml @@ -5,9 +5,4 @@ dependencies: - pip: - cryptography - pyjwt # Pyjwt - - unittest2 # unit test tool - - pep8 # checks for PEP8 code style compliance - - flake8 # Wrapper around PyFlakes&pep8 - - pytest # python testing tool - - nose # nose test tools - - dogpile.cache \ No newline at end of file + - dogpile.cache diff --git a/workflow/tools/env/environment.yml b/workflow/tools/env/environment.yml index e9ee95f7..581ae0ab 100644 --- a/workflow/tools/env/environment.yml +++ b/workflow/tools/env/environment.yml @@ -3,11 +3,6 @@ dependencies: - python==3.6 - pip - pip: - - unittest2 # unit test tool - - pep8 # checks for PEP8 code style compliance - - flake8 # Wrapper around PyFlakes&pep8 - anytree - networkx - - pytest # python testing tool - - nose # nose test tools - idds-common==0.11.5
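
The receiver changes in patch 18 make liveness observable: is_processing() is False whenever graceful_stop or graceful_suspend is set, so is_receiver_started() now reports a suspended listener as not started. A hypothetical recovery loop showing how an agent could use the new hooks (the watch_receiver function and its stop_event parameter are illustrative, not part of the patch):

import threading

def watch_receiver(agent, stop_event: threading.Event, check_interval: int = 60):
    # 'agent' is assumed to expose the is_receiver_started() and
    # resume_receiver() methods shown in the receiver.py hunks above.
    while not stop_event.is_set():
        if not agent.is_receiver_started():
            # The receiver is stopped or suspended; clearing the suspend
            # flag lets execute_subscribe() reconnect on its next pass.
            agent.resume_receiver()
        stop_event.wait(check_interval)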