diff --git a/src/job-exporter/src/docker_inspect.py b/src/job-exporter/src/docker_inspect.py index 5491320fe5..222312986b 100644 --- a/src/job-exporter/src/docker_inspect.py +++ b/src/job-exporter/src/docker_inspect.py @@ -28,19 +28,19 @@ class InspectResult(object): """ Represents a task meta data, parsed from docker inspect result """ - def __init__(self, username, job_name, role_name, task_index, gpu_ids, job_instance_id, pid, virtual_cluster): + def __init__(self, username, job_name, role_name, task_index, gpu_ids, job_instance_id, virtual_cluster, pid): self.username = username self.job_name = job_name self.role_name = role_name self.task_index = task_index self.gpu_ids = gpu_ids # comma separated str, str may be minor_number or UUID - self.pid = pid self.job_instance_id = job_instance_id # Used to distinguish job instance with same name but different retry number. self.virtual_cluster = virtual_cluster + self.pid = pid def __repr__(self): - return "username %s, job_name %s, role_name %s, task_index %s, gpu_ids %s, job_instance_id %s pid %s virtual_cluster %s" % \ - (self.username, self.job_name, self.role_name, self.task_index, self.gpu_ids, self.job_instance_id, self.pid, self.virtual_cluster) + return "username %s, job_name %s, role_name %s, task_index %s, gpu_ids %s, job_instance_id %s virtual_cluster %s pid %s " % \ + (self.username, self.job_name, self.role_name, self.task_index, self.gpu_ids, self.job_instance_id, self.virtual_cluster, self.pid) def __eq__(self, o): return self.username == o.username and \ @@ -49,8 +49,8 @@ def __eq__(self, o): self.task_index == o.task_index and \ self.gpu_ids == o.gpu_ids and \ self.job_instance_id == o.job_instance_id and \ - self.pid == o.pid and \ - self.virtual_cluster == o.virtual_cluster + self.virtual_cluster == o.virtual_cluster and \ + self.pid == o.pid keys = {"PAI_JOB_NAME", "PAI_USER_NAME", "PAI_CURRENT_TASK_ROLE_NAME", "GPU_ID", @@ -97,8 +97,8 @@ def parse_docker_inspect(inspect_output, gpu_vender): m.get("PAI_TASK_INDEX"), m.get("GPU_ID"), m.get("JOB_INSTANCE_ID"), - pid, - m.get("PAI_VIRTUAL_CLUSTER")) + m.get("PAI_VIRTUAL_CLUSTER"), + pid) def inspect(container_id, histogram, timeout, gpu_vender): try: diff --git a/src/job-exporter/test/data/docker_inspect_amd.json b/src/job-exporter/test/data/docker_inspect_amd.json index e3eb7892d6..19e5be98b1 100644 --- a/src/job-exporter/test/data/docker_inspect_amd.json +++ b/src/job-exporter/test/data/docker_inspect_amd.json @@ -231,6 +231,7 @@ "PAI_MIN_SUCCEEDED_TASK_COUNT_worker=1", "PAI_USERNAME=dgxadmin", "PAI_CURRENT_TASK_ROLE_NAME=worker", + "PAI_VIRTUAL_CLUSTER=default", "PAI_TASK_INDEX=0", "FC_FRAMEWORK_NAMESPACE=default", "FC_TASK_ATTEMPT_INSTANCE_UID=0_a1310f10-5ea6-11ea-8a2a-90b11c27f535", diff --git a/src/job-exporter/test/data/docker_inspect_kube_launcher_task.json b/src/job-exporter/test/data/docker_inspect_kube_launcher_task.json index 242d792bcd..c65710819a 100644 --- a/src/job-exporter/test/data/docker_inspect_kube_launcher_task.json +++ b/src/job-exporter/test/data/docker_inspect_kube_launcher_task.json @@ -180,6 +180,7 @@ "FC_CONFIGMAP_UID=69c05215-46fa-11e9-8937-000d3ab38724", "PAI_CURRENT_TASK_ROLE_MEM_MB=32768", "PAI_CURRENT_TASK_ROLE_NAME=worker", + "PAI_VIRTUAL_CLUSTER=default", "PAI_OUTPUT_DIR=", "PAI_MIN_FAILED_INSTANCE_worker=", "PAI_JOB_NAME=core~tensorflowcifar10", diff --git a/src/job-exporter/test/data/docker_inspect_sample.json b/src/job-exporter/test/data/docker_inspect_sample.json index d3eb1f22d9..f5f4b0e10b 100644 --- a/src/job-exporter/test/data/docker_inspect_sample.json +++ b/src/job-exporter/test/data/docker_inspect_sample.json @@ -88,6 +88,7 @@ "PAI_HOSTNAME": "paigcr-a-gpu-1058", "PAI_JOB_NAME": "trialslot_nnimain_d65bc5ac", "PAI_USER_NAME": "openmindstudio", + "PAI_VIRTUAL_CLUSTER": "default", "com.nvidia.build.id": "32579957", "com.nvidia.build.ref": "88bc4e08cac2668ec821eead444e5ede2cafcf25", "com.nvidia.cuda.version": "8.0.61", diff --git a/src/job-exporter/test/data/inspect_result_bug_fix.json b/src/job-exporter/test/data/inspect_result_bug_fix.json index 171c58a23c..4ececad688 100644 --- a/src/job-exporter/test/data/inspect_result_bug_fix.json +++ b/src/job-exporter/test/data/inspect_result_bug_fix.json @@ -250,6 +250,7 @@ "FRAMEWORK_NAME=sokoya~train-exp_offrl_sc_discard_0231-10th-beta07-lrfixed_13e9bf5_gCYv", "HADOOP_USER_NAME=sokoya", "PAI_TASK_INDEX=0", + "PAI_VIRTUAL_CLUSTER=default", "PAI_CONTAINER_HOST_IP=10.151.40.211", "PAI_CONTAINER_HOST_PORT_LIST=tensorboard:15452;http:15453;ssh:15454;", "PAI_CONTAINER_ID=container_e47_1553664769226_0080_01_000025", diff --git a/src/job-exporter/test/test_collector.py b/src/job-exporter/test/test_collector.py index 2acfbb39d8..d96ec79a68 100644 --- a/src/job-exporter/test/test_collector.py +++ b/src/job-exporter/test/test_collector.py @@ -48,8 +48,8 @@ def test_parse_from_labels(self): "0", "0,1,", "application_1522829300813_1943", - 12345, - "default") + "default", + 12345) gpu_ids, labels = ContainerCollector.parse_from_labels(inspect_result, None) self.assertEqual(["0", "1"], gpu_ids) diff --git a/src/job-exporter/test/test_docker_inspect.py b/src/job-exporter/test/test_docker_inspect.py index a6a76cad3c..53f3e5dcff 100644 --- a/src/job-exporter/test/test_docker_inspect.py +++ b/src/job-exporter/test/test_docker_inspect.py @@ -49,6 +49,7 @@ def test_parse_docker_inspect(self): "trialslot_nnimain_d65bc5ac", "tuner", "0", "0,1,", "application_1522829300813_1943", + "default", 95539) self.assertEqual(target_inspect_info, inspect_info) @@ -62,7 +63,7 @@ def test_parse_docker_inspect_kube(self): target_inspect_info = InspectResult( "core", "core~tensorflowcifar10", "worker", "0", "GPU-dc0671b0-61a4-443e-f456-f8fa6359b788", - "0_69c05215-46fa-11e9-8937-000d3ab38724", 23774) + "0_69c05215-46fa-11e9-8937-000d3ab38724", "default", 23774) self.assertEqual(target_inspect_info, inspect_info) def test_parse_docker_inspect_BUGFIX(self): @@ -74,7 +75,7 @@ def test_parse_docker_inspect_BUGFIX(self): target_inspect_info = InspectResult( "sokoya", "sokoya~train-exp_offrl_sc_discard_0231-10th-beta07-lrfixed_13e9bf5_gCYv", - "train", "0", "3,2,1,0", "application_1553664769226_0080", 30332) + "train", "0", "3,2,1,0", "application_1553664769226_0080", "default", 30332) self.assertEqual(target_inspect_info, inspect_info) def test_adapt_dlts_jobs(self): @@ -85,7 +86,7 @@ def test_adapt_dlts_jobs(self): inspect_info = parse_docker_inspect(docker_inspect, GpuVendor.NVIDIA) target_inspect_info = InspectResult( "dixu", "0c435eee-d31f-43d5-a1b3-442845fa1d0c", None, None, - "GPU-7c583998-b3ff-a885-8979-2d32d334cde4", None, 3533) + "GPU-7c583998-b3ff-a885-8979-2d32d334cde4", None, None, 3533) self.assertEqual(target_inspect_info, inspect_info) def test_parse_docker_inspect_amd(self): @@ -96,7 +97,7 @@ def test_parse_docker_inspect_amd(self): inspect_info = parse_docker_inspect(docker_inspect, GpuVendor.AMD) target_inspect_info = InspectResult( "dgxadmin", "dgxadmin~rocm_pytorch_mnist_a", "worker", "0", - "0,1", "0_a12d82af-5ea6-11ea-8a2a-90b11c27f535", 29647) + "0,1", "0_a12d82af-5ea6-11ea-8a2a-90b11c27f535", "default", 29647) self.assertEqual(target_inspect_info, inspect_info)