Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Commit

Permalink
fix docker inspect test
Browse files: browse the repository at this point in the history
  • Loading branch information
suiguoxin committed Aug 6, 2020
1 parent 45253ed commit 13c6871
Show file tree
Hide file tree
Showing 7 changed files with 19 additions and 14 deletions.
16 changes: 8 additions & 8 deletions src/job-exporter/src/docker_inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,19 @@

class InspectResult(object):
""" Represents a task meta data, parsed from docker inspect result """
def __init__(self, username, job_name, role_name, task_index, gpu_ids, job_instance_id, pid, virtual_cluster):
def __init__(self, username, job_name, role_name, task_index, gpu_ids, job_instance_id, virtual_cluster, pid):
self.username = username
self.job_name = job_name
self.role_name = role_name
self.task_index = task_index
self.gpu_ids = gpu_ids # comma separated str, str may be minor_number or UUID
self.pid = pid
self.job_instance_id = job_instance_id # Used to distinguish job instance with same name but different retry number.
self.virtual_cluster = virtual_cluster
self.pid = pid

def __repr__(self):
return "username %s, job_name %s, role_name %s, task_index %s, gpu_ids %s, job_instance_id %s pid %s virtual_cluster %s" % \
(self.username, self.job_name, self.role_name, self.task_index, self.gpu_ids, self.job_instance_id, self.pid, self.virtual_cluster)
return "username %s, job_name %s, role_name %s, task_index %s, gpu_ids %s, job_instance_id %s virtual_cluster %s pid %s " % \
(self.username, self.job_name, self.role_name, self.task_index, self.gpu_ids, self.job_instance_id, self.virtual_cluster, self.pid)

def __eq__(self, o):
return self.username == o.username and \
Expand All @@ -49,8 +49,8 @@ def __eq__(self, o):
self.task_index == o.task_index and \
self.gpu_ids == o.gpu_ids and \
self.job_instance_id == o.job_instance_id and \
self.pid == o.pid and \
self.virtual_cluster == o.virtual_cluster
self.virtual_cluster == o.virtual_cluster and \
self.pid == o.pid


keys = {"PAI_JOB_NAME", "PAI_USER_NAME", "PAI_CURRENT_TASK_ROLE_NAME", "GPU_ID",
Expand Down Expand Up @@ -97,8 +97,8 @@ def parse_docker_inspect(inspect_output, gpu_vender):
m.get("PAI_TASK_INDEX"),
m.get("GPU_ID"),
m.get("JOB_INSTANCE_ID"),
pid,
m.get("PAI_VIRTUAL_CLUSTER"))
m.get("PAI_VIRTUAL_CLUSTER"),
pid)

def inspect(container_id, histogram, timeout, gpu_vender):
try:
Expand Down
1 change: 1 addition & 0 deletions src/job-exporter/test/data/docker_inspect_amd.json
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,7 @@
"PAI_MIN_SUCCEEDED_TASK_COUNT_worker=1",
"PAI_USERNAME=dgxadmin",
"PAI_CURRENT_TASK_ROLE_NAME=worker",
"PAI_VIRTUAL_CLUSTER=default",
"PAI_TASK_INDEX=0",
"FC_FRAMEWORK_NAMESPACE=default",
"FC_TASK_ATTEMPT_INSTANCE_UID=0_a1310f10-5ea6-11ea-8a2a-90b11c27f535",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@
"FC_CONFIGMAP_UID=69c05215-46fa-11e9-8937-000d3ab38724",
"PAI_CURRENT_TASK_ROLE_MEM_MB=32768",
"PAI_CURRENT_TASK_ROLE_NAME=worker",
"PAI_VIRTUAL_CLUSTER=default",
"PAI_OUTPUT_DIR=",
"PAI_MIN_FAILED_INSTANCE_worker=",
"PAI_JOB_NAME=core~tensorflowcifar10",
Expand Down
1 change: 1 addition & 0 deletions src/job-exporter/test/data/docker_inspect_sample.json
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
"PAI_HOSTNAME": "paigcr-a-gpu-1058",
"PAI_JOB_NAME": "trialslot_nnimain_d65bc5ac",
"PAI_USER_NAME": "openmindstudio",
"PAI_VIRTUAL_CLUSTER": "default",
"com.nvidia.build.id": "32579957",
"com.nvidia.build.ref": "88bc4e08cac2668ec821eead444e5ede2cafcf25",
"com.nvidia.cuda.version": "8.0.61",
Expand Down
1 change: 1 addition & 0 deletions src/job-exporter/test/data/inspect_result_bug_fix.json
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@
"FRAMEWORK_NAME=sokoya~train-exp_offrl_sc_discard_0231-10th-beta07-lrfixed_13e9bf5_gCYv",
"HADOOP_USER_NAME=sokoya",
"PAI_TASK_INDEX=0",
"PAI_VIRTUAL_CLUSTER=default",
"PAI_CONTAINER_HOST_IP=10.151.40.211",
"PAI_CONTAINER_HOST_PORT_LIST=tensorboard:15452;http:15453;ssh:15454;",
"PAI_CONTAINER_ID=container_e47_1553664769226_0080_01_000025",
Expand Down
4 changes: 2 additions & 2 deletions src/job-exporter/test/test_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ def test_parse_from_labels(self):
"0",
"0,1,",
"application_1522829300813_1943",
12345,
"default")
"default",
12345)

gpu_ids, labels = ContainerCollector.parse_from_labels(inspect_result, None)
self.assertEqual(["0", "1"], gpu_ids)
Expand Down
9 changes: 5 additions & 4 deletions src/job-exporter/test/test_docker_inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def test_parse_docker_inspect(self):
"trialslot_nnimain_d65bc5ac",
"tuner", "0", "0,1,",
"application_1522829300813_1943",
"default",
95539)

self.assertEqual(target_inspect_info, inspect_info)
Expand All @@ -62,7 +63,7 @@ def test_parse_docker_inspect_kube(self):
target_inspect_info = InspectResult(
"core", "core~tensorflowcifar10", "worker", "0",
"GPU-dc0671b0-61a4-443e-f456-f8fa6359b788",
"0_69c05215-46fa-11e9-8937-000d3ab38724", 23774)
"0_69c05215-46fa-11e9-8937-000d3ab38724", "default", 23774)
self.assertEqual(target_inspect_info, inspect_info)

def test_parse_docker_inspect_BUGFIX(self):
Expand All @@ -74,7 +75,7 @@ def test_parse_docker_inspect_BUGFIX(self):
target_inspect_info = InspectResult(
"sokoya",
"sokoya~train-exp_offrl_sc_discard_0231-10th-beta07-lrfixed_13e9bf5_gCYv",
"train", "0", "3,2,1,0", "application_1553664769226_0080", 30332)
"train", "0", "3,2,1,0", "application_1553664769226_0080", "default", 30332)
self.assertEqual(target_inspect_info, inspect_info)

def test_adapt_dlts_jobs(self):
Expand All @@ -85,7 +86,7 @@ def test_adapt_dlts_jobs(self):
inspect_info = parse_docker_inspect(docker_inspect, GpuVendor.NVIDIA)
target_inspect_info = InspectResult(
"dixu", "0c435eee-d31f-43d5-a1b3-442845fa1d0c", None, None,
"GPU-7c583998-b3ff-a885-8979-2d32d334cde4", None, 3533)
"GPU-7c583998-b3ff-a885-8979-2d32d334cde4", None, None, 3533)
self.assertEqual(target_inspect_info, inspect_info)

def test_parse_docker_inspect_amd(self):
Expand All @@ -96,7 +97,7 @@ def test_parse_docker_inspect_amd(self):
inspect_info = parse_docker_inspect(docker_inspect, GpuVendor.AMD)
target_inspect_info = InspectResult(
"dgxadmin", "dgxadmin~rocm_pytorch_mnist_a", "worker", "0",
"0,1", "0_a12d82af-5ea6-11ea-8a2a-90b11c27f535", 29647)
"0,1", "0_a12d82af-5ea6-11ea-8a2a-90b11c27f535", "default", 29647)
self.assertEqual(target_inspect_info, inspect_info)


Expand Down

0 comments on commit 13c6871

Please sign in to comment.