From e27b830a7080331ed8f46d4e9ff0ceea32d4be2c Mon Sep 17 00:00:00 2001 From: "alex.liang" Date: Fri, 20 Dec 2024 11:15:48 +0800 Subject: [PATCH] check the gpu availability using the random api to adapt to the rental gpus. --- python/fedml/__init__.py | 2 +- .../computing/scheduler/comm_utils/gpu_utils/nvidia_utils.py | 3 ++- .../computing/scheduler/slave/base_slave_protocol_manager.py | 2 +- python/fedml/core/mlops/mlops_device_perfs.py | 2 +- python/setup.py | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/python/fedml/__init__.py b/python/fedml/__init__.py index d6341c25c9..677d06b4e5 100644 --- a/python/fedml/__init__.py +++ b/python/fedml/__init__.py @@ -37,7 +37,7 @@ _global_training_type = None _global_comm_backend = None -__version__ = "0.9.0" +__version__ = "0.9.2" # This is the deployment environment used for different roles (RD/PM/BD/Public Developers). Potential VALUE: local, dev, test, release diff --git a/python/fedml/computing/scheduler/comm_utils/gpu_utils/nvidia_utils.py b/python/fedml/computing/scheduler/comm_utils/gpu_utils/nvidia_utils.py index 34d0c3be1c..a6717de8cb 100644 --- a/python/fedml/computing/scheduler/comm_utils/gpu_utils/nvidia_utils.py +++ b/python/fedml/computing/scheduler/comm_utils/gpu_utils/nvidia_utils.py @@ -25,7 +25,8 @@ def get_gpu_cards() -> List[GPUCard]: @staticmethod def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memory: float) -> List[int]: - return GPUtil.getAvailable(order=order, limit=limit, maxLoad=max_load, maxMemory=max_memory) + # return GPUtil.getAvailable(order=order, limit=limit, maxLoad=max_load, maxMemory=max_memory) + return GPUtil.getAvailable(order='random', limit=limit) @staticmethod def get_docker_gpu_device_mapping(gpu_ids: List[int], num_gpus: int = 0) -> Optional[Dict]: diff --git a/python/fedml/computing/scheduler/slave/base_slave_protocol_manager.py b/python/fedml/computing/scheduler/slave/base_slave_protocol_manager.py index 9ba8c26e15..3acd9f2488 100755 
--- a/python/fedml/computing/scheduler/slave/base_slave_protocol_manager.py +++ b/python/fedml/computing/scheduler/slave/base_slave_protocol_manager.py @@ -321,7 +321,7 @@ def callback_report_device_info(self, topic, payload): run_id = payload_json.get("run_id", 0) listen_edge_id = str(topic).split("/")[-1] context = payload_json.get("context", None) - need_gpu_info = payload_json.get("need_gpu_info", False) + need_gpu_info = payload_json.get("need_gpu_info", True) need_running_process_list = payload_json.get("need_running_process_list", False) model_master_device_id = payload_json.get("model_master_device_id", None) model_slave_device_id_list = payload_json.get("model_slave_device_id_list", None) diff --git a/python/fedml/core/mlops/mlops_device_perfs.py b/python/fedml/core/mlops/mlops_device_perfs.py index 4bb41df73f..0c2bde6785 100644 --- a/python/fedml/core/mlops/mlops_device_perfs.py +++ b/python/fedml/core/mlops/mlops_device_perfs.py @@ -233,7 +233,7 @@ def report_gpu_device_info(edge_id, mqtt_mgr=None): # Do not use the following two lines as the realtime available gpu ids. # gpu_available_ids = JobRunnerUtils.get_available_gpu_id_list(edge_id) # gpu_available_ids = JobRunnerUtils.trim_unavailable_gpu_ids(gpu_available_ids) - gpu_cores_available = len(gpu_available_ids) + gpu_cores_available = len(gpu_available_ids) if gpu_available_ids is not None else 0 deploy_worker_id_list = list() try: deploy_worker_id_list = json.loads(os.environ.get("FEDML_DEPLOY_WORKER_IDS", "[]")) diff --git a/python/setup.py b/python/setup.py index 3847f360fa..032bdb4eed 100644 --- a/python/setup.py +++ b/python/setup.py @@ -126,7 +126,7 @@ def finalize_options(self): setup( name="fedml", - version="0.9.0", + version="0.9.2", author="FedML Team", author_email="ch@fedml.ai", description="A research and production integrated edge-cloud library for "