From e27b830a7080331ed8f46d4e9ff0ceea32d4be2c Mon Sep 17 00:00:00 2001 From: "alex.liang" Date: Fri, 20 Dec 2024 11:15:48 +0800 Subject: [PATCH] check the gpu availability using the random api to adapt to the rental gpus. --- python/fedml/__init__.py | 2 +- .../computing/scheduler/comm_utils/gpu_utils/nvidia_utils.py | 3 ++- .../computing/scheduler/slave/base_slave_protocol_manager.py | 2 +- python/fedml/core/mlops/mlops_device_perfs.py | 2 +- python/setup.py | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/python/fedml/__init__.py b/python/fedml/__init__.py index d6341c25c9..677d06b4e5 100644 --- a/python/fedml/__init__.py +++ b/python/fedml/__init__.py @@ -37,7 +37,7 @@ _global_training_type = None _global_comm_backend = None -__version__ = "0.9.0" +__version__ = "0.9.2" # This is the deployment environment used for different roles (RD/PM/BD/Public Developers). Potential VALUE: local, dev, test, release diff --git a/python/fedml/computing/scheduler/comm_utils/gpu_utils/nvidia_utils.py b/python/fedml/computing/scheduler/comm_utils/gpu_utils/nvidia_utils.py index 34d0c3be1c..a6717de8cb 100644 --- a/python/fedml/computing/scheduler/comm_utils/gpu_utils/nvidia_utils.py +++ b/python/fedml/computing/scheduler/comm_utils/gpu_utils/nvidia_utils.py @@ -25,7 +25,8 @@ def get_gpu_cards() -> List[GPUCard]: @staticmethod def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memory: float) -> List[int]: - return GPUtil.getAvailable(order=order, limit=limit, maxLoad=max_load, maxMemory=max_memory) + # return GPUtil.getAvailable(order=order, limit=limit, maxLoad=max_load, maxMemory=max_memory) + return GPUtil.getAvailable(order='random', limit=limit) @staticmethod def get_docker_gpu_device_mapping(gpu_ids: List[int], num_gpus: int = 0) -> Optional[Dict]: diff --git a/python/fedml/computing/scheduler/slave/base_slave_protocol_manager.py b/python/fedml/computing/scheduler/slave/base_slave_protocol_manager.py index 9ba8c26e15..3acd9f2488 100755 
--- a/python/fedml/computing/scheduler/slave/base_slave_protocol_manager.py +++ b/python/fedml/computing/scheduler/slave/base_slave_protocol_manager.py @@ -321,7 +321,7 @@ def callback_report_device_info(self, topic, payload): run_id = payload_json.get("run_id", 0) listen_edge_id = str(topic).split("/")[-1] context = payload_json.get("context", None) - need_gpu_info = payload_json.get("need_gpu_info", False) + need_gpu_info = payload_json.get("need_gpu_info", True) need_running_process_list = payload_json.get("need_running_process_list", False) model_master_device_id = payload_json.get("model_master_device_id", None) model_slave_device_id_list = payload_json.get("model_slave_device_id_list", None) diff --git a/python/fedml/core/mlops/mlops_device_perfs.py b/python/fedml/core/mlops/mlops_device_perfs.py index 4bb41df73f..0c2bde6785 100644 --- a/python/fedml/core/mlops/mlops_device_perfs.py +++ b/python/fedml/core/mlops/mlops_device_perfs.py @@ -233,7 +233,7 @@ def report_gpu_device_info(edge_id, mqtt_mgr=None): # Do not use the following two lines as the realtime available gpu ids. # gpu_available_ids = JobRunnerUtils.get_available_gpu_id_list(edge_id) # gpu_available_ids = JobRunnerUtils.trim_unavailable_gpu_ids(gpu_available_ids) - gpu_cores_available = len(gpu_available_ids) + gpu_cores_available = len(gpu_available_ids) if gpu_available_ids is not None else 0 deploy_worker_id_list = list() try: deploy_worker_id_list = json.loads(os.environ.get("FEDML_DEPLOY_WORKER_IDS", "[]")) diff --git a/python/setup.py b/python/setup.py index 3847f360fa..032bdb4eed 100644 --- a/python/setup.py +++ b/python/setup.py @@ -126,7 +126,7 @@ def finalize_options(self): setup( name="fedml", - version="0.9.0", + version="0.9.2", author="FedML Team", author_email="ch@fedml.ai", description="A research and production integrated edge-cloud library for "