Skip to content

Commit

Permalink
Merge pull request #2240 from FedML-AI/alexleung/dev_v0700_4_sync
Browse files Browse the repository at this point in the history
Check the GPU availability using the random API to adapt to rental GPUs.
  • Loading branch information
charlieyl authored Dec 20, 2024
2 parents cb489f8 + e27b830 commit 181621a
Show file tree
Hide file tree
Showing 5 changed files with 6 additions and 5 deletions.
2 changes: 1 addition & 1 deletion python/fedml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
_global_training_type = None
_global_comm_backend = None

__version__ = "0.9.0"
__version__ = "0.9.2"


# This is the deployment environment used for different roles (RD/PM/BD/Public Developers). Potential VALUE: local, dev, test, release
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ def get_gpu_cards() -> List[GPUCard]:

@staticmethod
def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memory: float) -> List[int]:
    """Return the ids of up to ``limit`` currently-available GPU cards.

    NOTE: ``order``, ``max_load`` and ``max_memory`` are accepted for
    backward compatibility but intentionally ignored: on rented/shared
    GPUs the load and memory thresholds filter out cards that are in
    fact usable, so availability is queried with a random order and no
    load/memory limits (see commit "check the gpu availability using
    the random api to adapt to rental gpus").

    :param order: requested ordering of the returned ids (ignored).
    :param limit: maximum number of GPU ids to return.
    :param max_load: maximum allowed GPU load fraction (ignored).
    :param max_memory: maximum allowed GPU memory fraction (ignored).
    :return: list of integer GPU card ids considered available.
    """
    # Previous behavior, kept for reference:
    # return GPUtil.getAvailable(order=order, limit=limit, maxLoad=max_load, maxMemory=max_memory)
    return GPUtil.getAvailable(order='random', limit=limit)

@staticmethod
def get_docker_gpu_device_mapping(gpu_ids: List[int], num_gpus: int = 0) -> Optional[Dict]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ def callback_report_device_info(self, topic, payload):
run_id = payload_json.get("run_id", 0)
listen_edge_id = str(topic).split("/")[-1]
context = payload_json.get("context", None)
need_gpu_info = payload_json.get("need_gpu_info", False)
need_gpu_info = payload_json.get("need_gpu_info", True)
need_running_process_list = payload_json.get("need_running_process_list", False)
model_master_device_id = payload_json.get("model_master_device_id", None)
model_slave_device_id_list = payload_json.get("model_slave_device_id_list", None)
Expand Down
2 changes: 1 addition & 1 deletion python/fedml/core/mlops/mlops_device_perfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def report_gpu_device_info(edge_id, mqtt_mgr=None):
# Do not use the following two lines as the realtime available gpu ids.
# gpu_available_ids = JobRunnerUtils.get_available_gpu_id_list(edge_id)
# gpu_available_ids = JobRunnerUtils.trim_unavailable_gpu_ids(gpu_available_ids)
gpu_cores_available = len(gpu_available_ids)
gpu_cores_available = len(gpu_available_ids) if gpu_available_ids is not None else 0
deploy_worker_id_list = list()
try:
deploy_worker_id_list = json.loads(os.environ.get("FEDML_DEPLOY_WORKER_IDS", "[]"))
Expand Down
2 changes: 1 addition & 1 deletion python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def finalize_options(self):

setup(
name="fedml",
version="0.9.0",
version="0.9.2",
author="FedML Team",
author_email="[email protected]",
description="A research and production integrated edge-cloud library for "
Expand Down

0 comments on commit 181621a

Please sign in to comment.