Skip to content

Commit

Permalink
Merge pull request #2240 from FedML-AI/alexleung/dev_v0700_4_sync
Browse files Browse the repository at this point in the history
Check the GPU availability using the random API to adapt to rental GPUs.
  • Loading branch information
charlieyl authored Dec 20, 2024
2 parents cb489f8 + e27b830 commit 181621a
Show file tree
Hide file tree
Showing 5 changed files with 6 additions and 5 deletions.
2 changes: 1 addition & 1 deletion python/fedml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
_global_training_type = None
_global_comm_backend = None

__version__ = "0.9.0"
__version__ = "0.9.2"


# This is the deployment environment used for different roles (RD/PM/BD/Public Developers). Potential VALUE: local, dev, test, release
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ def get_gpu_cards() -> List[GPUCard]:

@staticmethod
def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memory: float) -> List[int]:
    """Return the ids of up to ``limit`` currently-available GPU cards.

    NOTE: ``order``, ``max_load`` and ``max_memory`` are accepted for
    backward compatibility but intentionally ignored: on rented/shared
    GPUs the load and memory thresholds filter out cards that are in
    fact usable, so availability is queried with a random order and no
    load/memory limits (see commit "check the gpu availability using
    the random api to adapt to rental gpus").

    :param order: requested ordering of the returned ids (ignored).
    :param limit: maximum number of GPU ids to return.
    :param max_load: maximum allowed GPU load fraction (ignored).
    :param max_memory: maximum allowed GPU memory fraction (ignored).
    :return: list of integer GPU card ids considered available.
    """
    # Previous behavior, kept for reference:
    # return GPUtil.getAvailable(order=order, limit=limit, maxLoad=max_load, maxMemory=max_memory)
    return GPUtil.getAvailable(order='random', limit=limit)

@staticmethod
def get_docker_gpu_device_mapping(gpu_ids: List[int], num_gpus: int = 0) -> Optional[Dict]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ def callback_report_device_info(self, topic, payload):
run_id = payload_json.get("run_id", 0)
listen_edge_id = str(topic).split("/")[-1]
context = payload_json.get("context", None)
need_gpu_info = payload_json.get("need_gpu_info", False)
need_gpu_info = payload_json.get("need_gpu_info", True)
need_running_process_list = payload_json.get("need_running_process_list", False)
model_master_device_id = payload_json.get("model_master_device_id", None)
model_slave_device_id_list = payload_json.get("model_slave_device_id_list", None)
Expand Down
2 changes: 1 addition & 1 deletion python/fedml/core/mlops/mlops_device_perfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def report_gpu_device_info(edge_id, mqtt_mgr=None):
# Do not use the following two lines as the realtime available gpu ids.
# gpu_available_ids = JobRunnerUtils.get_available_gpu_id_list(edge_id)
# gpu_available_ids = JobRunnerUtils.trim_unavailable_gpu_ids(gpu_available_ids)
gpu_cores_available = len(gpu_available_ids)
gpu_cores_available = len(gpu_available_ids) if gpu_available_ids is not None else 0
deploy_worker_id_list = list()
try:
deploy_worker_id_list = json.loads(os.environ.get("FEDML_DEPLOY_WORKER_IDS", "[]"))
Expand Down
2 changes: 1 addition & 1 deletion python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def finalize_options(self):

setup(
name="fedml",
version="0.9.0",
version="0.9.2",
author="FedML Team",
author_email="[email protected]",
description="A research and production integrated edge-cloud library for "
Expand Down

0 comments on commit 181621a

Please sign in to comment.