diff --git a/python/README.md b/python/README.md index ad85023e6d..1e30f2b167 100644 --- a/python/README.md +++ b/python/README.md @@ -43,5 +43,5 @@ Other low-level APIs related to security and privacy are also supported. All alg **utils**: Common utilities shared by other modules. -## About FedML, Inc. -https://FedML.ai +## About TensorOpera, Inc. +https://tensoropera.ai diff --git a/python/examples/README.md b/python/examples/README.md index 45086c27cf..32831a63f3 100644 --- a/python/examples/README.md +++ b/python/examples/README.md @@ -2,14 +2,14 @@ # FEDML Examples (Including Prebuilt Jobs in Jobs Store) - `FedML/python/examples` -- examples for training, deployment, and federated learning - - `FedML/python/examples/launch` -- examples for FEDML®Launch - - `FedML/python/examples/serving` -- examples for FEDML®Deploy - - `FedML/python/examples/train` -- examples for FEDML®Train - - `FedML/python/examples/cross_cloud` -- examples for FEDML®Train cross-cloud distributed training + - `FedML/python/examples/launch` -- examples for TensorOpera®Launch + - `FedML/python/examples/serving` -- examples for TensorOpera®Deploy + - `FedML/python/examples/train` -- examples for TensorOpera®Train + - `FedML/python/examples/cross_cloud` -- examples for TensorOpera®Train cross-cloud distributed training - `FedML/python/examples/federate/prebuilt_jobs` -- examples for federated learning prebuilt jobs (FedCV, FedNLP, FedGraphNN, Healthcare, etc.) - `FedML/python/examples/federate/cross_silo` -- examples for cross-silo federated learning - `FedML/python/examples/federate/cross_device` -- examples for cross-device federated learning - `FedML/python/examples/federate/simulation` -- examples for federated learning simulation - - `FedML/python/examples/federate/security` -- examples for FEDML®Federate security related features - - `FedML/python/examples/federate/privacy` -- examples for FEDML®Federate privacy related features - - `FedML/python/examples/federate/federated_analytics` -- examples for FEDML®Federate federated analytics (FA) + - `FedML/python/examples/federate/security` -- examples for TensorOpera®Federate security related features + - `FedML/python/examples/federate/privacy` -- examples for TensorOpera®Federate privacy related features + - `FedML/python/examples/federate/federated_analytics` -- examples for TensorOpera®Federate federated analytics (FA) diff --git a/python/examples/deploy/complex_example/README.md b/python/examples/deploy/complex_example/README.md index 1f67f587fd..b7a03aeea6 100644 --- a/python/examples/deploy/complex_example/README.md +++ b/python/examples/deploy/complex_example/README.md @@ -16,7 +16,7 @@ Use -cf to indicate the configuration file. curl -XPOST localhost:2345/predict -d '{"text": "Hello"}' ``` -## Option 2: Deploy to the Cloud (Using fedml®launch platform) +## Option 2: Deploy to the Cloud (Using TensorOpera®launch platform) - Uncomment the following line in config.yaml For information about the configuration, please refer to fedml ® launch. diff --git a/python/examples/deploy/complex_example/config.yaml b/python/examples/deploy/complex_example/config.yaml index 037183a066..cd658aae33 100644 --- a/python/examples/deploy/complex_example/config.yaml +++ b/python/examples/deploy/complex_example/config.yaml @@ -15,7 +15,7 @@ environment_variables: LOCAL_RANK: "0" # If you do not have any GPU resource but want to serve the model -# Try FedML® Nexus AI Platform, and Uncomment the following lines. 
+# Try TensorOpera® Nexus AI Platform, and Uncomment the following lines. # ------------------------------------------------------------ computing: minimum_num_gpus: 1 # minimum # of GPUs to provision diff --git a/python/examples/deploy/custom_inference_image/README.md b/python/examples/deploy/custom_inference_image/README.md deleted file mode 100644 index 1269e4c064..0000000000 --- a/python/examples/deploy/custom_inference_image/README.md +++ /dev/null @@ -1,48 +0,0 @@ -## Create a model card at local -First, create a model card at local -```bash -fedml model create -n custom_inference_image -cf custom_inference_image.yaml -``` - -## Low Code UI Deploy -Push the model to nexus ai platform -```bash -fedml model push -n custom_inference_image -``` -Do the following docs to deploy the model on nexus ai platform -https://docs-dev.fedml.ai/deploy/low_code_ui - -## CLI Deploy -### Deploy to current machine -Docs: https://docs-dev.fedml.ai/deploy/deploy_local -```bash -fedml model deploy -n custom_inference_image --local -``` - -### Deploy to On-premise -Docs: https://docs-dev.fedml.ai/deploy/deploy_on_premise -```bash -fedml device bind $api_key -``` -```bash -fedml model deploy -n my_model -m $master_ids -w $worker_ids -``` - -### Deploy to GPU Cloud -Docs: https://docs-dev.fedml.ai/deploy/deploy_cloud - -Change the `custom_inference_image.yaml` file, adding following lines -```yaml -computing: - minimum_num_gpus: 1 # minimum # of GPUs to provision - maximum_cost_per_hour: $3000 # max cost per hour for your job per gpu card - #allow_cross_cloud_resources: true # true, false - #device_type: CPU # options: GPU, CPU, hybrid - resource_type: A100-80G # e.g., A100-80G, - # please check the resource type list by "fedml show-resource-type" - # or visiting URL: https://fedml.ai/accelerator_resource_type -``` - -```bash -fedml model deploy -n custom_inference_image -``` \ No newline at end of file diff --git a/python/examples/deploy/custom_inference_image/custom_inference_image.yaml b/python/examples/deploy/custom_inference_image/custom_inference_image.yaml deleted file mode 100644 index 0c62767b40..0000000000 --- a/python/examples/deploy/custom_inference_image/custom_inference_image.yaml +++ /dev/null @@ -1,13 +0,0 @@ -workspace: "./" -job: | - echo "Start serving..." - python3 serve_main.py - -bootstrap: | - echo "Bootstrap start..." - echo "Bootstrap finished!" - -enable_custom_image: true -inference_image_name: "fedml/fedml-default-inference-backend" -deploy_timeout: 1000 - diff --git a/python/examples/deploy/custom_inference_image/quickstart/config.yaml b/python/examples/deploy/custom_inference_image/quickstart/config.yaml new file mode 100644 index 0000000000..382209ad1b --- /dev/null +++ b/python/examples/deploy/custom_inference_image/quickstart/config.yaml @@ -0,0 +1,12 @@ +workspace: "." +inference_image: "your_docker_hub_repo/your_image_name" + +workspace_mount_path: "/my_workspace" # Default is "/home/fedml/models_serving" + +container_run_command: "echo hello && python3 /my_workspace/main_entry.py" + +# If you want to install some packages +# Please write the command in the bootstrap.sh +bootstrap: | + echo "Install some packages..." + echo "Install finished!" 
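For context on how a quickstart config like the one above is typically exercised, here is a minimal, hedged sketch that reuses the CLI calls shown in the other deploy examples in this patch; the model card name `custom_image_quickstart` is an arbitrary placeholder, and the local endpoint port is assumed to match the 2345 used by the other local-deploy examples (it may differ for custom inference images).
```bash
# Create a model card from the quickstart config and deploy it on the current machine,
# mirroring the "fedml model create ... -cf ..." / "fedml model deploy ... --local" flow
# used elsewhere in these examples. The card name below is a placeholder.
fedml model create -n custom_image_quickstart -cf config.yaml
fedml model deploy -n custom_image_quickstart --local

# Query the predictor defined in main_entry.py (port assumed to follow the other
# local-deploy examples; it may differ for custom inference images).
curl -XPOST localhost:2345/predict -d '{"text": "Hello"}'
```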
diff --git a/python/examples/deploy/custom_inference_image/quickstart/main_entry.py b/python/examples/deploy/custom_inference_image/quickstart/main_entry.py new file mode 100644 index 0000000000..7b7caee87b --- /dev/null +++ b/python/examples/deploy/custom_inference_image/quickstart/main_entry.py @@ -0,0 +1,28 @@ +from fedml.serving import FedMLPredictor +from fedml.serving import FedMLInferenceRunner +import uuid + + +class Bot(FedMLPredictor): # Inherit FedMLClientPredictor + def __init__(self): + super().__init__() + + # --- Your model initialization code here, here is a example --- + self.uuid = uuid.uuid4() + # ------------------------------------------- + + def predict(self, request: dict): + input_dict = request + question: str = input_dict.get("text", "").strip() + + # --- Your model inference code here --- + response = f"I am a replica, my id is {self.uuid}" + # --------------------------------------- + + return {"v1_generated_text": f"V1: The answer to your question {question} is: {response}"} + + +if __name__ == "__main__": + chatbot = Bot() + fedml_inference_runner = FedMLInferenceRunner(chatbot) + fedml_inference_runner.run() diff --git a/python/examples/deploy/custom_inference_image/serve_main.py b/python/examples/deploy/custom_inference_image/serve_main.py deleted file mode 100644 index a7a1dd84f3..0000000000 --- a/python/examples/deploy/custom_inference_image/serve_main.py +++ /dev/null @@ -1,16 +0,0 @@ -from fedml.serving import FedMLPredictor -from fedml.serving import FedMLInferenceRunner - - -class DummyPredictor(FedMLPredictor): - def __init__(self): - super().__init__() - - def predict(self, request): - return {"Aloha": request} - - -if __name__ == "__main__": - predictor = DummyPredictor() - fedml_inference_runner = FedMLInferenceRunner(predictor) - fedml_inference_runner.run() \ No newline at end of file diff --git a/python/examples/deploy/custom_inference_image/template.yaml b/python/examples/deploy/custom_inference_image/template.yaml new file mode 100644 index 0000000000..1dd13e1530 --- /dev/null +++ b/python/examples/deploy/custom_inference_image/template.yaml @@ -0,0 +1,22 @@ +# Required +workspace: "./" # We will pacakge all the files in the workspace directory +expose_subdomains: true # For customized image, if you want to route all the subdomains, set to true. e.g. 
localhost:2345/{all-subdomain} +inference_image_name: "" # Container image name +container_run_command: "" # str or list, similar to CMD in the dockerfile +port: 80 # Service port, currently you can only indicate one arbitrary port + +# Optional, these are the default values +readiness_probe: # Probe for checking whether a container is ready for inference + httpGet: + path: "" +environment_variables: {} # Environment variables inside the container +volumes: # Volumes to mount to the container + - workspace_path: "" # Path to the volume in the workspace + mount_path: "" # Path to mount the volume inside the container +deploy_timeout_sec: 900 # Maximum time waiting for deployment to finish (Does not include the time to pull the image) +request_input_example: {} # Example of input request, will be shown in the UI +registry_specs: # Registry information for pulling the image + registry_name: "" + registry_provider: "DockerHub" + registry_user_name: "" + registry_user_password: "" \ No newline at end of file diff --git a/python/examples/deploy/custom_inference_image/tensorrt_llm/tensorrtllm.yaml b/python/examples/deploy/custom_inference_image/tensorrt_llm/tensorrtllm.yaml new file mode 100644 index 0000000000..a72c1f7753 --- /dev/null +++ b/python/examples/deploy/custom_inference_image/tensorrt_llm/tensorrtllm.yaml @@ -0,0 +1,17 @@ +workspace: "./" + +expose_subdomains: true +inference_image_name: "fedml/llama3-8b-tensorrtllm" + +# If you put the model repository in $workspace/model_repository, it will be mounted to /home/fedml/models_serving/model_repository +container_run_command: ["sh", "-c", "cd / && huggingface-cli login --token $your_hf_token && pip install sentencepiece protobuf && python3 tensorrtllm_backend/scripts/launch_triton_server.py --model_repo tensorrtllm_backend/all_models/inflight_batcher_llm --world_size 1 && tail -f /dev/null"] + +readiness_probe: + httpGet: + path: "/v2/health/ready" + +port: 8000 + +deploy_timeout_sec: 1600 + + diff --git a/python/examples/deploy/custom_inference_image/triton_inference_server/template/custom_inference_image.yaml b/python/examples/deploy/custom_inference_image/triton_inference_server/template/custom_inference_image.yaml new file mode 100644 index 0000000000..11ae9f82ff --- /dev/null +++ b/python/examples/deploy/custom_inference_image/triton_inference_server/template/custom_inference_image.yaml @@ -0,0 +1,20 @@ +workspace: "./" + +expose_subdomains: true +inference_image_name: "nvcr.io/nvidia/tritonserver:24.05-py3" + +volumes: + - workspace_path: "./model_repository" + mount_path: "/repo_inside_container" + +container_run_command: "tritonserver --model-repository=/repo_inside_container" + +readiness_probe: + httpGet: + path: "/v2/health/ready" + +port: 8000 + +deploy_timeout_sec: 1600 + +request_input_example: {"text_input": "Hello"} diff --git a/python/examples/deploy/custom_inference_image/triton_inference_server/template/model_repository/dummy/1/model.py b/python/examples/deploy/custom_inference_image/triton_inference_server/template/model_repository/dummy/1/model.py new file mode 100644 index 0000000000..0404a127ff --- /dev/null +++ b/python/examples/deploy/custom_inference_image/triton_inference_server/template/model_repository/dummy/1/model.py @@ -0,0 +1,25 @@ +import json +import numpy as np +import triton_python_backend_utils as pb_utils + +class TritonPythonModel: + def initialize(self, args): + self.model_name = args['model_name'] + + @staticmethod + def auto_complete_config(auto_complete_model_config): + 
auto_complete_model_config.add_input( {"name": "text_input", "data_type": "TYPE_STRING", "dims": [-1]}) + auto_complete_model_config.add_output({"name": "text_output", "data_type": "TYPE_STRING", "dims": [-1]}) + auto_complete_model_config.set_max_batch_size(0) + return auto_complete_model_config + + def execute(self, requests): + responses = [] + for request in requests: + in_numpy = pb_utils.get_input_tensor_by_name(request, "text_input").as_numpy() + assert np.object_ == in_numpy.dtype, 'in this demo, triton passes in a numpy array of size 1 with object_ dtype, this dtype encapsulates a python bytes-array' + print('in this demo len(in_numpy) is 1:', len(in_numpy.tolist())) + out_numpy = np.array([ (self.model_name + ': ' + python_byte_array.decode('utf-8') + ' World').encode('utf-8') for python_byte_array in in_numpy.tolist()], dtype = np.object_) + out_pb = pb_utils.Tensor("text_output", out_numpy) + responses.append(pb_utils.InferenceResponse(output_tensors = [out_pb])) + return responses diff --git a/python/examples/deploy/custom_inference_image/trt-llm-openai/config.yaml b/python/examples/deploy/custom_inference_image/trt-llm-openai/config.yaml new file mode 100644 index 0000000000..1bdcf32f75 --- /dev/null +++ b/python/examples/deploy/custom_inference_image/trt-llm-openai/config.yaml @@ -0,0 +1,22 @@ +workspace: "./" + +inference_image_name: "fedml/trt-llm-openai" + +# The image has its self-contained cmd, no need for rewriting the command +container_run_command: null + +port: 3000 + +readiness_probe: + httpGet: + path: "/health_check" + +# If you do not use serverless container mode, and you want to indicate another resource path, +# e.g. localhost:3000/v1/chat/completions, you can set the following uri: +service: + httpPost: + path: "/v1/chat/completions" + +deploy_timeout_sec: 1600 + +endpoint_api_type: "text2text_llm_openai_chat_completions" \ No newline at end of file diff --git a/python/examples/deploy/debug/inference_timeout/config.yaml b/python/examples/deploy/debug/inference_timeout/config.yaml new file mode 100644 index 0000000000..f6d2566e00 --- /dev/null +++ b/python/examples/deploy/debug/inference_timeout/config.yaml @@ -0,0 +1,10 @@ +workspace: "./src" +entry_point: "serve_main.py" +bootstrap: | + echo "Bootstrap start..." 
+ sleep 5 + echo "Bootstrap finished" +auto_detect_public_ip: true +use_gpu: true + +request_timeout_sec: 10 diff --git a/python/examples/deploy/debug/inference_timeout/src/serve_main.py b/python/examples/deploy/debug/inference_timeout/src/serve_main.py new file mode 100644 index 0000000000..5884e41f85 --- /dev/null +++ b/python/examples/deploy/debug/inference_timeout/src/serve_main.py @@ -0,0 +1,32 @@ +from fedml.serving import FedMLPredictor +from fedml.serving import FedMLInferenceRunner +import uuid +import torch + +# Calculate the number of elements +num_elements = 1_073_741_824 // 4 # using integer division for whole elements + + +class DummyPredictor(FedMLPredictor): + def __init__(self): + super().__init__() + # Create a tensor with these many elements + tensor = torch.empty(num_elements, dtype=torch.float32) + + # Move the tensor to GPU + tensor_gpu = tensor.cuda() + + # for debug + with open("/tmp/dummy_gpu_occupier.txt", "w") as f: + f.write("GPU is occupied") + + self.worker_id = uuid.uuid4() + + def predict(self, request): + return {f"AlohaV0From{self.worker_id}": request} + + +if __name__ == "__main__": + predictor = DummyPredictor() + fedml_inference_runner = FedMLInferenceRunner(predictor) + fedml_inference_runner.run() diff --git a/python/examples/deploy/mnist/README.md b/python/examples/deploy/mnist/README.md index 11dd696234..b64b4bd70e 100644 --- a/python/examples/deploy/mnist/README.md +++ b/python/examples/deploy/mnist/README.md @@ -11,9 +11,9 @@ curl -XPOST localhost:2345/predict -d '{"arr":[$DATA]}' #For $DATA, please check the request_input_example, it is a 28*28=784 float array #Output:{"generated_text":"tensor([0.2333, 0.5296, 0.4350, 0.4537, 0.5424, 0.4583, 0.4803, 0.2862, 0.5507,\n 0.8683], grad_fn=)"} ``` -## Option 2: Deploy to the Cloud (Using fedml® launch platform) +## Option 2: Deploy to the Cloud (Using TensorOpera® launch platform) Uncomment the following line in mnist.yaml, -for infomation about the configuration, please refer to fedml® launch. +for infomation about the configuration, please refer to TensorOpera® launch. ```yaml # computing: # minimum_num_gpus: 1 diff --git a/python/examples/deploy/mnist/mnist.yaml b/python/examples/deploy/mnist/mnist.yaml index fe419abb1c..cae8050674 100644 --- a/python/examples/deploy/mnist/mnist.yaml +++ b/python/examples/deploy/mnist/mnist.yaml @@ -5,7 +5,7 @@ data_cache_dir: "" bootstrap: "" # If you do not have any GPU resource but want to serve the model -# Try FedML® Nexus AI Platform, and Uncomment the following lines. +# Try TensorOpera® Nexus AI Platform, and Uncomment the following lines. # ------------------------------------------------------------ computing: minimum_num_gpus: 1 # minimum # of GPUs to provision diff --git a/python/examples/deploy/multi_service/README.md b/python/examples/deploy/multi_service/README.md index 2b897d087a..59bd7429f3 100644 --- a/python/examples/deploy/multi_service/README.md +++ b/python/examples/deploy/multi_service/README.md @@ -15,7 +15,7 @@ fedml model create --name $model_name --config_file config.yaml ``` ## On-premsie Deploy -Register an account on FedML website: https://fedml.ai +Register an account on TensorOpera website: https://tensoropera.ai You will have a user id and api key, which can be found in the profile page. @@ -44,8 +44,8 @@ You will have a user id and api key, which can be found in the profile page. 
``` - Result - See the deployment result in https://fedml.ai + See the deployment result in https://tensoropera.ai - OPT2: Deploy - UI - Follow the instructions on https://fedml.ai + Follow the instructions on https://tensoropera.ai diff --git a/python/examples/deploy/quick_start/README.md b/python/examples/deploy/quick_start/README.md index 1f67f587fd..b7a03aeea6 100644 --- a/python/examples/deploy/quick_start/README.md +++ b/python/examples/deploy/quick_start/README.md @@ -16,7 +16,7 @@ Use -cf to indicate the configuration file. curl -XPOST localhost:2345/predict -d '{"text": "Hello"}' ``` -## Option 2: Deploy to the Cloud (Using fedml®launch platform) +## Option 2: Deploy to the Cloud (Using TensorOpera®launch platform) - Uncomment the following line in config.yaml For information about the configuration, please refer to fedml ® launch. diff --git a/python/examples/deploy/quick_start/config.yaml b/python/examples/deploy/quick_start/config.yaml index 83479068e6..880ea92d2d 100644 --- a/python/examples/deploy/quick_start/config.yaml +++ b/python/examples/deploy/quick_start/config.yaml @@ -1,21 +1,8 @@ -workspace: "./src" +workspace: "." entry_point: "main_entry.py" + # If you want to install some packages # Please write the command in the bootstrap.sh bootstrap: | - echo "Bootstrap start..." - sh ./config/bootstrap.sh - echo "Bootstrap finished" - -# If you do not have any GPU resource but want to serve the model -# Try FedML® Nexus AI Platform, and Uncomment the following lines. -# ------------------------------------------------------------ -computing: - minimum_num_gpus: 1 # minimum # of GPUs to provision - maximum_cost_per_hour: $3000 # max cost per hour for your job per gpu card - #allow_cross_cloud_resources: true # true, false - #device_type: CPU # options: GPU, CPU, hybrid - resource_type: A100-80G # e.g., A100-80G, - # please check the resource type list by "fedml show-resource-type" - # or visiting URL: https://open.fedml.ai/accelerator_resource_type -# ------------------------------------------------------------ + echo "Install some packages..." + echo "Install finished!" diff --git a/python/examples/deploy/quick_start/main_entry.py b/python/examples/deploy/quick_start/main_entry.py new file mode 100644 index 0000000000..7c4fb910b0 --- /dev/null +++ b/python/examples/deploy/quick_start/main_entry.py @@ -0,0 +1,27 @@ +from fedml.serving import FedMLPredictor +from fedml.serving import FedMLInferenceRunner + + +class Bot(FedMLPredictor): # Inherit FedMLClientPredictor + def __init__(self): + super().__init__() + + # --- Your model initialization code here --- + + # ------------------------------------------- + + def predict(self, request: dict): + input_dict = request + question: str = input_dict.get("text", "").strip() + + # --- Your model inference code here --- + response = "I do not know the answer to your question." 
+ # --------------------------------------- + + return {"generated_text": f"The answer to your question {question} is: {response}"} + + +if __name__ == "__main__": + chatbot = Bot() + fedml_inference_runner = FedMLInferenceRunner(chatbot) + fedml_inference_runner.run() diff --git a/python/examples/deploy/quick_start/src/config/bootstrap.sh b/python/examples/deploy/quick_start/src/config/bootstrap.sh deleted file mode 100644 index 950b749792..0000000000 --- a/python/examples/deploy/quick_start/src/config/bootstrap.sh +++ /dev/null @@ -1,14 +0,0 @@ -### don't modify this part ### -set -x -############################## - - -### please customize your script in this region #### -pip install langchain -pip install transformers -pip install accelerate -pip install "pydantic>=1.8.0,<2.0.0" - -### don't modify this part ### -exit 0 -############################## \ No newline at end of file diff --git a/python/examples/deploy/quick_start/src/main_entry.py b/python/examples/deploy/quick_start/src/main_entry.py deleted file mode 100644 index 82ff90155e..0000000000 --- a/python/examples/deploy/quick_start/src/main_entry.py +++ /dev/null @@ -1,67 +0,0 @@ -import os -from fedml.serving import FedMLPredictor -from fedml.serving import FedMLInferenceRunner -from langchain import PromptTemplate, LLMChain -from langchain.llms import HuggingFacePipeline -import torch -from transformers import ( - AutoConfig, - AutoModelForCausalLM, - AutoTokenizer, - TextGenerationPipeline, -) - -class Chatbot(FedMLPredictor): # Inherit FedMLClientPredictor - def __init__(self): - super().__init__() - PROMPT_FOR_GENERATION_FORMAT = f""""Below is an instruction that describes a task. Write a response that appropriately completes the request." - - ### Instruction: - {{instruction}} - - ### Response: - """ - - prompt = PromptTemplate( - input_variables=["instruction"], - template=PROMPT_FOR_GENERATION_FORMAT - ) - - config = AutoConfig.from_pretrained("EleutherAI/pythia-70m") - model = AutoModelForCausalLM.from_pretrained( - "EleutherAI/pythia-70m", - torch_dtype=torch.float32, # float 16 not supported on CPU - trust_remote_code=True, - device_map="auto" - ) - tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m", device_map="auto") - - hf_pipeline = HuggingFacePipeline( - pipeline=TextGenerationPipeline( - model=model, - tokenizer=tokenizer, - return_full_text=True, - task="text-generation", - do_sample=True, - max_new_tokens=256, - top_p=0.92, - top_k=0 - ) - ) - self.chatbot = LLMChain(llm=hf_pipeline, prompt=prompt, verbose=True) - - def predict(self, request:dict): - input_dict = request - question: str = input_dict.get("text", "").strip() - - if len(question) == 0: - response_text = "" - else: - response_text = self.chatbot.predict(instruction=question) - - return {"generated_text": str(response_text)} - -if __name__ == "__main__": - chatbot = Chatbot() - fedml_inference_runner = FedMLInferenceRunner(chatbot) - fedml_inference_runner.run() \ No newline at end of file diff --git a/python/examples/deploy/scalellm-multi-engine/README.md b/python/examples/deploy/scalellm-multi-engine/README.md index 4de6058c95..b65ad7dd5c 100644 --- a/python/examples/deploy/scalellm-multi-engine/README.md +++ b/python/examples/deploy/scalellm-multi-engine/README.md @@ -40,7 +40,7 @@ computing: #device_type: CPU # options: GPU, CPU, hybrid resource_type: A100-80G # e.g., A100-80G, # please check the resource type list by "fedml show-resource-type" - # or visiting URL: https://fedml.ai/accelerator_resource_type + # or visiting URL: 
https://tensoropera.ai/accelerator_resource_type ``` ```bash diff --git a/python/examples/deploy/scalellm/README.md b/python/examples/deploy/scalellm/README.md index 4de6058c95..b65ad7dd5c 100644 --- a/python/examples/deploy/scalellm/README.md +++ b/python/examples/deploy/scalellm/README.md @@ -40,7 +40,7 @@ computing: #device_type: CPU # options: GPU, CPU, hybrid resource_type: A100-80G # e.g., A100-80G, # please check the resource type list by "fedml show-resource-type" - # or visiting URL: https://fedml.ai/accelerator_resource_type + # or visiting URL: https://tensoropera.ai/accelerator_resource_type ``` ```bash diff --git a/python/examples/deploy/streaming_response/README.md b/python/examples/deploy/streaming_response/README.md index f91cda5278..b190b50dc7 100644 --- a/python/examples/deploy/streaming_response/README.md +++ b/python/examples/deploy/streaming_response/README.md @@ -16,7 +16,7 @@ Use -cf to indicate the configuration file. curl -XPOST localhost:2345/predict -d '{"text": "Hello"}' ``` -## Option 2: Deploy to the Cloud (Using fedml®launch platform) +## Option 2: Deploy to the Cloud (Using TensorOpera®launch platform) - Uncomment the following line in config.yaml For information about the configuration, please refer to fedml ® launch. diff --git a/python/examples/deploy/streaming_response/config.yaml b/python/examples/deploy/streaming_response/config.yaml index 83479068e6..1a18b9d85b 100644 --- a/python/examples/deploy/streaming_response/config.yaml +++ b/python/examples/deploy/streaming_response/config.yaml @@ -8,7 +8,7 @@ bootstrap: | echo "Bootstrap finished" # If you do not have any GPU resource but want to serve the model -# Try FedML® Nexus AI Platform, and Uncomment the following lines. +# Try TensorOpera® Nexus AI Platform, and Uncomment the following lines. # ------------------------------------------------------------ computing: minimum_num_gpus: 1 # minimum # of GPUs to provision diff --git a/python/examples/deploy/triton/README.md b/python/examples/deploy/triton/README.md index 4d861fb7ff..5430939d28 100644 --- a/python/examples/deploy/triton/README.md +++ b/python/examples/deploy/triton/README.md @@ -39,7 +39,7 @@ fedml model create --name $model_name --config_file config.yaml ``` ## On-premsie Deploy -Register an account on FedML website: https://fedml.ai +Register an account on TensorOpera website: https://tensoropera.ai You will have a user id and api key, which can be found in the profile page. @@ -68,8 +68,8 @@ You will have a user id and api key, which can be found in the profile page. ``` - Result - See the deployment result in https://fedml.ai + See the deployment result in https://tensoropera.ai - OPT2: Deploy - UI - Follow the instructions on https://fedml.ai + Follow the instructions on https://tensoropera.ai diff --git a/python/examples/deploy/your_own_llm/README.md b/python/examples/deploy/your_own_llm/README.md index fc7234293b..415db7fe92 100644 --- a/python/examples/deploy/your_own_llm/README.md +++ b/python/examples/deploy/your_own_llm/README.md @@ -9,9 +9,9 @@ fedml model deploy --name llm --local #INFO: Uvicorn running on http://0.0.0.0:2345 (Press CTRL+C to quit) curl -XPOST localhost:2345/predict -d '{"text": "Hello"}' ``` -## Option 2: Deploy to the Cloud (Using fedml®launch platform) +## Option 2: Deploy to the Cloud (Using TensorOpera®launch platform) Uncomment the following line in llm.yaml, -for infomation about the configuration, please refer to fedml®launch. +for infomation about the configuration, please refer to TensorOpera®launch. 
```yaml # computing: # minimum_num_gpus: 1 diff --git a/python/examples/deploy/your_own_llm/llm.yaml b/python/examples/deploy/your_own_llm/llm.yaml index 5e5e09730b..b3b3d5da15 100644 --- a/python/examples/deploy/your_own_llm/llm.yaml +++ b/python/examples/deploy/your_own_llm/llm.yaml @@ -11,7 +11,7 @@ bootstrap: | echo "Bootstrap finished" # If you do not have any GPU resource but want to serve the model -# Try FedML® Nexus AI Platform, and Uncomment the following lines. +# Try TensorOpera® Nexus AI Platform, and Uncomment the following lines. # ------------------------------------------------------------ # computing: # minimum_num_gpus: 1 # minimum # of GPUs to provision diff --git a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/README.md b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/README.md index 47250ef894..3ee1850a03 100644 --- a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/README.md +++ b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/README.md @@ -8,22 +8,25 @@ comm_args: grpc_ipconfig_path: config/grpc_ipconfig.csv ``` -`grpc_ipconfig_path` specifies the path of the config for gRPC communication. Config file specifies an ip address for each process through with they can communicate with each other. The config file should have the folliwng format: +`grpc_ipconfig_path` specifies the path of the config file for gRPC communication. The config file specifies an ip address for each process through which they can communicate with each other. The config file should have the following format: ```csv -receiver_id,ip -0,127.0.0.1 -1,127.0.0.1 -2,127.0.0.1 +eid,rank,grpc_server_ip,grpc_server_port +0,0,0.0.0.0,8890 +1,1,0.0.0.0,8899 +2,2,0.0.0.0,8898 ``` -Here the `receiver_id` is the rank of the process. +Here, `eid, rank, ip, port` are the id, rank, ip address and port of the server or client process. For server processes the rank is always set to 0, while for clients it is always set to 1 or above. ## One Line API Example -Example is provided at: +Examples are provided at: `python/examples/cross_silo/grpc_fedavg_mnist_lr_example/one_line` +`python/examples/cross_silo/grpc_fedavg_mnist_lr_example/step_by_step` +`python/examples/cross_silo/grpc_fedavg_mnist_lr_example/custom_data_and_model` + ### Training Script At the client side, the client ID (a.k.a rank) starts from 1. diff --git a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/custom_data_and_model/README.md b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/custom_data_and_model/README.md index d125847dd6..22e628e502 100644 --- a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/custom_data_and_model/README.md +++ b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/custom_data_and_model/README.md @@ -1,7 +1,21 @@ -## Training Script -At the client side, the client ID (a.k.a rank) starts from 1. -Please also modify config/fedml_config.yaml, changing the `worker_num` the as the number of clients you plan to run. +## Prerequisites +At the client side, the client ID (a.k.a rank) starts from 1. +Please also modify `config/fedml_config.yaml` as you see fit, changing `worker_num` to the number of clients you plan to run. +The default ip of every grpc server is set to `0.0.0.0`, and all grpc ports start from 8890 and increase based on the client's rank. + +> **_NOTE:_** +> The `config/grpc_ipconfig.csv` file contains only one record referring to the grpc server of +> the aggregator (rank: 0).
This record is mandatory. However, you can change the values of the `ip` and `port` +> attributes as you see fit, and add more records for the grpc servers of the rest of the clients. For instance: +``` +eid,rank,grpc_server_ip,grpc_server_port +0,0,0.0.0.0,8890 +1,1,0.0.0.0,8891 +2,2,0.0.0.0,8892 +``` + +## Start Script At the server side, run the following script: ``` @@ -18,7 +32,7 @@ bash run_client.sh 2 your_run_id ``` Note: please run the server first. -## A Better User-experience with FedML MLOps (open.fedml.ai) +## A Better User-experience with FedML FLOps (fedml.ai) To reduce the difficulty and complexity of these CLI commands. We recommend you to use our MLOps (open.fedml.ai). FedML MLOps provides: - Install Client Agent and Login diff --git a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/custom_data_and_model/config/grpc_ipconfig.csv b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/custom_data_and_model/config/grpc_ipconfig.csv index f97ee8f910..9729b81833 100644 --- a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/custom_data_and_model/config/grpc_ipconfig.csv +++ b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/custom_data_and_model/config/grpc_ipconfig.csv @@ -1,4 +1,2 @@ -receiver_id,ip -0,127.0.0.1 -1,127.0.0.1 -2,127.0.0.1 \ No newline at end of file +eid,rank,grpc_server_ip,grpc_server_port +0,0,0.0.0.0,8890 diff --git a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/README.md b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/README.md new file mode 100644 index 0000000000..8c56622d06 --- /dev/null +++ b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/README.md @@ -0,0 +1,51 @@ + +# Introduction +In this working example, we will run 1 aggregation server and 2 clients on the same machine using Docker + gRPC and we will use the TensorOpera.ai platform to run the FL job. + +# gRPC Configuration File +The content of the gRPC configuration file is as follows: +``` +eid,rank,grpc_server_ip,grpc_server_port,ingress_ip +0,0,0.0.0.0,8890,fedml_server +1,1,0.0.0.0,8899,fedml_client_1 +2,2,0.0.0.0,8898,fedml_client_2 +``` +The ingress_ip variable refers to the name of the container that we assign to either the server or the client, as we discuss in detail below: + + +# Docker Configuration +Before creating any docker container on our machine, we need to pull the latest fedml image (e.g., `fedml:v090`) and ensure that all spawned containers can communicate with each other through a network bridge (e.g., `fedml_grpc_network`).
+Specifically, what you need to do is: +```bash +docker pull fedml:v090 +docker network create fedml_grpc_network +``` + +Once these two steps are configured, we can start 1 aggregation server and 2 clients (without using a GPU) and register them with the fedml platform as follows: + +```bash +# Server +docker run -it -p 8890:8890 --entrypoint /bin/bash --name fedml_server --network fedml_grpc_network fedml:dev090 +redis-server --daemonize yes +source /fedml/bin/activate +fedml login -s +``` + +```bash +# Client 1 +docker run -it -p 8891:8891 --entrypoint /bin/bash --name fedml_client_1 --network fedml_grpc_network fedml:dev090 +redis-server --daemonize yes +source /fedml/bin/activate +fedml login -c +``` + +```bash +# Client-2 +docker run -it -p 8892:8892 --entrypoint /bin/bash --name fedml_client_2 --network fedml_grpc_network fedml:dev090 +redis-server --daemonize yes +source /fedml/bin/activate +fedml login -c +``` + +Then we only need to compile our job and submit it to our docker-based cluster, as discussed in detail in the official TensorOpera documentation: https://tensoropera.ai/octopus/userGuides + diff --git a/python/examples/deploy/quick_start/__init__.py b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/__init__.py similarity index 100% rename from python/examples/deploy/quick_start/__init__.py rename to python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/__init__.py diff --git a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/config/bootstrap.bat b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/config/bootstrap.bat new file mode 100755 index 0000000000..fb0dd54d6d --- /dev/null +++ b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/config/bootstrap.bat @@ -0,0 +1,12 @@ +:: ### don't modify this part ### +:: ############################## + + +:: ### please customize your script in this region #### +set DATA_PATH=%userprofile%\fedml_data +if exist %DATA_PATH% (echo Exist %DATA_PATH%) else mkdir %DATA_PATH% + + +:: ### don't modify this part ### +echo [FedML]Bootstrap Finished +:: ############################## \ No newline at end of file diff --git a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/config/bootstrap.sh b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/config/bootstrap.sh new file mode 100644 index 0000000000..3d969974b0 --- /dev/null +++ b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/config/bootstrap.sh @@ -0,0 +1,7 @@ + +# pip install fedml==0.7.15 +#pip install --upgrade fedml + +### don't modify this part ### +echo "[FedML]Bootstrap Finished" +############################## diff --git a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/config/fedml_config.yaml b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/config/fedml_config.yaml new file mode 100644 index 0000000000..d7183b6ada --- /dev/null +++ b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/config/fedml_config.yaml @@ -0,0 +1,52 @@ +common_args: + training_type: "cross_silo" + scenario: "horizontal" + using_mlops: false + random_seed: 0 + +environment_args: + bootstrap: config/bootstrap.sh + +data_args: + dataset: "mnist" + data_cache_dir: "../../../../data/mnist" + partition_method:
"hetero" + partition_alpha: 0.5 + +model_args: + model: "lr" + model_file_cache_folder: "./model_file_cache" # will be filled by the server automatically + global_model_file_path: "./model_file_cache/global_model.pt" + +train_args: + federated_optimizer: "FedAvg" + client_id_list: + client_num_in_total: 1000 + client_num_per_round: 2 + comm_round: 50 + epochs: 1 + batch_size: 10 + client_optimizer: sgd + learning_rate: 0.03 + weight_decay: 0.001 + +validation_args: + frequency_of_the_test: 5 + +device_args: + worker_num: 2 + using_gpu: false + gpu_mapping_file: config/gpu_mapping.yaml + gpu_mapping_key: mapping_default + +comm_args: + backend: "GRPC" + grpc_ipconfig_path: config/grpc_ipconfig.csv + +tracking_args: + # When running on MLOps platform(open.fedml.ai), the default log path is at ~/.fedml/fedml-client/fedml/logs/ and ~/.fedml/fedml-server/fedml/logs/ + local_log_output_path: ./log + enable_wandb: false + wandb_key: ee0b5f53d949c84cee7decbe7a619e63fb1f8408 + wandb_project: fedml + wandb_name: fedml_torch_fedavg_mnist_lr \ No newline at end of file diff --git a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/config/grpc_ipconfig.csv b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/config/grpc_ipconfig.csv new file mode 100644 index 0000000000..8f461936dd --- /dev/null +++ b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/config/grpc_ipconfig.csv @@ -0,0 +1,3 @@ +eid,rank,grpc_server_ip,grpc_server_port,ingress_ip +0,0,0.0.0.0,8890,fedml_server +1,1,0.0.0.0,8891,fedml_client_1 \ No newline at end of file diff --git a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/run_client.sh b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/run_client.sh new file mode 100644 index 0000000000..18d3cea9fe --- /dev/null +++ b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/run_client.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash +RANK=$1 +python3 torch_client.py --cf config/fedml_config.yaml --rank $RANK --role client \ No newline at end of file diff --git a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/run_server.sh b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/run_server.sh new file mode 100644 index 0000000000..08007b7e81 --- /dev/null +++ b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/run_server.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +python3 torch_server.py --cf config/fedml_config.yaml --rank 0 --role server \ No newline at end of file diff --git a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/torch_client.py b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/torch_client.py new file mode 100644 index 0000000000..9085c85ebe --- /dev/null +++ b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/torch_client.py @@ -0,0 +1,18 @@ +import fedml +from fedml import FedMLRunner + +if __name__ == "__main__": + args = fedml.init() + + # init device + device = fedml.device.get_device(args) + + # load data + dataset, output_dim = fedml.data.load(args) + + # load model + model = fedml.model.create(args, output_dim) + + # start training + fedml_runner = FedMLRunner(args, device, dataset, model) + fedml_runner.run() diff --git 
a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/torch_server.py b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/torch_server.py new file mode 100644 index 0000000000..9085c85ebe --- /dev/null +++ b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/grpc_docker_fedmlai/torch_server.py @@ -0,0 +1,18 @@ +import fedml +from fedml import FedMLRunner + +if __name__ == "__main__": + args = fedml.init() + + # init device + device = fedml.device.get_device(args) + + # load data + dataset, output_dim = fedml.data.load(args) + + # load model + model = fedml.model.create(args, output_dim) + + # start training + fedml_runner = FedMLRunner(args, device, dataset, model) + fedml_runner.run() diff --git a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/one_line/README.md b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/one_line/README.md index d125847dd6..22e628e502 100644 --- a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/one_line/README.md +++ b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/one_line/README.md @@ -1,7 +1,21 @@ -## Training Script -At the client side, the client ID (a.k.a rank) starts from 1. -Please also modify config/fedml_config.yaml, changing the `worker_num` the as the number of clients you plan to run. +## Prerequisites +At the client side, the client ID (a.k.a rank) starts from 1. +Please also modify `config/fedml_config.yaml` as you see fit, changing `worker_num` to the number of clients you plan to run. +The default ip of every grpc server is set to `0.0.0.0`, and all grpc ports start from 8890 and increase based on the client's rank. + +> **_NOTE:_** +> The `config/grpc_ipconfig.csv` file contains only one record referring to the grpc server of +> the aggregator (rank: 0). This record is mandatory. However, you can change the values of the `ip` and `port` +> attributes as you see fit, and add more records for the grpc servers of the rest of the clients. For instance: +``` +eid,rank,grpc_server_ip,grpc_server_port +0,0,0.0.0.0,8890 +1,1,0.0.0.0,8891 +2,2,0.0.0.0,8892 +``` + +## Start Script At the server side, run the following script: ``` @@ -18,7 +32,7 @@ bash run_client.sh 2 your_run_id ``` Note: please run the server first. -## A Better User-experience with FedML MLOps (open.fedml.ai) +## A Better User-experience with FedML FLOps (fedml.ai) To reduce the difficulty and complexity of these CLI commands. We recommend you to use our MLOps (open.fedml.ai).
FedML MLOps provides: - Install Client Agent and Login diff --git a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/one_line/config/grpc_ipconfig.csv b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/one_line/config/grpc_ipconfig.csv index f97ee8f910..8d082a9613 100644 --- a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/one_line/config/grpc_ipconfig.csv +++ b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/one_line/config/grpc_ipconfig.csv @@ -1,4 +1,2 @@ -receiver_id,ip -0,127.0.0.1 -1,127.0.0.1 -2,127.0.0.1 \ No newline at end of file +eid,rank,grpc_server_ip,grpc_server_port +0,0,0.0.0.0,8890 \ No newline at end of file diff --git a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/step_by_step/README.md b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/step_by_step/README.md index d125847dd6..22e628e502 100644 --- a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/step_by_step/README.md +++ b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/step_by_step/README.md @@ -1,7 +1,21 @@ -## Training Script -At the client side, the client ID (a.k.a rank) starts from 1. -Please also modify config/fedml_config.yaml, changing the `worker_num` the as the number of clients you plan to run. +## Prerequisites +At the client side, the client ID (a.k.a rank) starts from 1. +Please also modify `config/fedml_config.yaml` as you see fit, changing `worker_num` to the number of clients you plan to run. +The default ip of every grpc server is set to `0.0.0.0`, and all grpc ports start from 8890 and increase based on the client's rank. + +> **_NOTE:_** +> The `config/grpc_ipconfig.csv` file contains only one record referring to the grpc server of +> the aggregator (rank: 0). This record is mandatory. However, you can change the values of the `ip` and `port` +> attributes as you see fit, and add more records for the grpc servers of the rest of the clients. For instance: +``` +eid,rank,grpc_server_ip,grpc_server_port +0,0,0.0.0.0,8890 +1,1,0.0.0.0,8891 +2,2,0.0.0.0,8892 +``` + +## Start Script At the server side, run the following script: ``` @@ -18,7 +32,7 @@ bash run_client.sh 2 your_run_id ``` Note: please run the server first. -## A Better User-experience with FedML MLOps (open.fedml.ai) +## A Better User-experience with FedML FLOps (fedml.ai) To reduce the difficulty and complexity of these CLI commands. We recommend you to use our MLOps (open.fedml.ai).
FedML MLOps provides: - Install Client Agent and Login diff --git a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/step_by_step/config/grpc_ipconfig.csv b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/step_by_step/config/grpc_ipconfig.csv index f97ee8f910..8d082a9613 100644 --- a/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/step_by_step/config/grpc_ipconfig.csv +++ b/python/examples/federate/cross_silo/grpc_fedavg_mnist_lr_example/step_by_step/config/grpc_ipconfig.csv @@ -1,4 +1,2 @@ -receiver_id,ip -0,127.0.0.1 -1,127.0.0.1 -2,127.0.0.1 \ No newline at end of file +eid,rank,grpc_server_ip,grpc_server_port +0,0,0.0.0.0,8890 \ No newline at end of file diff --git a/python/examples/federate/cross_silo/trpc_fedavg_mnist_lr_example/one_line/config/trpc_master_config.csv b/python/examples/federate/cross_silo/trpc_fedavg_mnist_lr_example/one_line/config/trpc_master_config.csv index dbfb0c6a4b..9708cd18e6 100644 --- a/python/examples/federate/cross_silo/trpc_fedavg_mnist_lr_example/one_line/config/trpc_master_config.csv +++ b/python/examples/federate/cross_silo/trpc_fedavg_mnist_lr_example/one_line/config/trpc_master_config.csv @@ -1,2 +1,2 @@ master_ip, master_port -127.0.0.1,29600 +0.0.0.0,29600 diff --git a/python/examples/launch/README.md b/python/examples/launch/README.md index 1ded267276..fc79cbfe26 100644 --- a/python/examples/launch/README.md +++ b/python/examples/launch/README.md @@ -132,7 +132,7 @@ You just need to customize the following config items. 3. `bootstrap`, It is the bootstrap shell command which will be executed before running entry commands. -Then you can use the following example CLI to launch the job at FedML® Nexus AI Platform +Then you can use the following example CLI to launch the job at TensorOpera® Nexus AI Platform (Replace $YourApiKey with your own account API key from open.fedml.ai) Example: @@ -142,7 +142,7 @@ fedml launch hello_job.yaml After the launch CLI is executed, the output is as follows. Here you may open the job url to confirm and actually start the job. ``` -Submitting your job to FedML® Nexus AI Platform: 100%|████████████████████████████████████████████████████████████████████████████████████████| 6.07k/6.07k [00:01<00:00, 4.94kB/s] +Submitting your job to TensorOpera® Nexus AI Platform: 100%|████████████████████████████████████████████████████████████████████████████████████████| 6.07k/6.07k [00:01<00:00, 4.94kB/s] Searched and matched the following GPU resource for your job: +-----------+-------------------+---------+------------+-------------------------+---------+-------+----------+ diff --git a/python/examples/launch/dump.rdb b/python/examples/launch/dump.rdb new file mode 100644 index 0000000000..b805be10a8 Binary files /dev/null and b/python/examples/launch/dump.rdb differ diff --git a/python/examples/launch/federate_build_package/README.md b/python/examples/launch/federate_build_package/README.md index c0d3356150..325258407e 100644 --- a/python/examples/launch/federate_build_package/README.md +++ b/python/examples/launch/federate_build_package/README.md @@ -3,7 +3,7 @@ ``` Usage: fedml federate build [OPTIONS] [YAML_FILE] - Build federate packages for the FedML® Nexus AI Platform. + Build federate packages for the TensorOpera® Nexus AI Platform. Options: -h, --help Show this message and exit. 
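As a quick illustration of the two CLI entry points documented in the build and launch READMEs above, here is a hedged sketch; `your_job.yaml` is a hypothetical placeholder, and the two commands are independent examples rather than a required sequence.
```bash
# Independent examples of the CLI usages documented above; "your_job.yaml" is a placeholder.
# Build federate packages from a job YAML ("fedml federate build [OPTIONS] [YAML_FILE]"):
fedml federate build your_job.yaml

# Launch a job on the TensorOpera® Nexus AI Platform, as in the launch README:
fedml launch hello_job.yaml
```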
diff --git a/python/examples/launch/hello_job_with_container.yaml b/python/examples/launch/hello_job_with_container.yaml index 2c520beb24..26202a3d98 100755 --- a/python/examples/launch/hello_job_with_container.yaml +++ b/python/examples/launch/hello_job_with_container.yaml @@ -43,7 +43,7 @@ job_type: train # options: train, deploy, federate job_subtype: generate_training docker: - image: fedml/fedml-default-launch:cu12.1-u22.04 + image: fedml/fedml-launch-job:cu12.1-u22.04 #registry: docker.io #username: my_hub_user #password: my_hub_password diff --git a/python/examples/launch/serve_mnist/fedml_model_config.yaml b/python/examples/launch/serve_mnist/fedml_model_config.yaml index 48254ccca4..f212dbb81d 100644 --- a/python/examples/launch/serve_mnist/fedml_model_config.yaml +++ b/python/examples/launch/serve_mnist/fedml_model_config.yaml @@ -1,6 +1,8 @@ workspace: "./" entry_point: "mnist_serve_main.py" +auto_detect_public_ip: true + data_cache_dir: "" bootstrap: "" diff --git a/python/examples/launch/train_build_package/README.md b/python/examples/launch/train_build_package/README.md index 03c8dbe71b..f0f1dff857 100644 --- a/python/examples/launch/train_build_package/README.md +++ b/python/examples/launch/train_build_package/README.md @@ -3,7 +3,7 @@ ``` Usage: fedml train build [OPTIONS] [YAML_FILE] - Build training packages for the FedML® Nexus AI Platform. + Build training packages for the TensorOpera® Nexus AI Platform. Options: -h, --help Show this message and exit. diff --git a/python/examples/train/README.md b/python/examples/train/README.md index 9a6853d740..0e301c86b2 100644 --- a/python/examples/train/README.md +++ b/python/examples/train/README.md @@ -1 +1 @@ -# Examples (Prebuilt Jobs) for FEDML®Train \ No newline at end of file +# Examples (Prebuilt Jobs) for TensorOpera®Train \ No newline at end of file diff --git a/python/examples/train/llm_train/job.yaml b/python/examples/train/llm_train/job.yaml index d1ba08ed4c..a9e81c91f7 100644 --- a/python/examples/train/llm_train/job.yaml +++ b/python/examples/train/llm_train/job.yaml @@ -44,4 +44,4 @@ computing: allow_cross_cloud_resources: false # true, false device_type: GPU # options: GPU, CPU, hybrid - resource_type: A100-80G # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://fedml.ai/accelerator_resource_type + resource_type: A100-80G # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://tensoropera.ai/accelerator_resource_type diff --git a/python/fedml/__init__.py b/python/fedml/__init__.py index 92b72357a0..677d06b4e5 100644 --- a/python/fedml/__init__.py +++ b/python/fedml/__init__.py @@ -1,5 +1,5 @@ import logging -from copy import deepcopy +import platform import multiprocess as multiprocessing import os @@ -9,7 +9,10 @@ import torch import fedml +import dotenv + from .computing.scheduler.env.collect_env import collect_env +from fedml.computing.scheduler.env import set_env_kv, load_env from .constants import ( FEDML_BACKEND_SERVICE_URL_DEV, FEDML_BACKEND_SERVICE_URL_LOCAL, @@ -34,7 +37,7 @@ _global_training_type = None _global_comm_backend = None -__version__ = "0.8.30" +__version__ = "0.9.2" # This is the deployment environment used for different roles (RD/PM/BD/Public Developers). 
Potential VALUE: local, dev, test, release @@ -90,9 +93,7 @@ def init(args=None, check_env=True, should_init_logs=True): # Windows/Linux/MacOS compatability issues on multi-processing # https://github.com/pytorch/pytorch/issues/3492 """ - if multiprocessing.get_start_method() != "spawn": - # force all platforms (Windows/Linux/MacOS) to use the same way (spawn) for multiprocessing - multiprocessing.set_start_method("spawn", force=True) + _init_multiprocessing() """ # https://stackoverflow.com/questions/53014306/error-15-initializing-libiomp5-dylib-but-found-libiomp5-dylib-already-initial @@ -444,12 +445,33 @@ def _run_distributed(): pass +def _init_multiprocessing(): + """ + # Windows/Linux/MacOS compatability issues on multi-processing + # https://github.com/pytorch/pytorch/issues/3492 + """ + if platform.system() == "Windows": + if multiprocessing.get_start_method() != "spawn": + # force all platforms (Windows/Linux/macOS) to use the same way (spawn) for multiprocessing + multiprocessing.set_start_method("spawn", force=True) + else: + if multiprocessing.get_start_method() != "fork": + # force all platforms (Windows/Linux/macOS) to use the same way (fork) for multiprocessing + multiprocessing.set_start_method("fork", force=True) + + def set_env_version(version): - os.environ['FEDML_ENV_VERSION'] = version + set_env_kv("FEDML_ENV_VERSION", version) + load_env() def get_env_version(): - return "release" if os.environ.get('FEDML_ENV_VERSION') is None else os.environ['FEDML_ENV_VERSION'] + load_env() + version = os.getenv("FEDML_ENV_VERSION") + if version is None: + version = "release" + set_env_version(version) + return version def _get_backend_service(): @@ -489,7 +511,8 @@ def _get_mqtt_service(): def set_local_on_premise_platform_host(local_on_premise_platform_host): - os.environ['FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST'] = local_on_premise_platform_host + # Should Also update the .env file + set_env_kv("FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST", local_on_premise_platform_host) def get_local_on_premise_platform_host(): @@ -497,7 +520,7 @@ def get_local_on_premise_platform_host(): def set_local_on_premise_platform_port(local_on_premise_platform_port): - os.environ['FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_PORT'] = str(local_on_premise_platform_port) + set_env_kv("FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_PORT", str(local_on_premise_platform_port)) def get_local_on_premise_platform_port(): @@ -506,7 +529,7 @@ def get_local_on_premise_platform_port(): def _get_local_s3_like_service_url(): return FEDML_S3_DOMAIN_LOCAL - + from fedml import device from fedml import data diff --git a/python/fedml/api/__init__.py b/python/fedml/api/__init__.py index 4e004f07d3..6c82c9b9b3 100755 --- a/python/fedml/api/__init__.py +++ b/python/fedml/api/__init__.py @@ -19,11 +19,13 @@ from fedml.api.fedml_response import FedMLResponse from fedml.api.modules import launch, utils, build, device, logs, diagnosis, cluster, run, train, federate, storage, \ model as model_module # Since "model" has conflict with one of the input parameters, we need to rename it -from fedml.api.modules.launch import FeatureEntryPoint from fedml.api.modules.storage import StorageMetadata +from fedml.computing.scheduler.scheduler_core.general_constants import MarketplaceType from fedml.computing.scheduler.scheduler_entry.cluster_manager import FedMLClusterModelList from fedml.computing.scheduler.scheduler_entry.run_manager import FedMLRunStartedModel, FedMLGpuDevices, \ FedMLRunModelList, FeatureEntryPoint +from 
fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants +from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants def fedml_login(api_key: str = None): @@ -179,13 +181,12 @@ def cluster_killall(api_key=None) -> bool: return cluster.kill(cluster_names=(), api_key=api_key) -def upload(data_path, api_key=None, service="R2", name=None, description=None, metadata=None, show_progress=False, +def upload(data_path, api_key=None, tag_list=[], service="R2", name=None, description=None, metadata=None, show_progress=False, out_progress_to_err=True, progress_desc=None) -> FedMLResponse: - return storage.upload(data_path=data_path, api_key=api_key, name=name, description=description, + return storage.upload(data_path=data_path, api_key=api_key, name=name, description=description, tag_list =tag_list, service=service, progress_desc=progress_desc, show_progress=show_progress, out_progress_to_err=out_progress_to_err, metadata=metadata) - def get_storage_user_defined_metadata(data_name, api_key=None) -> FedMLResponse: return storage.get_user_metadata(data_name=data_name, api_key=api_key) @@ -210,16 +211,25 @@ def fedml_build(platform, type, source_folder, entry_point, config_folder, dest_ return build.build(platform, type, source_folder, entry_point, config_folder, dest_folder, ignore) -def login(api_key, computing, server, supplier): - device_bind(api_key, computing, server, supplier) +def login(api_key, computing, server, supplier, + master_inference_gateway_port: int = ServerConstants.MODEL_INFERENCE_DEFAULT_PORT, + worker_inference_proxy_port: int = ClientConstants.LOCAL_CLIENT_API_PORT, + worker_connection_type: str = ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT, + marketplace_type: str = MarketplaceType.SECURE.name, price_per_hour: float = 0.0, name=""): + device_bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port, + worker_connection_type, marketplace_type, price_per_hour, name) def logout(computing, server): device_unbind(computing, server) -def device_bind(api_key, computing, server, supplier): - device.bind(api_key, computing, server, supplier) +def device_bind(api_key, computing, server, supplier, master_inference_gateway_port, worker_inference_proxy_port, + worker_connection_type, marketplace_type, price_per_hour, name): + device.bind(api_key=api_key, computing=computing, server=server, supplier=supplier, + master_inference_gateway_port=master_inference_gateway_port, + worker_inference_proxy_port=worker_inference_proxy_port, worker_connection_type=worker_connection_type, + marketplace_type=marketplace_type, price_per_hour=price_per_hour, name=name) def device_unbind(computing, server): diff --git a/python/fedml/api/api_test.py b/python/fedml/api/api_test.py index 54da088d0d..5a01a76448 100755 --- a/python/fedml/api/api_test.py +++ b/python/fedml/api/api_test.py @@ -4,9 +4,9 @@ import fedml # Login -fedml.set_env_version("local") +fedml.set_env_version("test") fedml.set_local_on_premise_platform_port(18080) -error_code, error_msg = fedml.api.fedml_login(api_key="1316b93c82da40ce90113a2ed12f0b14") +error_code, error_msg = fedml.api.fedml_login(api_key="") if error_code != 0: print("API Key is invalid!") exit(1) @@ -18,20 +18,23 @@ yaml_file = os.path.join(python_dir, "examples", "launch", "hello_job.yaml") # Launch job +launch_result_list = list() for i in range(0, 10): launch_result = fedml.api.launch_job(yaml_file) + launch_result_list.append(launch_result) # launch_result = 
fedml.api.launch_job_on_cluster(yaml_file, "alex-cluster") if launch_result.result_code != 0: print(f"Failed to launch job. Reason: {launch_result.result_message}") -exit(1) - # Get job status -log_result = fedml.api.run_logs(launch_result.run_id, 1, 100) -if log_result is None or log_result.run_status is None: - print(f"Failed to get job status.") - exit(1) -print(f"Run status {log_result.run_status}") +while len(launch_result_list) > 0: + for launch_result in launch_result_list: + log_result = fedml.api.run_logs(launch_result.run_id, 1, 5) + if log_result is None or log_result.run_status is None: + print(f"Failed to get job status.") + #exit(1) + print(f"Run {launch_result.run_id}, status {log_result.run_status}") + time.sleep(0.5) # Get job logs time.sleep(30) diff --git a/python/fedml/api/constants.py b/python/fedml/api/constants.py index e804775f74..313da61798 100755 --- a/python/fedml/api/constants.py +++ b/python/fedml/api/constants.py @@ -18,7 +18,7 @@ class ApiConstants: RESOURCE_MATCHED_STATUS_BIND_CREDIT_CARD_FIRST = \ """ - Before we can start a job, please add a credit card to your FEDML account at https://fedml.ai/billing/home. + Before we can start a job, please add a credit card to your FEDML account at https://tensoropera.ai/billing. Once it's added, please try to run the launch command again """ @@ -106,4 +106,3 @@ def get_run_enum_from_str(cls, run_status_str: str): if run_status.value == run_status_str: return run_status return cls.UNDETERMINED - diff --git a/python/fedml/api/modules/build.py b/python/fedml/api/modules/build.py index 7d23bc02ed..9299944bb0 100644 --- a/python/fedml/api/modules/build.py +++ b/python/fedml/api/modules/build.py @@ -22,7 +22,7 @@ def build(platform, type, source_folder, entry_point, config_folder, dest_folder if type == "client" or type == "server": click.echo( - "Now, you are building the fedml packages which will be used in the FedML® Nexus AI Platform " + "Now, you are building the fedml packages which will be used in the TensorOpera® Nexus AI Platform " "platform." ) click.echo( @@ -34,7 +34,7 @@ def build(platform, type, source_folder, entry_point, config_folder, dest_folder + "." ) click.echo( - "Then you may upload the packages on the configuration page in the FedML® Nexus AI Platform to " + "Then you may upload the packages on the configuration page in the TensorOpera® Nexus AI Platform to " "start your training flow." 
) click.echo("Building...") diff --git a/python/fedml/api/modules/device.py b/python/fedml/api/modules/device.py index 84aa42e7b2..4ca5695523 100644 --- a/python/fedml/api/modules/device.py +++ b/python/fedml/api/modules/device.py @@ -7,27 +7,27 @@ import fedml from fedml.api.modules.constants import ModuleConstants from fedml.computing.scheduler.comm_utils import sys_utils +from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants from fedml.computing.scheduler.comm_utils.run_process_utils import RunProcessUtils from fedml.computing.scheduler.master.server_constants import ServerConstants +from fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants as DeviceServerConstants from fedml.computing.scheduler.master.server_login import logout as server_logout from fedml.computing.scheduler.slave.client_constants import ClientConstants +from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants as DeviceClientConstants from fedml.computing.scheduler.slave.client_login import logout as client_logout from fedml.computing.scheduler.scheduler_entry.resource_manager import FedMLResourceManager def bind( - api_key, computing, server, supplier -): + api_key, computing, server, supplier, marketplace_type, price_per_hour, name, + master_inference_gateway_port=DeviceServerConstants.MODEL_INFERENCE_DEFAULT_PORT, + worker_inference_proxy_port=DeviceClientConstants.LOCAL_CLIENT_API_PORT, + worker_connection_type=DeviceClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT): userid = api_key runner_cmd = "{}" device_id = "0" os_name = "" docker = None - docker_rank = 1 - infer_host = "127.0.0.1" - redis_addr = "local" - redis_port = "6379" - redis_password = "fedml_default" role = "" is_client = computing is_server = server @@ -47,25 +47,26 @@ def bind( _bind( userid, computing, server, api_key, role, runner_cmd, device_id, os_name, - docker, docker_rank, infer_host, - redis_addr, redis_port, redis_password - ) + docker, master_inference_gateway_port, worker_inference_proxy_port, worker_connection_type, marketplace_type, + price_per_hour, name) def _bind( - userid, computing, server, - api_key, role, runner_cmd, device_id, os_name, - docker, docker_rank, infer_host, - redis_addr, redis_port, redis_password -): + userid, computing, server, api_key, role, runner_cmd, device_id, os_name, docker, master_inference_gateway_port, + worker_inference_proxy_port, worker_connection_type, marketplace_type, price_per_hour, name): + fedml.load_env() if os.getenv(ModuleConstants.ENV_FEDML_INFER_HOST) is None: - os.environ[ModuleConstants.ENV_FEDML_INFER_HOST] = infer_host + fedml.set_env_kv(ModuleConstants.ENV_FEDML_INFER_HOST, SchedulerConstants.REDIS_INFER_HOST) if os.getenv(ModuleConstants.ENV_FEDML_INFER_REDIS_ADDR) is None: - os.environ[ModuleConstants.ENV_FEDML_INFER_REDIS_ADDR] = redis_addr + fedml.set_env_kv(ModuleConstants.ENV_FEDML_INFER_REDIS_ADDR, SchedulerConstants.REDIS_ADDR) if os.getenv(ModuleConstants.ENV_FEDML_INFER_REDIS_PORT) is None: - os.environ[ModuleConstants.ENV_FEDML_INFER_REDIS_PORT] = redis_port + fedml.set_env_kv(ModuleConstants.ENV_FEDML_INFER_REDIS_PORT, SchedulerConstants.REDIS_PORT) if os.getenv(ModuleConstants.ENV_FEDML_INFER_REDIS_PASSWORD) is None: - os.environ[ModuleConstants.ENV_FEDML_INFER_REDIS_PASSWORD] = redis_password + fedml.set_env_kv(ModuleConstants.ENV_FEDML_INFER_REDIS_PASSWORD, SchedulerConstants.REDIS_PASSWORD) + + fedml.set_env_kv(DeviceServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, 
str(master_inference_gateway_port)) + fedml.set_env_kv(DeviceClientConstants.ENV_CLIENT_PROXY_PORT_KEY, str(worker_inference_proxy_port)) + fedml.set_env_kv(DeviceClientConstants.ENV_CONNECTION_TYPE_KEY, worker_connection_type) url = fedml._get_backend_service() platform_name = platform.system() @@ -77,7 +78,7 @@ def _bind( else: docker_install_url = "https://docs.docker.com/engine/install/" docker_config_text = " Moreover, you need to config the docker engine to run as a non-root user. Here is the docs. https://docs.docker.com/engine/install/linux-postinstall/" - print("\n Welcome to FedML.ai! \n Start to login the current device to the FedML® Nexus AI Platform\n") + print("\n Welcome to FedML.ai! \n Start to login the current device to the TensorOpera® Nexus AI Platform\n") print(" If you want to deploy models into this computer, you need to install the docker engine to serve your models.") print(f" Here is the docs for installation docker engine. {docker_install_url}") if docker_config_text is not None: @@ -136,7 +137,7 @@ def _bind( client_daemon_cmd = "client_daemon.py" client_daemon_pids = RunProcessUtils.get_pid_from_cmd_line(client_daemon_cmd) if client_daemon_pids is not None and len(client_daemon_pids) > 0: - print("Your computer has been logged into the FedML® Nexus AI Platform. " + print("Your computer has been logged into the TensorOpera® Nexus AI Platform. " "Before logging in again, please log out of the previous login using the command " "'fedml logout -c'. If it still doesn't work, run the command 'fedml logout -c' " "using your computer's administrator account.") @@ -176,7 +177,13 @@ def _bind( "-k", user_api_key, "-ngc", - "1" + "1", + "-mpt", + marketplace_type, + "-pph", + str(price_per_hour), + "-n", + str(name) ] ).pid sys_utils.save_login_process(ClientConstants.LOCAL_HOME_RUNNER_DIR_NAME, @@ -186,7 +193,7 @@ def _bind( server_daemon_cmd = "server_daemon.py" server_daemon_pids = RunProcessUtils.get_pid_from_cmd_line(server_daemon_cmd) if server_daemon_pids is not None and len(server_daemon_pids) > 0: - print("Your computer has been logged into the FedML® Nexus AI Platform. " + print("Your computer has been logged into the TensorOpera® Nexus AI Platform. " "Before logging in again, please log out of the previous login using the command " "'fedml logout -s'. 
If it still doesn't work, run the command 'fedml logout -s' " "using your computer's administrator account.") diff --git a/python/fedml/api/modules/model.py b/python/fedml/api/modules/model.py index ca5d0b95c1..3b4a7afd0b 100644 --- a/python/fedml/api/modules/model.py +++ b/python/fedml/api/modules/model.py @@ -21,6 +21,9 @@ def create(name: str, model: str = None, model_config: str = None) -> bool: return True else: return False + elif model.startswith("tutorial:quick_start"): + # ../../../python/examples/deploy/quick_start + return False else: # TODO: Support arbitrary model creation from GitHub / Nexus AI Job Store click.echo("Model {} is not supported yet.".format(model)) @@ -249,9 +252,9 @@ def deploy(name: str, endpoint_name: str = "", endpoint_id: str = None, local: b return FedMLModelCards.get_instance().serve_model_on_premise( name, endpoint_name, master_ids, worker_ids, use_remote, endpoint_id) else: - # FedML® Launch deploy mode + # TensorOpera® Launch deploy mode click.echo("Warning: You did not indicate the master device id and worker device id\n\ - Do you want to use FedML® Nexus AI Platform to find GPU Resources deploy your model?") + Do you want to use TensorOpera® Nexus AI Platform to find GPU Resources deploy your model?") answer = click.prompt("Please input your answer: (y/n)") if answer == "y" or answer == "Y": api_key = get_api_key() diff --git a/python/fedml/api/modules/run.py b/python/fedml/api/modules/run.py index 120a964316..cf50ce24b4 100644 --- a/python/fedml/api/modules/run.py +++ b/python/fedml/api/modules/run.py @@ -51,7 +51,7 @@ def start(platform: str, create_run_result: FedMLRunStartedModel, device_server: run_start_result = FedMLRunManager.get_instance().start_run(platform=platform, create_run_result=create_run_result, device_server=device_server, device_edges=device_edges, - api_key=api_key, + api_key=get_api_key(), feature_entry_point=feature_entry_point) return run_start_result @@ -79,7 +79,7 @@ def status(run_name: Optional[str], run_id: str, platform: str, api_key: str) -> _authenticate_and_validate_platform(api_key, platform) run_status = None - run_list_obj = list_run(run_name=run_name, run_id=run_id, platform=platform, api_key=api_key) + run_list_obj = list_run(run_name=run_name, run_id=run_id, platform=platform, api_key=get_api_key()) if run_list_obj is not None: if len(run_list_obj.run_list) > 1: @@ -93,12 +93,13 @@ def status(run_name: Optional[str], run_id: str, platform: str, api_key: str) -> # input: run_id, page_num, page_size, need_all_logs, platform, api_key # return RunLogResult(run_status, total_log_lines, total_log_pages, log_line_list, run_logs) def logs(run_id: str, page_num: int, page_size: int, need_all_logs: bool, platform: str, api_key: str) -> RunLogResult: - _authenticate_and_validate_platform(api_key, platform) + api_key = authenticate(api_key) + validate_platform(platform) if run_id is None: raise Exception("Please specify run id.") - _, run_status = status(run_name=None, run_id=run_id, platform=platform, api_key=get_api_key()) + _, run_status = status(run_name=None, run_id=run_id, platform=platform, api_key=api_key) total_log_nums, total_log_pages, log_line_list, run_logs = 0, 0, list(), None @@ -110,7 +111,7 @@ def logs(run_id: str, page_num: int, page_size: int, need_all_logs: bool, platfo user_api_key=api_key) if run_logs is not None: - total_log_pages, total_log_nums = run_logs.total_num, run_logs.total_pages + total_log_pages, total_log_nums = run_logs.total_pages, run_logs.total_num _parse_logs(log_line_list, run_logs) 
return RunLogResult(run_status=run_status, total_log_lines=total_log_nums, total_log_pages=total_log_pages, diff --git a/python/fedml/api/modules/storage.py b/python/fedml/api/modules/storage.py index 1582788e3a..33e781be08 100644 --- a/python/fedml/api/modules/storage.py +++ b/python/fedml/api/modules/storage.py @@ -3,6 +3,12 @@ import shutil import requests +import math +from enum import Enum, unique + +import requests.exceptions +import tqdm +import sys from fedml.api.modules.utils import authenticate from fedml.core.distributed.communication.s3.remote_storage import S3Storage from fedml.core.mlops.mlops_configs import Configs, MLOpsConfigs @@ -18,12 +24,18 @@ def __init__(self, data: dict): self.createdAt = data.get("createTime", None) self.updatedAt = data.get("updateTime", None) self.size = _get_size(data.get("fileSize",None)) + self.tag_list = data.get("tags", None) + self.download_url = data.get("fileUrl", None) +class DataType(Enum): + FILE = "file" + DIRECTORY = "directory" + INVALID = "invalid" # Todo (alaydshah): Store service name in metadata # Todo (alaydshah): If data already exists, don't upload again. Instead suggest to use update command - -def upload(data_path, api_key, name, description, service, show_progress, out_progress_to_err, progress_desc, +# Todo (bhargav) : Discuss and remove the service variable. Maybe needed sometime later. +def upload(data_path, api_key, name, description, tag_list, service, show_progress, out_progress_to_err, progress_desc, metadata) -> FedMLResponse: api_key = authenticate(api_key) @@ -31,34 +43,52 @@ def upload(data_path, api_key, name, description, service, show_progress, out_pr if user_id is None: return FedMLResponse(code=ResponseCode.FAILURE, message=message) + + data_type = _get_data_type(data_path) - if(not _check_data_path(data_path)): + if(data_type == DataType.INVALID): return FedMLResponse(code=ResponseCode.FAILURE,message="Invalid data path") - archive_path, message = _archive_data(data_path) - if not archive_path: + if(data_type == DataType.DIRECTORY): + to_upload_path, message = _archive_data(data_path) + name = os.path.splitext(os.path.basename(to_upload_path))[0] if name is None else name + file_name = name + ".zip" + else: + to_upload_path = data_path + base_name = os.path.basename(to_upload_path) + file_extension = os.path.splitext(base_name)[1] + given_extension = None + if name is not None: + given_extension = os.path.splitext(name)[1] + if given_extension is None or given_extension == "": + name = name + file_extension + else: + name = base_name + + file_name = name + + if not to_upload_path: return FedMLResponse(code=ResponseCode.FAILURE, message=message) - store = _get_storage_service(service) - name = os.path.splitext(os.path.basename(archive_path))[0] if name is None else name - file_name = name + ".zip" + #TODO(bhargav191098) - Better done on the backend. Remove and pass file_name once completed on backend. 
     dest_path = os.path.join(user_id, file_name)
-    file_size = os.path.getsize(archive_path)
+    file_size = os.path.getsize(to_upload_path)
-    file_uploaded_url = store.upload_file_with_progress(src_local_path=archive_path, dest_s3_path=dest_path,
-                                                        show_progress=show_progress,
-                                                        out_progress_to_err=out_progress_to_err,
-                                                        progress_desc=progress_desc, metadata=metadata)
-    os.remove(archive_path)
+    file_uploaded_url, message = _upload_multipart(api_key, dest_path, to_upload_path, show_progress,
+                                                   out_progress_to_err,
+                                                   progress_desc, metadata)
+
+    if(data_type == DataType.DIRECTORY):
+        os.remove(to_upload_path)
     if not file_uploaded_url:
-        return FedMLResponse(code=ResponseCode.FAILURE, message=f"Failed to upload file: {archive_path}")
+        return FedMLResponse(code=ResponseCode.FAILURE, message=f"Failed to upload file: {to_upload_path}")
     json_data = {
         "datasetName": name,
         "description": description,
         "fileSize": file_size,
         "fileUrl": file_uploaded_url,
-        "tagNameList": [],
+        "tagNameList": tag_list,
     }
     try:
@@ -80,25 +110,49 @@ def download(data_name, api_key, service, dest_path, show_progress=True) -> FedM
     if user_id is None:
         return FedMLResponse(code=ResponseCode.FAILURE, message=message)
-    store = _get_storage_service(service)
-    zip_file_name = data_name + ".zip"
-    key = os.path.join(user_id, zip_file_name)
-    path_local = os.path.abspath(zip_file_name)
-    dest_path = os.path.abspath(dest_path) if dest_path else data_name
-    if store.download_file_with_progress(path_s3=key, path_local=path_local, show_progress=show_progress):
-        try:
-            shutil.unpack_archive(path_local, dest_path)
-            os.remove(path_local)
-            abs_dest_path = os.path.abspath(dest_path)
-            return FedMLResponse(code=ResponseCode.SUCCESS, message=f"Successfully downloaded and unzipped data at "
-                                                                    f"{abs_dest_path}", data=abs_dest_path)
-        except Exception as e:
-            error_message = f"Failed to unpack archive: {e}"
+    metadata_response = get_metadata(data_name, api_key)
+    if metadata_response.code == ResponseCode.SUCCESS:
+        metadata = metadata_response.data
+        if not metadata or not isinstance(metadata, StorageMetadata):
+            error_message = f"Unable to get the download URL"
             logging.error(error_message)
             return FedMLResponse(code=ResponseCode.FAILURE, message=error_message)
+        download_url = metadata.download_url
+        given_extension = os.path.splitext(data_name)[1]
+        is_file = True
+        if(given_extension is None or given_extension ==""):
+            is_file = False
+
+        if not is_file:
+            download_file_name = data_name + ".zip"
+        else:
+            download_file_name = data_name
+        path_local = os.path.abspath(download_file_name)
+        dest_path = os.path.abspath(dest_path) if dest_path else data_name
+        if _download_using_presigned_url(download_url, download_file_name, show_progress=show_progress):
+            try:
+                if not is_file:
+                    shutil.unpack_archive(path_local, dest_path)
+                    os.remove(path_local)
+                else:
+                    if not os.path.exists(dest_path):
+                        os.makedirs(dest_path)
+                    shutil.move(path_local,dest_path)
+                abs_dest_path = os.path.abspath(dest_path)
+                return FedMLResponse(code=ResponseCode.SUCCESS, message=f"Successfully downloaded and unzipped data at "
+                                                                        f"{abs_dest_path}", data=abs_dest_path)
+            except Exception as e:
+                error_message = f"Failed to unpack archive: {e}"
+                logging.error(error_message)
+                return FedMLResponse(code=ResponseCode.FAILURE, message=error_message)
+
+        else:
+            error_message = "Failed to download data from source"
+            logging.error(error_message)
+            return FedMLResponse(code=ResponseCode.FAILURE, message=error_message)
+
     else:
-        error_message = f"Failed to download data: {data_name}"
-
logging.error(error_message) + error_message = metadata_response.message return FedMLResponse(code=ResponseCode.FAILURE, message=error_message) @@ -195,6 +249,195 @@ def delete(data_name, service, api_key=None) -> FedMLResponse: logging.error(message, data_name, service) return FedMLResponse(code=ResponseCode.FAILURE, message=message, data=False) +def _get_num_chunks(file_size, max_chunk_size): + num_chunks = math.ceil(file_size / max_chunk_size) + return num_chunks + + +def get_chunks(file_path, chunk_size): + with open(file_path, 'rb') as file: + while True: + chunk = file.read(chunk_size) + if not chunk: + break + yield chunk + + +def _get_presigned_url(api_key, request_url, file_name, part_number=None): + cert_path = MLOpsConfigs.get_cert_path_with_version() + headers = ServerConstants.API_HEADERS + headers["Authorization"] = f"Bearer {api_key}" + params_dict = {'fileKey': file_name} + if part_number is not None: + params_dict['partNumber'] = part_number + if cert_path is None: + try: + requests.session().verify = cert_path + response = requests.get(request_url, verify=True, headers=headers, params=params_dict) + except requests.exceptions.SSLError as err: + MLOpsConfigs.install_root_ca_file() + response = requests.get(request_url, verify=True, headers=headers, params=params_dict) + else: + response = requests.get(request_url, verify=True, headers=headers, params=params_dict) + return response + + +def _upload_part(url,part_data,session): + response = session.put(url,data=part_data,verify=True) + return response + + +def _upload_chunk(presigned_url, chunk, part, pbar=None, max_retries=20,session=None): + for retry_attempt in range(max_retries): + try: + response = _upload_part(presigned_url,chunk,session) + except requests.exceptions.RequestException as e: + if retry_attempt < max_retries: + continue + else: + raise requests.exceptions.RequestException + + if(pbar is not None): + pbar.update(chunk.__sizeof__()) + return {'etag': response.headers['ETag'], 'partNumber': part} + raise requests.exceptions.RequestException + +def _process_post_response(response): + if response.status_code != 200: + message = (f"Failed to complete multipart upload with status code = {response.status_code}, " + f"response.content: {response.content}") + logging.error(message) + return None, message + else: + resp_data = response.json() + code = resp_data.get("code", None) + data_url = resp_data.get("data", None) + + if code is None or data_url is None or code == "FAILURE": + message = resp_data.get("message", None) + message = (f"Failed to complete multipart upload with following message: {message}, " + f"response.content: {response.content}") + return None, message + + return data_url, "Successfully uploaded the data! 
" + + +def _complete_multipart_upload(api_key, file_key, part_info, upload_id): + complete_multipart_url = ServerConstants.get_complete_multipart_upload_url() + body_dict = {"fileKey": file_key, 'partETags': part_info, 'uploadId': upload_id} + + cert_path = MLOpsConfigs.get_cert_path_with_version() + headers = ServerConstants.API_HEADERS + headers["Authorization"] = f"Bearer {api_key}" + if cert_path is None: + try: + requests.session().verify = cert_path + complete_multipart_response = requests.post(complete_multipart_url, json=body_dict, verify=True, + headers=headers) + except requests.exceptions.SSLError as err: + MLOpsConfigs.install_root_ca_file() + complete_multipart_response = requests.post(complete_multipart_url, json=body_dict, verify=True, + headers=headers) + else: + complete_multipart_response = requests.post(complete_multipart_url, json=body_dict, verify=True, + headers=headers) + + return _process_post_response(complete_multipart_response) + + +def _upload_multipart(api_key: str, file_key, archive_path, show_progress, out_progress_to_err, + progress_desc_text, metadata): + request_url = ServerConstants.get_presigned_multi_part_url() + + file_size = os.path.getsize(archive_path) + + max_chunk_size = 20 * 1024 * 1024 + + num_chunks = _get_num_chunks(file_size, max_chunk_size) + + upload_id = "" + presigned_urls = [] + + presigned_url_response = _get_presigned_url(api_key, request_url, file_key, num_chunks) + + if presigned_url_response.status_code != 200: + message = (f"Failed to get presigned URL with status code = {presigned_url_response.status_code}, " + f"response.content: {presigned_url_response.content}") + logging.error(message) + return None, message + else: + resp_data = presigned_url_response.json() + code = resp_data.get("code", None) + data = resp_data.get("data", None) + + if code is None or data is None or code == "FAILURE": + message = resp_data.get("message", None) + message = (f"Failed getting presigned URL with following message: {message}, " + f"response.content: {presigned_url_response.content}") + return None, message + + upload_id = data['uploadId'] + presigned_urls = data['urls'] + + parts = [] + chunks = get_chunks(archive_path, max_chunk_size) + part_info = [] + chunk_count = 0 + successful_chunks = 0 + #TODO: (bhargav191098) Using Thread pool and confirming openssl issue + atomic_session = requests.session() + atomic_session.verify = MLOpsConfigs.get_cert_path_with_version() + with tqdm.tqdm(total=file_size, unit="B", unit_scale=True, + file=sys.stderr if out_progress_to_err else sys.stdout, + desc=progress_desc_text, leave=False) as pbar: + for part, chunk in enumerate(chunks, start=1): + presigned_url = presigned_urls[part - 1] + chunk_count += 1 + if show_progress: + try: + part_data = _upload_chunk(presigned_url=presigned_url, chunk=chunk, part=part, + pbar=pbar,session=atomic_session) + part_info.append(part_data) + successful_chunks += 1 + except Exception as e: + return None, "unsuccessful" + + else: + try: + part_data = _upload_chunk(presigned_url=presigned_url, chunk=chunk, part=part, + pbar=pbar,session=atomic_session) + part_info.append(part_data) + successful_chunks += 1 + except Exception as e: + return None, "unsuccessful" + + if successful_chunks == chunk_count: + return _complete_multipart_upload(api_key, file_key, part_info, upload_id) + else: + return None, "Unsuccessful!" 
+
+
+def _download_using_presigned_url(url, fname, chunk_size=1024 * 1024, show_progress=True):
+    download_response = requests.get(url, verify=True, stream=True)
+    if download_response.status_code == 200:
+        total = int(download_response.headers.get('content-length', 0))
+        if show_progress:
+            with open(fname, 'wb') as file, tqdm.tqdm(
+                    desc=fname,
+                    total=total,
+                    unit='B',
+                    unit_scale=True,
+                    unit_divisor=1024,
+            ) as bar:
+                for data in download_response.iter_content(chunk_size=chunk_size):
+                    size = file.write(data)
+                    bar.update(size)
+        else:
+            with open(fname, "wb") as file:
+                for data in download_response.iter_content(chunk_size=chunk_size):
+                    size = file.write(data)
+        return True
+    return False

 def _get_user_id_from_api_key(api_key: str) -> (str, str):
     user_url = ServerConstants.get_user_url()
@@ -231,10 +474,12 @@ def _get_storage_service(service):
     else:
         raise NotImplementedError(f"Service {service} not implemented")

-def _check_data_path(data_path):
-    if os.path.isdir(data_path) or os.path.isfile(data_path):
-        return True
-    return False
+def _get_data_type(data_path):
+    if os.path.isdir(data_path):
+        return DataType.DIRECTORY
+    elif os.path.isfile(data_path):
+        return DataType.FILE
+    return DataType.INVALID

 def _archive_data(data_path: str) -> (str, str):
diff --git a/python/fedml/api/modules/utils.py b/python/fedml/api/modules/utils.py
index 76801ffe81..abbea71f9f 100644
--- a/python/fedml/api/modules/utils.py
+++ b/python/fedml/api/modules/utils.py
@@ -21,7 +21,7 @@ def _check_api_key(api_key=None):
     if api_key is None or api_key == "":
         saved_api_key = get_api_key()
         if saved_api_key is None or saved_api_key == "":
-            api_key = click.prompt("FedML® Launch API Key is not set yet, please input your API key")
+            api_key = click.prompt("TensorOpera® Launch API Key is not set yet, please input your API key")
         else:
             api_key = saved_api_key
diff --git a/python/fedml/cli/README.md b/python/fedml/cli/README.md
index f94200f258..425bf0c5de 100644
--- a/python/fedml/cli/README.md
+++ b/python/fedml/cli/README.md
@@ -27,7 +27,7 @@ fedml build \
   --ignore __pycache__,*.git
 ```
-## 2. Login into the FedML® Nexus AI Platform (fedml.ai)
+## 2. Log in to the TensorOpera® Nexus AI Platform (fedml.ai)
 login as general computing device with local pip mode:
 ```
 fedml login $YourApiKey
 ```
 login as federated-learning server with local pip mode:
 ```
 fedml login $YourApiKey -s
 ```
-### 2.1. Examples for Logining into the FedML® Nexus AI Platform (fedml.ai)
+### 2.1. Examples for Logging in to the TensorOpera® Nexus AI Platform (fedml.ai)
 ```
 fedml login 113343dad999933
 ```
 ```
 fedml login 113343dad999933 -s
 ```
-## 3. Logout from the FedML FedML® Nexus AI Platform (fedml.ai)
+## 3. Logout from the TensorOpera® Nexus AI Platform (fedml.ai)
 logout from computing device with local pip mode:
 ```
 fedml logout
@@ -81,17 +81,17 @@ fedml diagnosis --open --s3 --mqtt
 ```
 ## 7. Jobs
-Start a job at FedML® Nexus AI Platform
+Start a job at TensorOpera® Nexus AI Platform
 ```
 Usage: fedml jobs start [OPTIONS]

-Start a job at FedML® Nexus AI Platform
+Start a job at TensorOpera® Nexus AI Platform

 Options:
 -pf, --platform TEXT The platform name at the MLOps platform(options: octopus, parrot, spider, beehive). 
--prj, --project_name TEXT The project name at FedML® Nexus AI Platform --app, --application_name TEXT Application name in the My Application list at FedML® Nexus AI Platform --jn, --job_name TEXT The job name at FedML® Nexus AI Platform If you don't specify here, the job name from the job yaml file will be used. +-prj, --project_name TEXT The project name at TensorOpera® Nexus AI Platform +-app, --application_name TEXT Application name in the My Application list at TensorOpera® Nexus AI Platform +-jn, --job_name TEXT The job name at TensorOpera® Nexus AI Platform If you don't specify here, the job name from the job yaml file will be used. -ds, --devices_server TEXT The server to run the launching job, for the launch platform, we do not need to set this option. -de, --devices_edges TEXT The edge devices to run the launching job. Separated with ',', e.g. 705,704. For the launch platform, we do not need to set this option. -u, --user TEXT user id or api key. @@ -238,7 +238,7 @@ You just need to customize the following config items. 3. `bootstrap`, It is the bootstrap shell command which will be executed before running entry commands. -Then you can use the following example CLI to launch the job at FedML® Nexus AI Platform +Then you can use the following example CLI to launch the job at TensorOpera® Nexus AI Platform (Replace $YourApiKey with your own account API key from open.fedml.ai) Example: @@ -248,7 +248,7 @@ fedml launch hello_job.yaml After the launch CLI is executed, the output is as follows. Here you may open the job url to confirm and actually start the job. ``` -Submitting your job to FedML® Nexus AI Platform: 100%|████████████████████████████████████████████████████████████████████████████████████████| 6.07k/6.07k [00:01<00:00, 4.94kB/s] +Submitting your job to TensorOpera® Nexus AI Platform: 100%|████████████████████████████████████████████████████████████████████████████████████████| 6.07k/6.07k [00:01<00:00, 4.94kB/s] Searched and matched the following GPU resource for your job: +-----------+-------------------+---------+------------+-------------------------+---------+-------+----------+ diff --git a/python/fedml/cli/modules/build.py b/python/fedml/cli/modules/build.py index 4674a88e9e..2fd68492fd 100644 --- a/python/fedml/cli/modules/build.py +++ b/python/fedml/cli/modules/build.py @@ -3,14 +3,14 @@ import fedml.api -@click.command("build", help="Build packages for the FedML® Nexus AI Platform") +@click.command("build", help="Build packages for the TensorOpera® AI Platform") @click.help_option("--help", "-h") @click.option( "--platform", "-pf", type=str, default="octopus", - help="The platform name at the FedML® Nexus AI Platform (options: octopus, parrot, spider, beehive, falcon, launch).", + help="The platform name at the TensorOpera® AI Platform (options: octopus, parrot, spider, beehive, falcon, launch).", ) @click.option( "--type", diff --git a/python/fedml/cli/modules/cluster.py b/python/fedml/cli/modules/cluster.py index 95822e1c18..47617b1f12 100644 --- a/python/fedml/cli/modules/cluster.py +++ b/python/fedml/cli/modules/cluster.py @@ -7,10 +7,10 @@ # Message strings constants confirmation_message: str = "Are you sure you want to {} these clusters?" failure_message: str = ("Failed to {} the clusters, please check the arguments are valid and your network " - "connection and make sure be able to access the FedML® Nexus AI Platform.") -version_help: str = "specify version of FedML® Nexus AI Platform. 
It should be dev, test or release" + "connection and make sure be able to access the TensorOpera® AI Platform.") +version_help: str = "specify version of TensorOpera® AI Platform. It should be dev, test or release" api_key_help: str = "user api key." -cluster_action_help: str = "{} clusters from FedML® Nexus AI Platform" +cluster_action_help: str = "{} clusters from TensorOpera® AI Platform" @click.group("cluster") @@ -27,7 +27,7 @@ ) def fedml_clusters(api_key, version): """ - Manage clusters on FedML® Nexus AI Platform + Manage clusters on TensorOpera® AI Platform """ pass diff --git a/python/fedml/cli/modules/device.py b/python/fedml/cli/modules/device.py index 5c4804fa69..b21b3d09d2 100644 --- a/python/fedml/cli/modules/device.py +++ b/python/fedml/cli/modules/device.py @@ -7,12 +7,12 @@ @click.help_option("--help", "-h") def fedml_device(): """ - Bind/unbind devices to the FedML® Nexus AI Platform + Bind/unbind devices to the TensorOpera® AI Platform """ pass -@fedml_device.command("bind", help="Bind to the FedML® Nexus AI Platform") +@fedml_device.command("bind", help="Bind to the TensorOpera® AI Platform") @click.help_option("--help", "-h") @click.argument("api_key", nargs=-1) @click.option( @@ -20,13 +20,13 @@ def fedml_device(): "-v", type=str, default="release", - help="Bind to which version of FedML® Nexus AI Platform. It should be dev, test or release.", + help="Bind to which version of TensorOpera® AI Platform. It should be dev, test or release.", ) @click.option( "--compute_node", "-c", default=None, is_flag=True, help="Bind as the general compute node in FEDML Nexus AI compute network. This is enabled by default. " - "After binding, you can view and manage the device in the FEDML® Nexus AI Platform: https://fedml.ai/compute. " - "It can be grouped as a cluster and then you can use FEDML®Launch to schedule any job (training, deployment, federated learning) to it. " + "After binding, you can view and manage the device in the TensorOpera® AI Platform: https://tensoropera.ai/gpu/local?label=Private. " + "It can be grouped as a cluster and then you can use TensorOpera®Launch to schedule any job (training, deployment, federated learning) to it. " "You can not specify the option -c and -s simultaneously.", ) @click.option( @@ -36,7 +36,7 @@ def fedml_device(): ) @click.option( "--provider", "-p", default=None, is_flag=True, - help="Bind as the FedML compute node (GPU) provider (supplier). This is used by Nexus AI Platform - Share and Earn: https://fedml.ai/gpu-supplier. You can share your GPUs in this way and earn money. " + help="Bind as the FedML compute node (GPU) provider (supplier). This is used by Nexus AI Platform - Share and Earn: https://tensoropera.ai/share-and-earn. You can share your GPUs in this way and earn money. " "You can specify the option -p and -c simultaneously (can be used as provider for others as well compute node for your own jobs), but you can not specify -p and -s simultaneously.", ) def fedml_device_bind(api_key, version, compute_node, server, provider): @@ -47,14 +47,14 @@ def fedml_device_bind(api_key, version, compute_node, server, provider): fedml.api.device_bind(api_key, compute_node, server, provider) -@fedml_device.command("unbind", help="Unbind from the FedML® Nexus AI Platform") +@fedml_device.command("unbind", help="Unbind from the TensorOpera® AI Platform") @click.help_option("--help", "-h") @click.option( "--version", "-v", type=str, default="release", - help="Unbind which backend environment version of FedML® Nexus AI Platform. 
It should be dev, test, or release.", + help="Unbind which backend environment version of TensorOpera® AI Platform. It should be dev, test, or release.", ) @click.option( "--compute_node", "-c", default=None, is_flag=True, help="Unbind from the FedML general compute node.", @@ -75,7 +75,7 @@ def fedml_device_unbind(version, computing, server): "-v", type=str, default="release", - help="show resource type at which version of FedML® Nexus AI Platform. It should be dev, test or release", + help="show resource type at which version of TensorOpera® AI Platform. It should be dev, test or release", ) def resource_type(version): fedml.set_env_version(version) diff --git a/python/fedml/cli/modules/federate.py b/python/fedml/cli/modules/federate.py index 6f26b2bea8..ff4fd6c791 100644 --- a/python/fedml/cli/modules/federate.py +++ b/python/fedml/cli/modules/federate.py @@ -7,12 +7,12 @@ @click.help_option("--help", "-h") def fedml_federate(): """ - Manage federated learning resources on FedML® Nexus AI Platform + Manage federated learning resources on TensorOpera® AI Platform """ pass -@fedml_federate.command("build", help="Build federate packages for the FedML® Nexus AI Platform.") +@fedml_federate.command("build", help="Build federate packages for the TensorOpera® AI Platform.") @click.help_option("--help", "-h") @click.option( "--dest_folder", diff --git a/python/fedml/cli/modules/launch.py b/python/fedml/cli/modules/launch.py index 16450e08a9..c14bbac353 100644 --- a/python/fedml/cli/modules/launch.py +++ b/python/fedml/cli/modules/launch.py @@ -13,7 +13,7 @@ from fedml.computing.scheduler.scheduler_entry.run_manager import FedMLRunStartedModel, FeatureEntryPoint -@click.command("launch", help="Launch job at the FedML® Nexus AI Platform") +@click.command("launch", help="Launch job at the TensorOpera® AI Platform") @click.help_option("--help", "-h") @click.option( "--api_key", "-k", type=str, help="user api key.", @@ -56,7 +56,7 @@ @click.argument("yaml_file", nargs=-1) def fedml_launch(yaml_file, cluster, version, api_key, group, local_on_premise_platform, local_on_premise_platform_port): """ - Manage resources on the FedML® Nexus AI Platform. + Manage resources on the TensorOpera® AI Platform. """ set_env_version(version) fedml.set_local_on_premise_platform_host(local_on_premise_platform) diff --git a/python/fedml/cli/modules/login.py b/python/fedml/cli/modules/login.py index f2e4d76322..5e77910cbb 100644 --- a/python/fedml/cli/modules/login.py +++ b/python/fedml/cli/modules/login.py @@ -1,12 +1,16 @@ import os +from enum import Enum import click import fedml.api from fedml.api.modules.utils import authenticate +from fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants +from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants +from fedml.computing.scheduler.scheduler_core.general_constants import MarketplaceType -@click.command("login", help="Login the FedML® Nexus AI Platform") +@click.command("login", help="Login the TensorOpera® AI Platform") @click.help_option("--help", "-h") @click.argument("api_key", nargs=-1) @click.option( @@ -14,13 +18,13 @@ "-v", type=str, default="release", - help="Login which backend environment version of FedML® Nexus AI Platform. It should be dev, test, or release.", + help="Login which backend environment version of TensorOpera® AI Platform. 
It should be dev, test, or release.", ) @click.option( "--compute_node", "-c", default=None, is_flag=True, help="Login as the general compute node in FEDML Nexus AI compute network. This is enabled by default. " - "After login, you can view and manage the device in the FEDML® Nexus AI Platform: https://fedml.ai/compute. " - "It can be grouped as a cluster and then you can use FEDML®Launch to schedule any job (training, deployment, federated learning) to it. " + "After login, you can view and manage the device in the TensorOpera® AI Platform: https://tensoropera.ai/gpu/local?label=Private. " + "It can be grouped as a cluster and then you can use TensorOpera®Launch to schedule any job (training, deployment, federated learning) to it. " "You can not specify the option -c and -s simultaneously.", ) @click.option( @@ -30,7 +34,7 @@ ) @click.option( "--provider", "-p", default=None, is_flag=True, - help="Login as the FedML compute node (GPU) provider (supplier). This is used by Nexus AI Platform - Share and Earn: https://fedml.ai/gpu-supplier. You can share your GPUs in this way and earn money. " + help="Login as the FedML compute node (GPU) provider (supplier). This is used by Nexus AI Platform - Share and Earn: https://tensoropera.ai/share-and-earn. You can share your GPUs in this way and earn money. " "You can specify the option -p and -c simultaneously (can be used as provider for others as well compute node for your own jobs), but you can not specify -p and -s simultaneously.", ) @click.option( @@ -51,13 +55,71 @@ default=80, help="The port for local on-premise Nexus AI Platform.", ) +@click.option( + "--master_inference_gateway_port", + "-mgp", + type=int, + default=ServerConstants.MODEL_INFERENCE_DEFAULT_PORT, + help="The port for master inference gateway.", +) +@click.option( + "--worker_inference_proxy_port", + "-wpp", + type=int, + default=ClientConstants.LOCAL_CLIENT_API_PORT, + help="The port for worker inference proxy.", +) +@click.option( + "--worker_connection_type", + "-wct", + type=str, + default=ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT, + help="The connection type for worker inference proxy.", +) +@click.option( + "--marketplace_type", + "-mpt", + type=click.Choice([marketplace_type for marketplace_type in MarketplaceType.__members__]), + default=MarketplaceType.SECURE.name, + help="Specify the marketplace type: 'SECURE' for Secure Cloud or 'COMMUNITY' for Community Cloud. " + "Defaults to Secure Cloud.", +) +@click.option( + "--price_per_hour", + "-pph", + type=click.FLOAT, + default=0.0, + help="Enter the price per GPU per hour as a non-negative floating-point number between 0.0 and 1000.0. For " + "example, if the cost of using an H100 node" + "for one hour is $1.5 per GPU, then you would input 1.5. Do not multiply this number by the total number of " + "GPUs in the node, as the system will automatically detect the number of GPUs and include it in the cost " + "calculation. Default is 0.0." + "Optionally, you can also set this price later through supplier page on the TensorOpera® AI Platform." 
+) +@click.option( + "--name", + "-n", + type=str, + default="", + help="Name of the node.", +) def fedml_login( api_key, version, compute_node, server, provider, deploy_worker_num, - local_on_premise_platform, local_on_premise_platform_port): + local_on_premise_platform, local_on_premise_platform_port, + master_inference_gateway_port, worker_inference_proxy_port, worker_connection_type, marketplace_type, + price_per_hour, name +): fedml.set_env_version(version) fedml.set_local_on_premise_platform_host(local_on_premise_platform) fedml.set_local_on_premise_platform_port(local_on_premise_platform_port) + try: + price_per_hour = float(price_per_hour) + except ValueError as e: + raise click.BadParameter(str(e), param_hint="price_per_hour") + + __validate_mpt_pph(marketplace_type, price_per_hour) + api_key = api_key[0] if len(api_key) > 0 else None try: authenticate(api_key) @@ -66,4 +128,16 @@ def fedml_login( print(f"Maybe you are using account id to login, we will try to login with account {api_key}.") pass os.environ["FEDML_MODEL_WORKER_NUM"] = str(deploy_worker_num) - fedml.api.login(api_key, compute_node, server, provider) + fedml.api.login(api_key, compute_node, server, provider, master_inference_gateway_port, + worker_inference_proxy_port, worker_connection_type, marketplace_type, price_per_hour, name) + + +def __validate_mpt_pph(marketplace_type, price_per_hour): + try: + MarketplaceType.from_str(marketplace_type) + except ValueError as e: + raise click.BadParameter(str(e), param_hint="marketplace_type") + + if price_per_hour < 0 or price_per_hour > 1000: + raise click.BadParameter(f"Price per hour should be a non-negative float ranging between 0 and 1000. Current " + f"input value {price_per_hour} is not valid", param_hint="price_per_hour") diff --git a/python/fedml/cli/modules/logout.py b/python/fedml/cli/modules/logout.py index 94a51b395a..ab2abfde95 100644 --- a/python/fedml/cli/modules/logout.py +++ b/python/fedml/cli/modules/logout.py @@ -3,7 +3,7 @@ import fedml.api -@click.command("logout", help="Logout from the FedML® Nexus AI Platform") +@click.command("logout", help="Logout from the TensorOpera® AI Platform") @click.help_option("--help", "-h") @click.option( "--computing", "-c", default=None, is_flag=True, help="Logout from the FedML general compute node.", @@ -16,7 +16,7 @@ "-v", type=str, default="release", - help="Logout which backend environment version of FedML® Nexus AI Platform. It should be dev, test, or release.", + help="Logout which backend environment version of TensorOpera® AI Platform. It should be dev, test, or release.", ) def fedml_logout(computing, server, version): fedml.set_env_version(version) diff --git a/python/fedml/cli/modules/run.py b/python/fedml/cli/modules/run.py index b4e8a947fd..a2c479897b 100644 --- a/python/fedml/cli/modules/run.py +++ b/python/fedml/cli/modules/run.py @@ -15,24 +15,24 @@ "-v", type=str, default="release", - help="version of FedML® Nexus AI Platform. It should be dev, test or release", + help="version of TensorOpera® AI Platform. It should be dev, test or release", ) @click.option( "--platform", "-pf", type=str, default="falcon", - help="The platform name at the FedML® Nexus AI Platform (options: octopus, parrot, spider, beehive, falcon, launch," + help="The platform name at the TensorOpera® AI Platform (options: octopus, parrot, spider, beehive, falcon, launch," "default is falcon).", ) def fedml_run(api_key, version, platform): """ - Manage runs on the FedML® Nexus AI Platform. + Manage runs on the TensorOpera® AI Platform. 
""" pass -@fedml_run.command("stop", help="Stop a run from the FedML® Nexus AI Platform.") +@fedml_run.command("stop", help="Stop a run from the TensorOpera® AI Platform.") @click.help_option("--help", "-h") @click.option( "--run_id", @@ -49,14 +49,14 @@ def fedml_run(api_key, version, platform): "-v", type=str, default="release", - help="stop a run at which version of FedML® Nexus AI Platform. It should be dev, test or release", + help="stop a run at which version of TensorOpera® AI Platform. It should be dev, test or release", ) @click.option( "--platform", "-pf", type=str, default="falcon", - help="The platform name at the FedML® Nexus AI Platform (options: octopus, parrot, spider, beehive, falcon, launch, " + help="The platform name at the TensorOpera® AI Platform (options: octopus, parrot, spider, beehive, falcon, launch, " "default is falcon).", ) def stop_run(platform, run_id, api_key, version): @@ -68,14 +68,14 @@ def stop_run(platform, run_id, api_key, version): click.echo(f"Failed to stop Run {run_id}. Please check if the run id is valid.") -@fedml_run.command("list", help="List runs from the FedML® Nexus AI Platform.") +@fedml_run.command("list", help="List runs from the TensorOpera® AI Platform.") @click.help_option("--help", "-h") @click.option( "--platform", "-pf", type=str, default="falcon", - help="The platform name at the FedML® Nexus AI Platform (options: octopus, parrot, spider, beehive, falcon, launch, " + help="The platform name at the TensorOpera® AI Platform (options: octopus, parrot, spider, beehive, falcon, launch, " "default is falcon).", ) @click.option( @@ -83,14 +83,14 @@ def stop_run(platform, run_id, api_key, version): "-r", type=str, default="", - help="Run name at the FedML® Nexus AI Platform.", + help="Run name at the TensorOpera® AI Platform.", ) @click.option( "--run_id", "-rid", type=str, default="", - help="Run id at the FedML® Nexus AI Platform.", + help="Run id at the TensorOpera® AI Platform.", ) @click.option( "--api_key", "-k", type=str, help="user api key.", @@ -100,7 +100,7 @@ def stop_run(platform, run_id, api_key, version): "-v", type=str, default="release", - help="list runs at which version of FedML® Nexus AI Platform. It should be dev, test or release", + help="list runs at which version of TensorOpera® AI Platform. 
It should be dev, test or release", ) def list_runs(platform, run_name, run_id, api_key, version): fedml.set_env_version(version) @@ -109,14 +109,14 @@ def list_runs(platform, run_name, run_id, api_key, version): _print_run_table(run_list_obj) -@fedml_run.command("status", help="Get status of run from the FedML® Nexus AI Platform.") +@fedml_run.command("status", help="Get status of run from the TensorOpera® AI Platform.") @click.help_option("--help", "-h") @click.option( "--platform", "-pf", type=str, default="falcon", - help="The platform name at the FedML® Nexus AI Platform (options: octopus, parrot, spider, beehive, falcon, launch, " + help="The platform name at the TensorOpera® AI Platform (options: octopus, parrot, spider, beehive, falcon, launch, " "default is falcon).", ) @click.option( @@ -124,14 +124,14 @@ def list_runs(platform, run_name, run_id, api_key, version): "-r", type=str, default=None, - help="Run name at the FedML® Nexus AI Platform.", + help="Run name at the TensorOpera® AI Platform.", ) @click.option( "--run_id", "-rid", type=str, default=None, - help="Run id at the FedML® Nexus AI Platform.", + help="Run id at the TensorOpera® AI Platform.", ) @click.option( "--api_key", "-k", type=str, help="user api key.", @@ -141,7 +141,7 @@ def list_runs(platform, run_name, run_id, api_key, version): "-v", type=str, default="release", - help="get status of run at which version of FedML® Nexus AI Platform. It should be dev, test or release", + help="get status of run at which version of TensorOpera® AI Platform. It should be dev, test or release", ) def status(platform, run_name, run_id, api_key, version): fedml.set_env_version(version) @@ -153,14 +153,14 @@ def status(platform, run_name, run_id, api_key, version): _print_run_table(run_list_obj) -@fedml_run.command("logs", help="Get logs of run from the FedML® Nexus AI Platform.") +@fedml_run.command("logs", help="Get logs of run from the TensorOpera® AI Platform.") @click.help_option("--help", "-h") @click.option( "--platform", "-pf", type=str, default="falcon", - help="The platform name at the FedML® Nexus AI Platform (options: octopus, parrot, spider, beehive, falcon, launch, " + help="The platform name at the TensorOpera® AI Platform (options: octopus, parrot, spider, beehive, falcon, launch, " "default is falcon).", ) @click.option( @@ -168,7 +168,7 @@ def status(platform, run_name, run_id, api_key, version): "-rid", type=str, default=None, - help="Run id at the FedML® Nexus AI Platform.", + help="Run id at the TensorOpera® AI Platform.", ) @click.option( "--api_key", "-k", type=str, help="user api key.", @@ -178,27 +178,27 @@ def status(platform, run_name, run_id, api_key, version): "-v", type=str, default="release", - help="get logs of run at which version of FedML® Nexus AI Platform. It should be dev, test or release", + help="get logs of run at which version of TensorOpera® AI Platform. It should be dev, test or release", ) @click.option( "--page_num", "-pn", type=int, - default=0, + default=1, help="request page num for logs. --need_all_logs should be set to False if you want to use this option.", ) @click.option( "--page_size", "-ps", type=int, - default=0, + default=10, help="request page size for logs, --need_all_logs should be set to False if you want to use this option.", ) @click.option( "--need_all_logs", "-a", type=bool, - default=True, + default=False, help="boolean value representing if all logs are needed. 
Defaults to False.",
 )
 def logs(platform, run_id, api_key, version, page_num, page_size, need_all_logs):
@@ -217,8 +217,8 @@ def logs(platform, run_id, api_key, version, page_num, page_size, need_all_logs)
         return

     # Show run log summary info
-    log_head_table = PrettyTable(['Run ID', 'Total Log Lines', 'Log URL'])
-    log_head_table.add_row([run_id, run_log_result.total_log_lines, run_logs.log_full_url])
+    log_head_table = PrettyTable(['Run ID', 'Printed Log Lines', 'Total Log Lines', 'Log URL'])
+    log_head_table.add_row([run_id, len(run_log_result.log_line_list), run_logs.total_num, run_logs.log_full_url])
     click.echo("\nLogs summary info is as follows.")
     print(log_head_table)
@@ -234,7 +234,7 @@ def logs(platform, run_id, api_key, version, page_num, page_size, need_all_logs)
     if len(run_log_result.log_line_list) > 0:
         click.echo("\nAll logs is as follows.")
         for log_line in run_log_result.log_line_list:
-            click.echo(log_line.rstrip('\n'))
+            click.echo(log_line)

 def _print_run_table(run_list_obj):
diff --git a/python/fedml/cli/modules/storage.py b/python/fedml/cli/modules/storage.py
index 93ce273e92..8b75075289 100644
--- a/python/fedml/cli/modules/storage.py
+++ b/python/fedml/cli/modules/storage.py
@@ -12,7 +12,7 @@ from fedml.api.fedml_response import ResponseCode

 # Message strings constants
-version_help: str = "specify version of FedML® Nexus AI Platform. It should be dev, test or release"
+version_help: str = "specify version of TensorOpera® AI Platform. It should be dev, test or release"
 api_key_help: str = "user api key."
@@ -31,7 +31,7 @@
 )
 def fedml_storage(api_key, version):
     """
-    Manage storage on FedML® Nexus AI Platform
+    Manage storage on TensorOpera® AI Platform
     """
     pass
@@ -43,16 +43,17 @@ def validate_argument(ctx, param, value):
     return value

-@fedml_storage.command("upload", help="Upload data on FedML® Nexus AI Platform")
+@fedml_storage.command("upload", help="Upload data on TensorOpera® AI Platform")
 @click.help_option("--help", "-h")
 @click.argument("data_path", nargs=1, callback=validate_argument)
 @click.option("--name", "-n", type=str, help="Name your data to store. If not provided, the name will be the same as "
-                                             "the data file or directory name.")
+                                             "the data file or directory name. For files, the extension need not be included.")
 @click.option("--description", "-d", type=str, help="Add description to your data to store. If not provided, "
                                                     "the description will be empty.")
 @click.option("--user_metadata", "-um", type=str, help="User-defined metadata in the form of a dictionary, for instance, "
                                                        " {'name':'value'} within double quotes. "" "
                                                        "Defaults to None.")
+@click.option("--tags", "-t", type=str, help="Add tags to your data to store. Give tags in comma-separated form, e.g. 'cv,unet,segmentation'. If not provided, the tags will be empty.")
 @click.option('--service', "-s", type=click.Choice(['R2']), default="R2", help="Storage service for object storage. 
" "Only R2 is supported as of now") @click.option( @@ -65,10 +66,11 @@ def validate_argument(ctx, param, value): default="release", help=version_help, ) -def upload(data_path: str, name: str, user_metadata: str, description: str, version: str, api_key: str, service): +def upload(data_path: str, name: str, user_metadata: str, description: str, version: str, api_key: str, tags:str, service): metadata = _parse_metadata(user_metadata) + tag_list = _parse_tags(tags) fedml.set_env_version(version) - response = fedml.api.upload(data_path=data_path, api_key=api_key, name=name, service=service, show_progress=True, + response = fedml.api.upload(data_path=data_path, api_key=api_key, name=name, tag_list = tag_list, service=service, show_progress=True, description=description, metadata=metadata) if response.code == ResponseCode.SUCCESS: click.echo(f"Data uploaded successfully. | url: {response.data}") @@ -76,7 +78,7 @@ def upload(data_path: str, name: str, user_metadata: str, description: str, vers click.echo(f"Failed to upload data. Error message: {response.message}") -@fedml_storage.command("list", help="List data stored on FedML® Nexus AI Platform") +@fedml_storage.command("list", help="List data stored on TensorOpera® AI Platform") @click.help_option("--help", "-h") @click.option( "--api_key", "-k", type=str, help=api_key_help, @@ -96,17 +98,17 @@ def list_data(version, api_key): if not response.data: click.echo(f"No stored objects found for account linked with apikey: {api_key}") return - object_list_table = PrettyTable(["Data Name", "Data Size", "Description", "Created At", "Updated At"]) + object_list_table = PrettyTable(["Data Name", "Data Size", "Description", "Data Tags","Created At", "Updated At"]) for stored_object in response.data: object_list_table.add_row( - [stored_object.dataName, stored_object.size, stored_object.description, stored_object.createdAt, stored_object.updatedAt]) + [stored_object.dataName, stored_object.size, stored_object.description, stored_object.tag_list,stored_object.createdAt, stored_object.updatedAt]) click.echo(object_list_table) else: click.echo(f"Failed to list stored objects for account linked with apikey {api_key}. " f"Error message: {response.message}") -@fedml_storage.command("get-user-metadata", help="Get user-defined metadata of data object stored on FedML® Nexus AI " +@fedml_storage.command("get-user-metadata", help="Get user-defined metadata of data object stored on TensorOpera® AI " "Platform") @click.help_option("--help", "-h") @click.argument("data_name", nargs=1, callback=validate_argument) @@ -134,7 +136,7 @@ def get_user_metadata(data_name, version, api_key): click.echo(f"Failed to fetch user-metadata for {data_name}. 
Error message: {response.message}") -@fedml_storage.command("get-metadata", help="Get metadata of data object stored on FedML® Nexus AI Platform") +@fedml_storage.command("get-metadata", help="Get metadata of data object stored on TensorOpera® AI Platform") @click.help_option("--help", "-h") @click.argument("data_name", nargs=1, callback=validate_argument) @click.option( @@ -157,15 +159,15 @@ def get_metadata(data_name, version, api_key): return click.echo(f"Successfully fetched metadata for object {data_name}:") # Todo (alaydshah): Add file size and tags - metadata_table = PrettyTable(["Data Name","Data Size","Description", "Created At", "Updated At"]) - metadata_table.add_row([metadata.dataName,metadata.size,metadata.description, metadata.createdAt, metadata.updatedAt]) + metadata_table = PrettyTable(["Data Name","Data Size","Description","Data Tags","Created At", "Updated At"]) + metadata_table.add_row([metadata.dataName,metadata.size,metadata.description,metadata.tag_list,metadata.createdAt, metadata.updatedAt]) click.echo(metadata_table) click.echo("") else: click.echo(f"Fetching metadata failed. Error message: {response.message}") -@fedml_storage.command("download", help="Download data stored on FedML® Nexus AI Platform") +@fedml_storage.command("download", help="Download data stored on TensorOpera® AI Platform") @click.help_option("--help", "-h") @click.argument("data_name", nargs=1, callback=validate_argument) @click.option("--dest_path", "-d", default=None, type=str, help="Destination path to download data. By default, " @@ -192,7 +194,7 @@ def download(data_name, dest_path, version, api_key, service): click.echo(f"Failed to download data {data_name}. Error message: {response.message}") -@fedml_storage.command("delete", help="Delete data stored on FedML® Nexus AI Platform") +@fedml_storage.command("delete", help="Delete data stored on TensorOpera® AI Platform") @click.argument("data_name", nargs=1, callback=validate_argument) @click.help_option("--help", "-h") @click.option( @@ -238,3 +240,9 @@ def _parse_metadata(metadata: str): click.echo( f"Input metadata cannot be evaluated. Please make sure metadata is in the correct format. 
Error: {e}.") exit() + +def _parse_tags(tags:str): + if not tags: + return [] + tag_list = tags.split(",") + return tag_list \ No newline at end of file diff --git a/python/fedml/cli/modules/train.py b/python/fedml/cli/modules/train.py index b4c36d1663..ae9c5fcbb1 100644 --- a/python/fedml/cli/modules/train.py +++ b/python/fedml/cli/modules/train.py @@ -7,12 +7,12 @@ @click.help_option("--help", "-h") def fedml_train(): """ - Manage training resources on FedML® Nexus AI Platform + Manage training resources on TensorOpera® AI Platform """ pass -@fedml_train.command("build", help="Build training packages for the FedML® Nexus AI Platform.") +@fedml_train.command("build", help="Build training packages for the TensorOpera® AI Platform.") @click.help_option("--help", "-h") @click.option( "--dest_folder", diff --git a/python/fedml/computing/scheduler/comm_utils/constants.py b/python/fedml/computing/scheduler/comm_utils/constants.py index b1294181bb..6e46ce207b 100644 --- a/python/fedml/computing/scheduler/comm_utils/constants.py +++ b/python/fedml/computing/scheduler/comm_utils/constants.py @@ -78,12 +78,10 @@ class SchedulerConstants: ENDPOINT_INFERENCE_READY_TIMEOUT = 15 ENDPOINT_STATUS_CHECK_TIMEOUT = 60 * 3 - MQTT_INFERENCE_TIMEOUT = 60 * 6 - TRAIN_PROVISIONING_TIMEOUT = 60 * 25 TRAIN_STARTING_TIMEOUT = 60 * 15 TRAIN_STOPPING_TIMEOUT = 60 * 5 - TRAIN_RUNNING_TIMEOUT = 60 * 60 * 12 + TRAIN_RUNNING_TIMEOUT = 60 * 60 * 24 * 2000 TRAIN_INIT_TIMEOUT = 60 * 5 PUBLIC_REDIS_PORT = 6379 @@ -103,7 +101,7 @@ class SchedulerConstants: RUN_PROCESS_TYPE_BOOTSTRAP_PROCESS = "bootstrap-process" FEDML_DEFAULT_LAUNCH_CONTAINER_PREFIX = "fedml_default_launch_container" - FEDML_DEFAULT_LAUNCH_IMAGE = "fedml/fedml-default-launch:cu12.1-u22.04" + FEDML_DEFAULT_LAUNCH_IMAGE = "fedml/fedml-launch-job:cu12.1-u22.04" FEDML_DEFAULT_LOG_DIR = ".fedml/fedml-client/fedml/logs" FEDML_DEFAULT_DATA_DIR = ".fedml/fedml-client/fedml/data" @@ -111,6 +109,11 @@ class SchedulerConstants: IMAGE_PULL_POLICY_IF_NOT_PRESENT = "IfNotPresent" IMAGE_PULL_POLICY_NEVER = "Never" + REDIS_INFER_HOST = "127.0.0.1" + REDIS_ADDR = "local" + REDIS_PORT = "6379" + REDIS_PASSWORD = "fedml_default" + @staticmethod def get_log_source(run_json): run_config = run_json.get("run_config", {}) diff --git a/python/fedml/computing/scheduler/comm_utils/container_utils.py b/python/fedml/computing/scheduler/comm_utils/container_utils.py index f337dd9997..2f5fa31fb5 100644 --- a/python/fedml/computing/scheduler/comm_utils/container_utils.py +++ b/python/fedml/computing/scheduler/comm_utils/container_utils.py @@ -2,16 +2,18 @@ import os import traceback import datetime +from typing import List + from dateutil.parser import isoparse import docker from docker import errors from fedml.computing.scheduler.comm_utils import sys_utils +from fedml.computing.scheduler.comm_utils.hardware_utils import HardwareUtil from fedml.core.common.singleton import Singleton from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants import time -from GPUtil import getGPUs class ContainerUtils(Singleton): @@ -225,9 +227,8 @@ def pull_image_with_policy(self, image_pull_policy, image_name, client=None): raise Exception(f"Unsupported image pull policy: {image_pull_policy}") class ContainerMetrics: - def __init__(self, cpu_percent, mem_used_megabytes, mem_avail_megabytes, network_recv_megabytes, network_sent_megabytes, - blk_read_megabytes, blk_write_megabytes, timestamp, gpus_stat - ): + def __init__(self, cpu_percent, mem_used_megabytes, mem_avail_megabytes, 
network_recv_megabytes, + network_sent_megabytes, blk_read_megabytes, blk_write_megabytes, timestamp, gpus_stat): self.cpu_percent = cpu_percent self.mem_used_megabytes = mem_used_megabytes self.mem_avail_megabytes = mem_avail_megabytes @@ -252,7 +253,7 @@ def get_container_perf(self, c_name) -> ContainerMetrics: CPU % MEM USAGE / LIMIT MEM % NET I/O BLOCK I/O 0.26% 8.703GiB / 503.5GiB 1.73% 17.4GB / 176MB 545kB / 20.9GB - GPU: We currently use GPUtil to get the GPU stats on host machine since one GPU is not + GPU: We currently use HardwareUtil to get the GPU stats on host machine since one GPU is not shared by multiple containers (TODO: get the GPU stats inside the container) """ @@ -320,7 +321,7 @@ def get_container_perf(self, c_name) -> ContainerMetrics: round(blk_read_bytes / (1024 * 1024), 1), round(blk_write_bytes / (1024 * 1024), 1)) # Calculate the gpu usage - gpus_stat = self.generate_container_gpu_stats(c_name) + gpus_stat = self.generate_container_gpu_stats(container_name=c_name) # Record timestamp timestamp = stats["read"] @@ -328,39 +329,27 @@ def get_container_perf(self, c_name) -> ContainerMetrics: return ContainerUtils.ContainerMetrics(cpu_percent, mem_gb_used, mem_gb_avail, recv_megabytes, sent_megabytes, blk_read_bytes, blk_write_bytes, timestamp, gpus_stat) - def generate_container_gpu_stats(self, c_name): - gpu_ids = self.get_gpu_ids_by_container_name(c_name) + def generate_container_gpu_stats(self, container_name): + client = self.get_docker_client() + gpu_ids = HardwareUtil.get_docker_gpu_ids_by_container_name(container_name=container_name, docker_client=client) gpu_stats = self.gpu_stats(gpu_ids) return gpu_stats - def get_gpu_ids_by_container_name(self, c_name): - client = self.get_docker_client() - gpu_ids = [] - try: - gpu_ids = client.api.inspect_container(c_name)["HostConfig"]["DeviceRequests"][0]["DeviceIDs"] - gpu_ids = list(map(int, gpu_ids)) - except Exception as e: - logging.error(f"Failed to get GPU IDs: {e}") - pass - - return gpu_ids - @staticmethod - def gpu_stats(gpu_ids): + def gpu_stats(gpu_ids: List[int]): utilz, memory, temp = None, None, None gpu_stats_map = {} # gpu_id: int -> {"gpu_utilization", "gpu_memory_allocated", "gpu_temp"} + gpu_ids = set(gpu_ids) try: - gpus = getGPUs() - - for i in gpu_ids: - gpu = gpus[i] - gpu_stats_map[i] = { - "gpu_utilization": gpu.load*100, - "gpu_memory_allocated": gpu.memoryUtil*100, - "gpu_temp": gpu.temperature, - # "gpu_power_usage": pynvml.nvmlDeviceGetPowerUsage(handle) / 1000, # in watts - # "gpu_time_spent_accessing_memory": utilz.memory # in ms - } + for gpu in HardwareUtil.get_gpus(): + if gpu.id in gpu_ids: + gpu_stats_map[gpu.id] = { + "gpu_utilization": gpu.load * 100, + "gpu_memory_allocated": gpu.memoryUsed / gpu.memoryTotal * 100, + "gpu_temp": gpu.temperature, + # "gpu_power_usage": pynvml.nvmlDeviceGetPowerUsage(handle) / 1000, # in watts + # "gpu_time_spent_accessing_memory": utilz.memory # in ms + } except Exception as e: logging.error(f"Failed to get GPU stats: {e}") diff --git a/python/fedml/computing/scheduler/comm_utils/file_utils.py b/python/fedml/computing/scheduler/comm_utils/file_utils.py new file mode 100644 index 0000000000..1d8fc6ca83 --- /dev/null +++ b/python/fedml/computing/scheduler/comm_utils/file_utils.py @@ -0,0 +1,13 @@ +import os + + +def find_file_inside_folder(folder_path, file_name): + """ + Recursively search for a file inside a folder and its sub-folders. + return the full path of the file if found, otherwise return None. 
+ """ + for root, dirs, files in os.walk(folder_path): + if file_name in files: + return os.path.join(root, file_name) + + return None diff --git a/python/examples/deploy/quick_start/src/__init__.py b/python/fedml/computing/scheduler/comm_utils/gpu_utils/__init__.py similarity index 100% rename from python/examples/deploy/quick_start/src/__init__.py rename to python/fedml/computing/scheduler/comm_utils/gpu_utils/__init__.py diff --git a/python/fedml/computing/scheduler/comm_utils/gpu_utils/gpu_utils.py b/python/fedml/computing/scheduler/comm_utils/gpu_utils/gpu_utils.py new file mode 100644 index 0000000000..b48a3e85b7 --- /dev/null +++ b/python/fedml/computing/scheduler/comm_utils/gpu_utils/gpu_utils.py @@ -0,0 +1,62 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from enum import Enum, auto +from typing import Optional, List, Dict + +from docker import DockerClient + + +class GPUCardType(Enum): + NVIDIA = auto() + QUALCOMM = auto() + UNKNOWN = auto() + + def __str__(self): + return self.name + + +@dataclass +class GPUCard: + id: int + name: str + driver: str + serial: str + vendor: str + memoryTotal: float + memoryFree: float + memoryUsed: float + memoryUtil: float + load: Optional[float] = 0.0 + device_path: Optional[str] = "" + uuid: Optional[str] = "" + display_mode: Optional[str] = "" + display_active: Optional[str] = "" + temperature: Optional[float] = 0.0 + + +class GPUCardUtil(ABC): + + @classmethod + @abstractmethod + def detect_gpu_card_type(cls) -> Optional[GPUCardType]: + raise NotImplementedError + + @staticmethod + @abstractmethod + def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memory: float) -> List[int]: + raise NotImplementedError + + @staticmethod + @abstractmethod + def get_gpu_cards() -> List[GPUCard]: + raise NotImplementedError + + @staticmethod + @abstractmethod + def get_docker_gpu_device_mapping(gpu_ids: Optional[List[int]], num_gpus: int = 0) -> Optional[Dict]: + raise NotImplementedError + + @staticmethod + @abstractmethod + def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: DockerClient) -> List[int]: + raise NotImplementedError diff --git a/python/fedml/computing/scheduler/comm_utils/gpu_utils/nvidia_utils.py b/python/fedml/computing/scheduler/comm_utils/gpu_utils/nvidia_utils.py new file mode 100644 index 0000000000..a6717de8cb --- /dev/null +++ b/python/fedml/computing/scheduler/comm_utils/gpu_utils/nvidia_utils.py @@ -0,0 +1,66 @@ +import logging +import subprocess +from typing import List, Optional, Dict + +import docker +from docker import types, DockerClient +from GPUtil import GPUtil, GPU + +from fedml.computing.scheduler.comm_utils.gpu_utils.gpu_utils import GPUCard, GPUCardUtil, GPUCardType + + +class NvidiaGPUtil(GPUCardUtil): + + @classmethod + def detect_gpu_card_type(cls) -> Optional[GPUCardType]: + try: + subprocess.check_output(["nvidia-smi"], universal_newlines=True) + return GPUCardType.NVIDIA + except Exception: + return None + + @staticmethod + def get_gpu_cards() -> List[GPUCard]: + return [NvidiaGPUtil.__convert(gpu) for gpu in GPUtil.getGPUs()] + + @staticmethod + def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memory: float) -> List[int]: + # return GPUtil.getAvailable(order=order, limit=limit, maxLoad=max_load, maxMemory=max_memory) + return GPUtil.getAvailable(order='random', limit=limit) + + @staticmethod + def get_docker_gpu_device_mapping(gpu_ids: List[int], num_gpus: int = 0) -> Optional[Dict]: + if gpu_ids is not None 
and len(gpu_ids): + gpu_id_list = list(map(lambda x: str(x), gpu_ids)) + return {"device_requests": [docker.types.DeviceRequest(device_ids=gpu_id_list, capabilities=[["gpu"]])]} + else: + return {"device_requests": [docker.types.DeviceRequest(count=num_gpus, capabilities=[['gpu']])]} + + @staticmethod + def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: DockerClient) -> List[int]: + try: + gpu_ids = docker_client.api.inspect_container(container_name)["HostConfig"]["DeviceRequests"][0]["DeviceIDs"] + return list(map(int, gpu_ids)) + except Exception as e: + logging.error(f"Failed to get GPU IDs: {e}") + pass + return [] + + @staticmethod + def __convert(gpu: GPU) -> GPUCard: + return GPUCard( + id=gpu.id, + name=gpu.name, + driver=gpu.driver, + serial=gpu.serial, + vendor=GPUCardType.NVIDIA.name, + memoryTotal=gpu.memoryTotal, + memoryFree=gpu.memoryFree, + memoryUsed=gpu.memoryUsed, + memoryUtil=gpu.memoryUtil, + load=gpu.load, + uuid=gpu.uuid, + display_mode=gpu.display_mode, + display_active=gpu.display_active, + temperature=gpu.temperature, + ) diff --git a/python/fedml/computing/scheduler/comm_utils/gpu_utils/qualcomm_utils.py b/python/fedml/computing/scheduler/comm_utils/gpu_utils/qualcomm_utils.py new file mode 100644 index 0000000000..13131e362d --- /dev/null +++ b/python/fedml/computing/scheduler/comm_utils/gpu_utils/qualcomm_utils.py @@ -0,0 +1,150 @@ +import logging +import math +import re +import subprocess +import sys +from typing import List, Optional, Dict + +from docker import DockerClient + +from fedml.computing.scheduler.comm_utils.gpu_utils.gpu_utils import GPUCard, GPUCardUtil, GPUCardType + + +class QualcommNPUtil(GPUCardUtil): + NPU_CARD_PATH = "/dev/accel/accel" + + def __init__(self): + sys.path.append("/opt/qti-aic/dev/lib/x86_64/") + + @classmethod + def detect_gpu_card_type(cls) -> Optional[GPUCardType]: + try: + subprocess.check_output(["/opt/qti-aic/tools/qaic-util"], universal_newlines=True) + return GPUCardType.QUALCOMM + except Exception: + return None + + @staticmethod + def get_gpu_cards() -> List[GPUCard]: + return list(QualcommNPUtil.__get_gpu_cards().values()) + + @staticmethod + def __get_gpu_cards() -> Dict[int, GPUCard]: + from qaicrt import Util, QIDList, QDevInfo, QStatus + cards = dict() + util = Util() + status, card_list = util.getDeviceIds() + if status.value == 0: + for card in card_list: + status, card_info = util.getDeviceInfo(card) + if status.value == 0 and card_info.devStatus.value == 1: + gpu_card = QualcommNPUtil.__convert(card_info) + cards[gpu_card.id] = gpu_card + else: + logging.error("Qualcomm Cards Status not Healthy") + return cards + + @staticmethod + def get_available_gpu_card_ids(order: str, limit: int, max_load: float, max_memory: float) -> List[int]: + gpu_cards: List[GPUCard] = QualcommNPUtil.get_gpu_cards() + gpu_cards = list(filter(lambda card: (card.memoryUtil < max_memory and card.load < max_load), gpu_cards)) + if order == 'memory': + gpu_cards.sort(key=lambda card: float('inf') if math.isnan(card.memoryUtil) else card.memoryUtil, reverse=False) + elif order == 'load': + gpu_cards.sort(key=lambda card: float('inf') if math.isnan(card.memoryUtil) else card.load, reverse=False) + else: + raise NotImplementedError(f"Qualcomm utils doesn't have support to compute availability based on {order}. 
" + f"Supported criteria: [memory, load]") + + gpu_cards = gpu_cards[0:min(limit, len(gpu_cards))] + return list(map(lambda card: card.id, gpu_cards)) + + @staticmethod + def get_docker_gpu_device_mapping(gpu_ids: Optional[List[int]], num_gpus: int = 0) -> Optional[Dict]: + if gpu_ids is None or not len(gpu_ids): + return None + + devices = [] + gpu_cards = QualcommNPUtil.__get_gpu_cards() + + for gpu_id in gpu_ids: + if not (gpu_id in gpu_cards and gpu_cards[gpu_id].device_path): + logging.error("Failed to get gpu device mapping for docker") + break + else: + device_path = gpu_cards[gpu_id].device_path + devices.append(f"{device_path}:{device_path}") + + return {"devices": devices} if len(devices) == len(gpu_ids) else None + + @staticmethod + def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: DockerClient) -> List[int]: + gpu_ids = [] + try: + docker_inspect_info = docker_client.api.inspect_container(container_name) + gpu_ids = QualcommNPUtil.__parse_gpu_ids(docker_inspect_info.get("HostConfig", {})) + except Exception as e: + logging.error(f"Failed to get GPU IDs: {e}") + pass + return gpu_ids + + @staticmethod + def __convert(npu) -> GPUCard: + # TODO (alaydshah): Add support for temperature + memory_total = npu.devData.resourceInfo.dramTotal / 1024 + memory_free = npu.devData.resourceInfo.dramFree / 1024 + memory_used = memory_total - memory_free + memory_utilized = float(memory_used) / float(memory_total) + nsp_free = npu.devData.resourceInfo.nspFree + nsp_total = npu.devData.resourceInfo.nspTotal + load = (nsp_total - nsp_free) / nsp_total + + return GPUCard( + id=npu.mhiId, + device_path=npu.name, + name=npu.pciInfo.devicename, + driver=npu.devData.fwQCImageVersionString, + serial=npu.devData.serial, + vendor=GPUCardType.QUALCOMM.name, + memoryTotal=memory_total, + memoryFree=memory_free, + memoryUsed=memory_used, + memoryUtil=memory_utilized, + load=load, + ) + + @staticmethod + def __parse_gpu_ids(host_config: dict) -> List[int]: + devices = host_config.get('Devices', []) + gpu_ids = [] + for device in devices: + gpu_id = QualcommNPUtil.__extract_integer_from_host_path(device.get('PathOnHost', None)) + + # Check explicitly if gpu_id is not None, as gpu_id can be 0, which is a valid value to include. 
+ if gpu_id is not None: + gpu_ids.append(gpu_id) + return gpu_ids + + @staticmethod + def __extract_integer_from_host_path(host_path: str) -> Optional[int]: + if not host_path: + logging.error("Host Path is None; GPU Id extraction Failed") + return None + + npu_card_path = QualcommNPUtil.NPU_CARD_PATH + + # Check if host_path starts with npu_card_path + if host_path.startswith(npu_card_path): + + # Extract the numeric suffix from the host path + suffix = host_path[len(npu_card_path):] # Get the substring after npu_card_path + match = re.match(r'^(\d+)', suffix) # Use regex to match the leading integer + if match: + return int(match.group(1)) # Return the extracted integer + else: + logging.error(f"Failed to extract GPU id from Host Path {host_path}") + else: + logging.error(f"Host Path {host_path} doesn't start with NPU Card Path {npu_card_path}") + + # Return None if extraction fails + return None diff --git a/python/fedml/computing/scheduler/comm_utils/hardware_utils.py b/python/fedml/computing/scheduler/comm_utils/hardware_utils.py new file mode 100644 index 0000000000..8e0763753f --- /dev/null +++ b/python/fedml/computing/scheduler/comm_utils/hardware_utils.py @@ -0,0 +1,95 @@ +import logging +from typing import Optional, List, Dict + +from docker import DockerClient + +from fedml.computing.scheduler.comm_utils.gpu_utils.gpu_utils import GPUCardUtil, GPUCard +from fedml.computing.scheduler.comm_utils.gpu_utils.nvidia_utils import NvidiaGPUtil +from fedml.computing.scheduler.comm_utils.gpu_utils.qualcomm_utils import QualcommNPUtil +from fedml.computing.scheduler.comm_utils.singleton import Singleton + +GPU_CARD_UTILS = [NvidiaGPUtil, QualcommNPUtil] + + +# This function is just for debugging, can be removed at later point +def get_gpu_list_and_realtime_gpu_available_ids() -> (List[dict], List[int]): + gpu_list = HardwareUtil.get_gpus() + gpu_count = len(gpu_list) + realtime_available_gpu_ids = HardwareUtil.get_available_gpu_ids(order='memory', limit=gpu_count, max_load=0.01, + max_memory=0.01) + return gpu_list, realtime_available_gpu_ids + +# This function is just for debugging, can be removed at later point +def trim_unavailable_gpu_ids(gpu_ids) -> List[int]: + # Trim the gpu ids based on the realtime available gpu id list. 
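The `trim_unavailable_gpu_ids` debug helper introduced here boils down to an intersection between the ids a caller asked for and the ids the hardware currently reports as free. A minimal sketch with an explicit two-argument signature (a simplification for illustration; the module's helper fetches the realtime list itself):

```python
from typing import Iterable, List

def trim_unavailable_gpu_ids(requested: Iterable[int], realtime_available: Iterable[int]) -> List[int]:
    """Keep only the requested GPU ids that the hardware still reports as available."""
    # Equivalent to the loop in the patch: anything requested but no longer free
    # is dropped before the scheduler hands the ids to a job.
    return sorted(set(int(gpu_id) for gpu_id in requested) & set(realtime_available))

# Hypothetical example: GPUs 1 and 3 are busy, so only 0 and 2 survive.
assert trim_unavailable_gpu_ids(["0", "1", "2", "3"], [0, 2, 4]) == [0, 2]
```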
+ available_gpu_ids = [int(gpu_id) for gpu_id in gpu_ids] + gpu_list, realtime_available_gpu_ids = get_gpu_list_and_realtime_gpu_available_ids() + unavailable_gpu_ids = list() + + for gpu_id in available_gpu_ids: + if gpu_id not in realtime_available_gpu_ids: + unavailable_gpu_ids.append(gpu_id) + + trimmed_gpu_ids = list(set(available_gpu_ids) - set(unavailable_gpu_ids)) + return trimmed_gpu_ids.copy() + + +class HardwareUtil(metaclass=Singleton): + __gpu_util: Optional[GPUCardUtil] = None + + @classmethod + def __get_util(cls) -> Optional[GPUCardUtil]: + if cls.__gpu_util is not None: + return cls.__gpu_util + + for gpu_util in GPU_CARD_UTILS: + try: + if gpu_util.detect_gpu_card_type() is not None: + cls.__gpu_util = gpu_util() + return cls.__gpu_util + except Exception as e: + pass + + # logging.error("No GPU card detected") + return None + + @staticmethod + def get_gpus() -> List[GPUCard]: + gpu_util = HardwareUtil.__get_util() + cards = gpu_util.get_gpu_cards() if gpu_util is not None else [] + # logging.info(f"hardware_utils Available GPU cards len ---> { len(cards)}") + return cards + + @staticmethod + def get_available_gpu_ids(order: str = "memory", limit: int = 1, max_load: float = 0.01, + max_memory: float = 0.01) -> List[int]: + gpu_util = HardwareUtil.__get_util() + card_ids = gpu_util.get_available_gpu_card_ids(order, limit, max_load, max_memory) if gpu_util is not None else [] + # logging.info(f"hardware_utils get_available_gpu_ids ids ---> {card_ids}, limit ---> {limit}") + return card_ids + + @staticmethod + def get_docker_gpu_device_mapping(gpu_ids: Optional[List[int]], num_gpus: int = 0) -> Optional[Dict]: + gpu_util = HardwareUtil.__get_util() + if gpu_util is not None: + return gpu_util.get_docker_gpu_device_mapping(gpu_ids, num_gpus) + return None + + @staticmethod + def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: DockerClient) -> List[int]: + gpu_ids = [] + gpu_util = HardwareUtil.__get_util() + if gpu_util is not None: + gpu_ids = gpu_util.get_docker_gpu_ids_by_container_name(container_name, docker_client) + return gpu_ids + + +if __name__ == "__main__": + gpus = HardwareUtil.get_gpus() + get_available_gpu_cards = HardwareUtil.get_available_gpu_ids(limit=len(gpus)) + trimmed_gpu_ids = trim_unavailable_gpu_ids(get_available_gpu_cards) + print(trimmed_gpu_ids) + device_mapping = HardwareUtil.get_docker_gpu_device_mapping(get_available_gpu_cards, len(get_available_gpu_cards)) + print(gpus) + print(get_available_gpu_cards) + print(device_mapping) diff --git a/python/fedml/computing/scheduler/comm_utils/job_cleanup.py b/python/fedml/computing/scheduler/comm_utils/job_cleanup.py index ed30c1bf2e..6700b0bc7a 100644 --- a/python/fedml/computing/scheduler/comm_utils/job_cleanup.py +++ b/python/fedml/computing/scheduler/comm_utils/job_cleanup.py @@ -44,6 +44,7 @@ def sync_run_process_gpu(self): ComputeCacheManager.get_instance().get_gpu_cache().get_run_info_sync_lock_key("") ): count = 0 + client_data_interface.FedMLClientDataInterface.get_instance().create_job_table() job_list = client_data_interface.FedMLClientDataInterface.get_instance().get_jobs_from_db() for job in job_list.job_list: count += 1 diff --git a/python/fedml/computing/scheduler/comm_utils/job_monitor.py b/python/fedml/computing/scheduler/comm_utils/job_monitor.py index 5874adfef7..b8237d93ba 100644 --- a/python/fedml/computing/scheduler/comm_utils/job_monitor.py +++ b/python/fedml/computing/scheduler/comm_utils/job_monitor.py @@ -40,6 +40,7 @@ from 
fedml.core.mlops.mlops_runtime_log import MLOpsRuntimeLog from fedml.core.mlops.mlops_utils import MLOpsLoggingUtils from fedml.core.mlops.mlops_runtime_log_daemon import MLOpsRuntimeLogDaemon +from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants from ..scheduler_core.endpoint_sync_protocol import FedMLEndpointSyncProtocol from ..model_scheduler.device_server_constants import ServerConstants @@ -48,6 +49,7 @@ class JobMonitor(Singleton): ENDPOINT_CONTAINER_LOG_PREFIX = "endpoint" TIME_INTERVAL_FOR_INFERENCE_ON_GATEWAY = 60 * 10 + ENDPOINT_CONTAINER_LOG_SUBDIR = "monitor_endpoint_logs" def __init__(self): if not hasattr(self, "endpoint_unavailable_counter"): @@ -104,91 +106,95 @@ def autoscaler_reconcile_after_interval(self): for endpoint_settings in endpoints_settings_list: endpoint_state = endpoint_settings["state"] if endpoint_state == "DEPLOYED" and endpoint_settings["enable_auto_scaling"]: - logging.info(f"After interval, check the autoscaler for async future list." - f"{self.endpoints_autoscale_predict_future}") - # TODO(fedml-dimitris): The policy can be set dynamically or be user specific. - # Set the policy, here we use latency, but other metrics are possible as well, such as qps. - # For more advanced use cases look for the testing scripts under the autoscaler/test directory. - autoscaling_policy_config = \ - { - "current_replicas": int(endpoint_settings["replica_num"]), - "min_replicas": int(endpoint_settings["scale_min"]), - "max_replicas": int(endpoint_settings["scale_max"]), - "queries_per_replica": int(endpoint_settings["target_queries_per_replica"]), - "window_size_secs": int(endpoint_settings["aggregation_window_size_seconds"]), - "scaledown_delay_secs": int(endpoint_settings["scale_down_delay_seconds"]), - } - autoscaling_policy = ConcurrentQueryPolicy(**autoscaling_policy_config) - - e_id, e_name, model_name = endpoint_settings["endpoint_id"], endpoint_settings["endpoint_name"], \ - endpoint_settings["model_name"] - - logging.info(f"Querying the autoscaler for endpoint {e_id} with user settings {endpoint_settings}.") - - # For every endpoint we just update the policy configuration. - autoscaling_policy.min_replicas = endpoint_settings["scale_min"] - autoscaling_policy.max_replicas = endpoint_settings["scale_max"] - # We retrieve a list of replicas for every endpoint. The number - # of running replicas is the length of that list. - current_replicas = len(fedml_model_cache.get_endpoint_replicas_results(e_id)) - autoscaling_policy.current_replicas = current_replicas - logging.info(f"Endpoint {e_id} autoscaling policy: {autoscaling_policy}.") - - scale_op = autoscaler.scale_operation_endpoint( - autoscaling_policy, - str(e_id)) - - new_replicas = current_replicas + scale_op.value - - logging.info(f"Scaling operation {scale_op.value} for endpoint {e_id} .") - logging.info(f"New Replicas {new_replicas} for endpoint {e_id} .") - logging.info(f"Current Replicas {current_replicas} for endpoint {e_id} .") - if current_replicas == new_replicas: - # Basically the autoscaler decided that no scaling operation should take place. 
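The essential change in the autoscaler hunk below is defensive: each endpoint is reconciled inside its own try/except, and the early exits that previously used `return` (no-op scaling, unsupported version, missing token) become `continue`, so one misconfigured endpoint can no longer stall autoscaling for all the others. A stripped-down sketch of that control-flow pattern (the helper names are illustrative, not the module's API):

```python
import logging

def reconcile_endpoints(endpoint_settings_list, reconcile_one):
    """Run reconcile_one() per endpoint; a failure or early exit only skips that endpoint."""
    for settings in endpoint_settings_list:
        try:
            if not settings.get("enable_auto_scaling"):
                continue  # skip this endpoint, keep iterating over the others
            reconcile_one(settings)
        except Exception as e:  # deliberately broad, mirroring the intent of the patch
            logging.error("Autoscaler reconcile failed for endpoint %s: %s",
                          settings.get("endpoint_id"), e)
            continue
```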
- logging.info(f"No scaling operation for endpoint {e_id}.") - return - - # Should scale in / out - curr_version = fedml.get_env_version() - - if curr_version == "release": - mlops_prefix = "https://open.fedml.ai/" - elif curr_version == "test": - mlops_prefix = "https://open-test.fedml.ai/" - else: - logging.error(f"Do not support the version {curr_version}.") - return - autoscale_url_path = "fedmlModelServer/api/v1/endpoint/auto-scale" - url = f"{mlops_prefix}{autoscale_url_path}" - - # Get cached token for authorization of autoscale request - cached_token = fedml_model_cache.get_end_point_token(e_id, e_name, model_name) - if cached_token is None: - logging.error(f"Failed to get the cached token for endpoint {e_id}.") - return - - req_header = { - "Authorization": f"Bearer {cached_token}" - } - req_body = { - "endpointId": int(e_id), - "replicasDesired": int(new_replicas) - } + try: # Should not let one endpoint affect the others + logging.info(f"After interval, check the autoscaler for async future list." + f"{self.endpoints_autoscale_predict_future}") + # TODO(fedml-dimitris): The policy can be set dynamically or be user specific. + # Set the policy, here we use latency, but other metrics are possible as well, such as qps. + # For more advanced use cases look for the testing scripts under the autoscaler/test directory. + autoscaling_policy_config = \ + { + "current_replicas": int(endpoint_settings["replica_num"]), + "min_replicas": int(endpoint_settings["scale_min"]), + "max_replicas": int(endpoint_settings["scale_max"]), + "queries_per_replica": int(endpoint_settings["target_queries_per_replica"]), + "window_size_secs": int(endpoint_settings["aggregation_window_size_seconds"]), + "scaledown_delay_secs": int(endpoint_settings["scale_down_delay_seconds"]), + } + autoscaling_policy = ConcurrentQueryPolicy(**autoscaling_policy_config) + + e_id, e_name, model_name = endpoint_settings["endpoint_id"], endpoint_settings["endpoint_name"], \ + endpoint_settings["model_name"] + + logging.info(f"Querying the autoscaler for endpoint {e_id} with user settings {endpoint_settings}.") + + # For every endpoint we just update the policy configuration. + autoscaling_policy.min_replicas = endpoint_settings["scale_min"] + autoscaling_policy.max_replicas = endpoint_settings["scale_max"] + # We retrieve a list of replicas for every endpoint. The number + # of running replicas is the length of that list. + current_replicas = len(fedml_model_cache.get_endpoint_replicas_results(e_id)) + autoscaling_policy.current_replicas = current_replicas + logging.info(f"Endpoint {e_id} autoscaling policy: {autoscaling_policy}.") + + scale_op = autoscaler.scale_operation_endpoint( + autoscaling_policy, + str(e_id)) + + new_replicas = current_replicas + scale_op.value + + logging.info(f"Scaling operation {scale_op.value} for endpoint {e_id} .") + logging.info(f"New Replicas {new_replicas} for endpoint {e_id} .") + logging.info(f"Current Replicas {current_replicas} for endpoint {e_id} .") + if current_replicas == new_replicas: + # Basically the autoscaler decided that no scaling operation should take place. + logging.info(f"No scaling operation for endpoint {e_id}.") + continue - try: - logging.info(f"Sending the autoscale request to MLOps platform. 
url {url}, " - f"body {req_body}., header {req_header}") - response = requests.post( - url, - headers=req_header, - json=req_body - ) - if response.status_code != 200: - logging.error(f"Failed to send the autoscale request to MLOps platform.") + # Should scale in / out + curr_version = fedml.get_env_version() + + if curr_version == "release": + mlops_prefix = "https://open.fedml.ai/" + elif curr_version == "test": + mlops_prefix = "https://open-test.fedml.ai/" else: - logging.info(f"Successfully sent the autoscale request to MLOps platform.") + logging.error(f"Do not support the version {curr_version}.") + continue + autoscale_url_path = "fedmlModelServer/api/v1/endpoint/auto-scale" + url = f"{mlops_prefix}{autoscale_url_path}" + + # Get cached token for authorization of autoscale request + cached_token = fedml_model_cache.get_end_point_token(e_id, e_name, model_name) + if cached_token is None: + logging.error(f"Failed to get the cached token for endpoint {e_id}.") + continue + + req_header = { + "Authorization": f"Bearer {cached_token}" + } + req_body = { + "endpointId": int(e_id), + "replicasDesired": int(new_replicas) + } + + try: + logging.info(f"Sending the autoscale request to MLOps platform. url {url}, " + f"body {req_body}., header {req_header}") + response = requests.post( + url, + headers=req_header, + json=req_body + ) + if response.status_code != 200: + logging.error(f"Failed to send the autoscale request to MLOps platform.") + else: + logging.info(f"Successfully sent the autoscale request to MLOps platform.") + except Exception as e: + logging.error(f"Failed to send the autoscale request to MLOps platform. {e}") except Exception as e: - logging.error(f"Failed to send the autoscale request to MLOps platform. {e}") + logging.error(f"Error in autoscaler reconcile after interval. 
{e}") + pass return @staticmethod @@ -204,6 +210,9 @@ def monitor_replicas_number(): endpoint_replicas_details = {} if isinstance(endpoint_detail, str): endpoint_replicas_details = json.loads(endpoint_detail) + # TODO: Check out this nested json + if isinstance(endpoint_replicas_details, str): + endpoint_replicas_details = json.loads(endpoint_replicas_details) if "result" in endpoint_replicas_details: endpoint_replica_details = {} @@ -214,16 +223,9 @@ def monitor_replicas_number(): endpoint_replica_details["end_point_id"], 0) + 1 for endpoint_id, num_replica in res_to_mlops.items(): - curr_version = fedml.get_env_version() num_replica_url_path = "fedmlModelServer/api/v1/endpoint/replica-info" - if curr_version == "release": - mlops_prefix = "https://open.fedml.ai/" - elif curr_version == "test": - mlops_prefix = "https://open-test.fedml.ai/" - else: - logging.error(f"Do not support the version {curr_version}.") - return - url = f"{mlops_prefix}{num_replica_url_path}" + mlops_prefix = fedml._get_backend_service() + url = f"{mlops_prefix}/{num_replica_url_path}" cached_token = FedMLModelCache.get_instance().get_end_point_token_with_eid(endpoint_id) if cached_token is None: @@ -344,7 +346,7 @@ def monitor_slave_run_process_status(self): break # Calc the timeout - started_time = int(float(job.started_time)) + started_time = JobMonitor.get_started_time(job) timeout = time.time() - started_time job_type = JobRunnerUtils.parse_job_type(job.running_json) @@ -352,15 +354,17 @@ def monitor_slave_run_process_status(self): continue # Check if all processes of the specific run are exited - run_process_list = client_constants.ClientConstants.get_learning_process_list(job.job_id) - all_run_processes_exited = True if len(run_process_list) <= 0 else False - if all_run_processes_exited: - if not self.released_runs.get(str(job.job_id), False): - self.released_runs[str(job.job_id)] = True - # Release the gpu ids - print( - f"[run/device][{job.job_id}/{job.edge_id}] Release gpu resource when run processes has exited on monioring slave runs periodically.") - JobRunnerUtils.get_instance().release_gpu_ids(job.job_id, job.edge_id) + # FIXME: Proactively release the gpu ids when the run processes have not even started yet as the docker + # image is being pulled + # run_process_list = client_constants.ClientConstants.get_learning_process_list(job.job_id) + # all_run_processes_exited = True if len(run_process_list) <= 0 else False + # if all_run_processes_exited: + # if not self.released_runs.get(str(job.job_id), False): + # self.released_runs[str(job.job_id)] = True + # # Release the gpu ids + # print( + # f"[run/device][{job.job_id}/{job.edge_id}] Release gpu resource when run processes has exited on monioring slave runs periodically.") + # JobRunnerUtils.get_instance().release_gpu_ids(job.job_id, job.edge_id) # Get the timeout threshold timeout_threshold = None @@ -379,8 +383,9 @@ def monitor_slave_run_process_status(self): # If the run processes have exited but run status is not completed and # timeout is out of the range, then release gpu ids and report failed status to the master agent. 
- if all_run_processes_exited and not SchedulerConstants.is_run_completed(job.status) and \ - timeout_threshold is not None and timeout > timeout_threshold: + # if all_run_processes_exited and not SchedulerConstants.is_run_completed(job.status) and \ + # timeout_threshold is not None and timeout > timeout_threshold: + if timeout_threshold is not None and timeout > timeout_threshold: # Report failed status to the master agent mlops.log_training_failed_status( run_id=job.job_id, edge_id=job.edge_id, enable_broadcast=True) @@ -430,6 +435,15 @@ def monitor_slave_run_process_status(self): logging.error(f"Exception when monitoring endpoint process on the slave agent.{traceback.format_exc()}") pass + @staticmethod + def get_started_time(job): + started_time = int(float(job.started_time)) + if started_time <= 0: + started_time = int(float(job.updated_time)) + if started_time <= 0: + started_time = time.time() + return started_time + def monitor_master_run_process_status(self, server_id, device_info_reporter=None): try: ComputeCacheManager.get_instance().set_redis_params() @@ -441,7 +455,7 @@ def monitor_master_run_process_status(self, server_id, device_info_reporter=None break # Calc the timeout - started_time = int(float(job.started_time)) + started_time = JobMonitor.get_started_time(job) timeout = time.time() - started_time # Get the timeout threshold @@ -574,6 +588,15 @@ def monitor_slave_endpoint_status(self): is_endpoint_ready = self._check_and_reset_endpoint_status( job.job_id, job.edge_id, deployment_result, only_check_inference_ready_status=True) + # [Hotfix] Under high-concurrency situation, the ready endpoint might not be available + # But the container is in health state + # In this case, we need to have an exact 503 code, instead of timeout to decide to restart + # TODO(Raphael): Split the /ready endpoint and predict endpoint traffic + if not self._lenient_check_replica_ready(deployment_result): + is_endpoint_ready = False + else: + is_endpoint_ready = True + # Get endpoint container name prefix, prepare for restart endpoint_container_name_prefix = \ (device_client_constants.ClientConstants.get_endpoint_container_name( @@ -689,7 +712,7 @@ def monitor_slave_endpoint_status(self): endpoint_name = endpoint_json.get("end_point_name", None) device_ids = endpoint_json.get("device_ids", []) - started_time = int(float(job.started_time)) + started_time = JobMonitor.get_started_time(job) timeout = time.time() - started_time if timeout > SchedulerConstants.ENDPOINT_DEPLOYMENT_DEPLOYING_TIMEOUT: print(f"[Worker][{job.job_id}:{job.edge_id}] Due to timeout, " @@ -736,6 +759,46 @@ def monitor_slave_endpoint_status(self): except Exception as e: pass + @staticmethod + def _lenient_check_replica_ready(deployment_result): + """ + Double-check the replica's liveness using /ready api: + if 200 -> return True + [Critical] if timeout -> Could be under high pressure -> return True + if HTTP_202_ACCEPTED -> unhealthy -> return False + """ + result_json = deployment_result + inference_url = result_json.get("model_url", None) + liveliness_check = result_json.get("model_metadata", {}).get("liveliness_check", None) + readiness_check = result_json.get("model_metadata", {}).get("readiness_check", None) + + if liveliness_check: + if liveliness_check == ClientConstants.LIVENESS_PROBE_DEFAULT: + liveliness_check = readiness_check # Follow the readiness check pattern + if not isinstance(liveliness_check, dict): + logging.warning(f"Healthiness check is not a dict. 
{liveliness_check}") + return True + if "path" not in liveliness_check: + logging.warning(f"Healthiness check does not have path. {liveliness_check}") + return True + response_ok = asyncio.run(FedMLHttpInference.is_inference_ready( + inference_url, timeout=SchedulerConstants.ENDPOINT_INFERENCE_READY_TIMEOUT, + path=liveliness_check["path"])) + if response_ok is None: + # This means the server return 202 + return False + return True + + # Make a curl get to inference_url/ready with timeout 5s + # TODO(Raphael): Also support PROXY and MQTT to check the readiness + response_ok = asyncio.run(FedMLHttpInference.is_inference_ready(inference_url, timeout=5)) + if response_ok is None: + # This means the server return 202 + return False + + # 200 or Timeout + return True + def _check_and_reset_endpoint_status( self, endpoint_id, device_id, deployment_result, only_check_inference_ready_status=False, should_release_gpu_ids=False @@ -761,6 +824,7 @@ def _check_and_reset_endpoint_status( if self.endpoint_unavailable_counter.get(str(endpoint_id)) is None: self.endpoint_unavailable_counter[str(endpoint_id)] = 0 + if not response_ok: self.endpoint_unavailable_counter[str(endpoint_id)] += 1 else: @@ -1011,8 +1075,11 @@ def monitor_endpoint_logs(self): model_version = model_config.get("model_version", None) endpoint_name = endpoint_json.get("end_point_name", None) + log_file_dir = os.path.join( + device_client_constants.ClientConstants.get_log_file_dir(), + JobMonitor.ENDPOINT_CONTAINER_LOG_SUBDIR) log_file_path, program_prefix = MLOpsLoggingUtils.build_log_file_path_with_run_params( - job.job_id, int(job.edge_id), device_server_constants.ServerConstants.get_log_file_dir(), is_server=True, + job.job_id, int(job.edge_id), log_file_dir, is_server=False, log_file_prefix=JobMonitor.ENDPOINT_CONTAINER_LOG_PREFIX, ) @@ -1086,8 +1153,9 @@ def monitor_endpoint_logs(self): nano_second_str = container_time.split(".")[1][:9] t_datetime_obj = isoparse(container_time) - if t_sec_offset is not None: - t_datetime_obj = t_datetime_obj + datetime.timedelta(seconds=t_sec_offset) + # ISSUE: this will cause the timestamp is not correct. 
+ #if t_sec_offset is not None: + # t_datetime_obj = t_datetime_obj + datetime.timedelta(seconds=t_sec_offset) except Exception as e: logging.error(f"Exception when parsing the container log time {e}") t_datetime_obj = datetime.datetime.now() diff --git a/python/fedml/computing/scheduler/comm_utils/job_utils.py b/python/fedml/computing/scheduler/comm_utils/job_utils.py index 384cbacd1d..b1ecd9b11b 100644 --- a/python/fedml/computing/scheduler/comm_utils/job_utils.py +++ b/python/fedml/computing/scheduler/comm_utils/job_utils.py @@ -2,7 +2,6 @@ import os import platform import traceback -import GPUtil import docker import fedml from docker import errors, DockerClient @@ -87,16 +86,37 @@ def occupy_gpu_ids(self, run_id, request_gpu_num, device_id, inner_id=None, # Get the available GPU list, FEDML_GLOBAL_DEVICE_AVAILABLE_GPU_IDS_TAG-${device_id} available_gpu_ids = ComputeCacheManager.get_instance().get_gpu_cache().get_device_available_gpu_ids( device_id) + logging.info(f"Available GPU Ids fetched from cache: {available_gpu_ids}") logging.info(f"Check worker({device_id})'s realtime gpu availability in DB" f" for run {run_id}: {available_gpu_ids}") + + # Get realtime GPU availability list from the system + realtime_available_gpu_ids = JobRunnerUtils.get_realtime_gpu_available_ids().copy() + logging.info(f"Cache not set yet, fetching realtime available GPU Ids: {realtime_available_gpu_ids}") # If the available GPU list is not in the cache, set it to the current system available GPU list if available_gpu_ids is None: # Get realtime GPU availability list from the system - available_gpu_ids = JobRunnerUtils.get_realtime_gpu_available_ids().copy() + available_gpu_ids = realtime_available_gpu_ids else: available_gpu_ids = JobRunnerUtils.trim_unavailable_gpu_ids(available_gpu_ids) + logging.info(f"Trimmed available GPU Ids: {available_gpu_ids}") + + initial_available_gpu_ids = ComputeCacheManager.get_instance().get_gpu_cache().get_device_initial_available_gpu_ids( + device_id) + # calculate the difference between realtime_available_gpu_ids and initial_available_gpu_ids + # if the difference is not empty, then add to available gpu ids + diff_gpu_ids = list(set(realtime_available_gpu_ids) - set(initial_available_gpu_ids)) + if diff_gpu_ids: + available_gpu_ids.extend(diff_gpu_ids) + available_gpu_ids = list(set(available_gpu_ids)) + available_gpu_ids.sort() + logging.info(f"Device {device_id} available GPU ids is changed because of the system gpu resource change, " + f"initial available gpu ids: {initial_available_gpu_ids}, " + f"realtime available gpu ids: {realtime_available_gpu_ids}, " + f"diff gpu ids: {diff_gpu_ids}, " + f"new available gpu ids: {available_gpu_ids}") # Get the matched gpu ids string by the request gpu num cuda_visible_gpu_ids_str, matched_gpu_num = JobRunnerUtils.request_gpu_ids(request_gpu_num, @@ -120,6 +140,8 @@ def occupy_gpu_ids(self, run_id, request_gpu_num, device_id, inner_id=None, ComputeCacheManager.get_instance().get_gpu_cache().set_device_available_gpu_ids( device_id, available_gpu_ids) + + logging.info(f"Updated cache with following available gpu ids: {available_gpu_ids}") # For a single run, could be scale up. 
So if existed such a key, should extend, not replace existed_gpu_nums = ComputeCacheManager.get_instance().get_gpu_cache().get_device_run_num_gpus( @@ -159,23 +181,8 @@ def occupy_gpu_ids(self, run_id, request_gpu_num, device_id, inner_id=None, @staticmethod def search_and_refresh_available_gpu_ids(available_gpu_ids): trimmed_gpu_ids = JobRunnerUtils.trim_unavailable_gpu_ids(available_gpu_ids) - # if len(trimmed_gpu_ids) <= 0: - # available_gpu_ids = JobRunnerUtils.balance_available_gpu_ids(trimmed_gpu_ids) return trimmed_gpu_ids - @staticmethod - def balance_available_gpu_ids(available_gpu_ids): - gpu_list, realtime_available_gpu_ids = JobRunnerUtils.get_gpu_list_and_realtime_gpu_available_ids() - available_gpu_ids = realtime_available_gpu_ids - if len(available_gpu_ids) <= 0: - for gpu in gpu_list: - gpu = GPUtil.GPU(gpu) - if gpu.memoryUtil > 0.8: - continue - available_gpu_ids.append(gpu.id) - - return available_gpu_ids.copy() - @staticmethod def request_gpu_ids(request_gpu_num, available_gpu_ids): available_gpu_count = len(available_gpu_ids) @@ -330,6 +337,9 @@ def get_available_gpu_id_list(device_id): # Get realtime GPU availability list from the system gpu_ids = JobRunnerUtils.get_realtime_gpu_available_ids().copy() ComputeCacheManager.get_instance().get_gpu_cache().set_device_available_gpu_ids(device_id, gpu_ids) + # Set the initial available GPU ids to the cache, use to check if the device all available GPU ids is changed because of the system resource change + ComputeCacheManager.get_instance().get_gpu_cache().set_device_initial_available_gpu_ids(device_id, gpu_ids) + logging.info(f"Set device {device_id} initial available GPU ids: {gpu_ids}") available_gpu_ids = gpu_ids return available_gpu_ids @@ -348,6 +358,9 @@ def reset_available_gpu_id_list(device_id): current_available_gpu_ids = JobRunnerUtils.get_realtime_gpu_available_ids().copy() ComputeCacheManager.get_instance().get_gpu_cache().set_device_available_gpu_ids(device_id, current_available_gpu_ids) + # Set the initial available GPU ids to the cache, use to check if the device all available GPU ids is changed because of the system resource change + ComputeCacheManager.get_instance().get_gpu_cache().set_device_initial_available_gpu_ids(device_id, current_available_gpu_ids) + gpu_list = sys_utils.get_gpu_list() ComputeCacheManager.get_instance().get_gpu_cache().set_device_total_num_gpus(device_id, len(gpu_list)) except Exception as e: @@ -360,6 +373,7 @@ def get_realtime_gpu_available_ids(): gpu_list = sys_utils.get_gpu_list() gpu_count = len(gpu_list) realtime_available_gpu_ids = sys_utils.get_available_gpu_id_list(limit=gpu_count) + logging.info(f"get_available_gpu_id_list limit:{gpu_count}, available_gpu_ids:{realtime_available_gpu_ids}") return realtime_available_gpu_ids @staticmethod @@ -583,11 +597,21 @@ def get_run_container_name(run_id: int) -> str: container_name = f"{container_prefix}__{run_id}" return container_name + @staticmethod + def docker_client_exists() -> bool: + try: + client = docker.from_env() + client.ping() + return True + except docker.errors.DockerException: + return False + @staticmethod def get_docker_client(docker_args: DockerArgs) -> DockerClient: try: client = docker.from_env() - client.login(username=docker_args.username, password=docker_args.password, registry=docker_args.registry) + if docker_args.username != "" and docker_args.registry != "": + client.login(username=docker_args.username, password=docker_args.password, registry=docker_args.registry) except Exception as e: raise 
Exception(f"Failed to connect to the docker daemon, please ensure that you have " f"installed Docker Desktop or Docker Engine, and the docker is running. Exception {e}") @@ -727,6 +751,9 @@ def parse_job_type(running_json): job_type = job_yaml.get("job_type", None) job_type = job_yaml.get("task_type", SchedulerConstants.JOB_TASK_TYPE_TRAIN) if job_type is None else job_type + model_config = running_json_obj.get("model_config", None) + if model_config is not None: + job_type = SchedulerConstants.JOB_TASK_TYPE_DEPLOY return job_type @staticmethod diff --git a/python/fedml/computing/scheduler/comm_utils/network_util.py b/python/fedml/computing/scheduler/comm_utils/network_util.py new file mode 100644 index 0000000000..b03b0428d0 --- /dev/null +++ b/python/fedml/computing/scheduler/comm_utils/network_util.py @@ -0,0 +1,29 @@ +import os +from urllib.parse import urlparse +from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants + + +def return_this_device_connectivity_type() -> str: + """ + Return -> "http" | "http_proxy" |"mqtt" + """ + # Get the environmental variable's value and convert to lower case. + env_conn_type = os.getenv(ClientConstants.ENV_CONNECTION_TYPE_KEY, "").lower() + if env_conn_type in [ + ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP, + ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY, + ClientConstants.WORKER_CONNECTIVITY_TYPE_MQTT + ]: + return env_conn_type + else: + return ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT + + +def replace_url_with_path(url: str, path: str) -> str: + """ + Replace the path of the URL with the given path. + """ + if path is None: + return url + url_parsed = urlparse(url) + return f"{url_parsed.scheme}://{url_parsed.netloc}/{path}" diff --git a/python/fedml/computing/scheduler/comm_utils/run_process_utils.py b/python/fedml/computing/scheduler/comm_utils/run_process_utils.py index e64e708fb5..a84b078b54 100644 --- a/python/fedml/computing/scheduler/comm_utils/run_process_utils.py +++ b/python/fedml/computing/scheduler/comm_utils/run_process_utils.py @@ -14,8 +14,10 @@ def get_run_process_prefix(prefix, run_id): return f"{prefix}-run@{run_id}@pid@" @staticmethod - def cleanup_run_process(run_id, data_dir, info_dir, - info_file_prefix=SchedulerConstants.RUN_PROCESS_TYPE_RUNNER_PROCESS): + def cleanup_run_process( + run_id, data_dir, info_dir, + info_file_prefix=SchedulerConstants.RUN_PROCESS_TYPE_RUNNER_PROCESS, not_kill_subprocess=False + ): try: local_pkg_data_dir = data_dir run_process_dir = os.path.join(local_pkg_data_dir, info_dir) @@ -43,12 +45,13 @@ def cleanup_run_process(run_id, data_dir, info_dir, try: process = psutil.Process(int(process_id)) - child_processes = process.children(recursive=True) - for sub_process in child_processes: - if platform.system() == 'Windows': - os.system("taskkill /PID {} /T /F".format(sub_process.pid)) - else: - os.kill(sub_process.pid, signal.SIGKILL) + if not not_kill_subprocess: + child_processes = process.children(recursive=True) + for sub_process in child_processes: + if platform.system() == 'Windows': + os.system("taskkill /PID {} /T /F".format(sub_process.pid)) + else: + os.kill(sub_process.pid, signal.SIGKILL) if process is not None: if platform.system() == 'Windows': @@ -163,26 +166,39 @@ def generate_yaml_doc(run_config_object, yaml_file): @staticmethod def get_pid_from_cmd_line(cmd_line, break_on_first=True): ret_pids = list() - pids = psutil.process_iter() - for pid in pids: - try: - for cmd in pid.cmdline(): - if cmd.find(cmd_line) != -1: - is_running = 
False - try: - process = psutil.Process(pid.pid) - if process.status() == psutil.STATUS_RUNNING or \ - process.status() == psutil.STATUS_SLEEPING or \ - process.status() == psutil.STATUS_IDLE: - is_running = True - except Exception as e: - pass - if is_running: - ret_pids.append(pid.pid) - if break_on_first: - return ret_pids - except Exception as e: - pass + try: + for pid in psutil.process_iter(): + try: + try: + _ = pid.as_dict(attrs=['cpu_times', 'name', 'pid', 'status']) + except psutil.ZombieProcess: + # Filter out zombie processes + continue + except psutil.NoSuchProcess: + continue + + for cmd in pid.cmdline(): + if cmd.find(cmd_line) != -1: + is_running = False + try: + process = psutil.Process(pid.pid) + if process.status() == psutil.STATUS_RUNNING or \ + process.status() == psutil.STATUS_SLEEPING or \ + process.status() == psutil.STATUS_IDLE: + is_running = True + except Exception as e: + print(f"Error in get_pid_from_cmd_line inner loop: {e}") + pass + if is_running: + ret_pids.append(pid.pid) + if break_on_first: + return ret_pids + except Exception as e: + # print(f"Error in get_pid_from_cmd_line inner loop: {e}") + continue + except Exception as e: + print(f"Error in get_pid_from_cmd_line outer loop: {e}") + pass return ret_pids diff --git a/python/fedml/computing/scheduler/model_scheduler/autoscaler/utils/singleton.py b/python/fedml/computing/scheduler/comm_utils/singleton.py similarity index 56% rename from python/fedml/computing/scheduler/model_scheduler/autoscaler/utils/singleton.py rename to python/fedml/computing/scheduler/comm_utils/singleton.py index 5c76acea97..dd403965c1 100644 --- a/python/fedml/computing/scheduler/model_scheduler/autoscaler/utils/singleton.py +++ b/python/fedml/computing/scheduler/comm_utils/singleton.py @@ -1,3 +1,6 @@ +import threading + + class Singleton(type): """ @@ -8,8 +11,14 @@ class Singleton(type): """ _instances = {} + # For thread safety + _lock = threading.Lock() def __call__(cls, *args, **kwargs): if cls not in cls._instances: - cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) + with cls._lock: + # Another thread might have created the instance before the lock was acquired. + # So check again if the instance is already created. 
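The double-checked pattern below avoids taking the lock on every construction while still guaranteeing a single instance under concurrency: the lock-free outer check is the fast path, and the re-check under the lock closes the race window. A small usage sketch (the `Config` class is hypothetical and exists only to exercise the metaclass; the import path is the one introduced by this patch):

```python
import threading

from fedml.computing.scheduler.comm_utils.singleton import Singleton  # module path per this patch

class Config(metaclass=Singleton):
    """Hypothetical consumer used only to exercise the metaclass."""
    def __init__(self):
        self.value = object()

instances = []

def make():
    instances.append(Config())

threads = [threading.Thread(target=make) for _ in range(8)]
for t in threads:
    t.start()
for t in threads:
    t.join()

# Every thread observed the same object, even if they raced on the first construction.
assert all(inst is instances[0] for inst in instances)
```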
+ if cls not in cls._instances: + cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) return cls._instances[cls] diff --git a/python/fedml/computing/scheduler/comm_utils/sys_utils.py b/python/fedml/computing/scheduler/comm_utils/sys_utils.py index 64313b0864..6dbef9bde3 100644 --- a/python/fedml/computing/scheduler/comm_utils/sys_utils.py +++ b/python/fedml/computing/scheduler/comm_utils/sys_utils.py @@ -10,6 +10,7 @@ import psutil import yaml +from fedml.computing.scheduler.comm_utils.hardware_utils import HardwareUtil from fedml.computing.scheduler.comm_utils.yaml_utils import load_yaml_config import json from urllib import request @@ -18,9 +19,6 @@ from packaging import version import sys import subprocess -import GPUtil - -from fedml.computing.scheduler.slave.client_constants import ClientConstants FETAL_ERROR_START_CODE = 128 @@ -95,7 +93,7 @@ def get_sys_runner_info(): pass try: - gpus = GPUtil.getGPUs() + gpus = HardwareUtil.get_gpus() memory_total = 0.0 memory_free = 0.0 for gpu in gpus: @@ -105,9 +103,11 @@ def get_sys_runner_info(): gpu_available_mem = "{:.1f} G".format(memory_free / 1024.0) gpu_total_mem = "{:.1f}G".format(memory_total / 1024.0) gpu_count = len(gpus) - gpu_vendor = "nvidia" + if gpu_count: + gpu_vendor = gpus[0].vendor + gpu_device_name = gpus[0].name - gpu_device_name = torch.cuda.get_device_name(0) + # gpu_device_name = torch.cuda.get_device_name(0) gpu_info = gpu_device_name except: pass @@ -168,7 +168,7 @@ def get_gpu_list(): return ret_gpu_list[0:simulation_gpu_count] - gpu_list = GPUtil.getGPUs() + gpu_list = HardwareUtil.get_gpus() ret_gpu_list = list() for gpu in gpu_list: ret_gpu_item = {"ID": gpu.id, "uuid": gpu.uuid, "load": gpu.load, @@ -189,7 +189,8 @@ def get_available_gpu_id_list(limit=1) -> List[int]: available_gpu_ids.append(count) return available_gpu_ids[0:simulation_gpu_count] - gpu_available_list = GPUtil.getAvailable(order='memory', limit=limit, maxLoad=0.01, maxMemory=0.01) + gpu_available_list = HardwareUtil.get_available_gpu_ids(order='memory', limit=limit, max_load=0.01, + max_memory=0.01) return gpu_available_list @@ -219,9 +220,10 @@ def get_gpu_count_vendor(): gpu_count = 0 gpu_vendor = "" try: - gpus = GPUtil.getGPUs() + gpus = HardwareUtil.get_gpus() gpu_count = len(gpus) - gpu_vendor = "nvidia" + if gpu_count: + gpu_vendor = gpus[0].vendor except: pass @@ -299,249 +301,303 @@ def save_login_process(runner_home_dir, runner_info_dir, edge_process_id): def cleanup_all_fedml_client_learning_processes(): # Cleanup all fedml client learning processes. 
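The cleanup helpers that follow are all reshaped the same way: the `psutil.process_iter()` loop is wrapped in an outer try/except, per-process failures print a message instead of being swallowed silently, and processes that vanish mid-iteration are skipped. A generic sketch of that hardened iteration (the predicate and action callables are illustrative):

```python
import psutil

def for_each_matching_process(matches_cmdline, act_on_pid):
    """Iterate over processes defensively, tolerating zombies and races with exiting processes."""
    try:
        for proc in psutil.process_iter():
            try:
                info = proc.as_dict(attrs=["pid", "name", "cmdline"])
                if any(matches_cmdline(str(cmd)) for cmd in (info.get("cmdline") or [])):
                    act_on_pid(info["pid"])
            except (psutil.ZombieProcess, psutil.NoSuchProcess, psutil.AccessDenied):
                continue  # the process vanished, is a zombie, or is off limits; skip it
            except Exception as e:
                print(f"Failed to act on process {proc.pid}: {e}")
                continue
    except Exception as e:
        print(f"Failed to enumerate processes: {e}")

# Hypothetical dry-run usage: report matching pids without killing anything.
for_each_matching_process(lambda cmd: "fedml" in cmd, lambda pid: print("would clean up", pid))
```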
- for process in psutil.process_iter(): - try: - pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) - found_learning_process = False - found_client_process = False - for cmd in pinfo["cmdline"]: - if str(cmd).find("fedml_config.yaml") != -1: - found_learning_process = True - - if str(cmd).find("client") != -1: - found_client_process = True - - if found_learning_process and found_client_process: - # click.echo("find client learning process at {}.".format(process.pid)) - if platform.system() == 'Windows': - os.system("taskkill /PID {} /T /F".format(process.pid)) - else: - os.killpg(os.getpgid(process.pid), signal.SIGKILL) - except Exception as e: - pass + try: + for process in psutil.process_iter(): + try: + pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) + found_learning_process = False + found_client_process = False + for cmd in pinfo["cmdline"]: + if str(cmd).find("fedml_config.yaml") != -1: + found_learning_process = True + + if str(cmd).find("client") != -1: + found_client_process = True + + if found_learning_process and found_client_process: + # click.echo("find client learning process at {}.".format(process.pid)) + if platform.system() == 'Windows': + os.system("taskkill /PID {} /T /F".format(process.pid)) + else: + os.killpg(os.getpgid(process.pid), signal.SIGKILL) + except Exception as e: + print(f"Failed to cleanup the client learning process due to {e}.") + pass + except Exception as e: + print(f"Failed to cleanup the client learning process due to {e}.") + pass def cleanup_all_fedml_client_diagnosis_processes(): # Cleanup all fedml client learning processes. - for process in psutil.process_iter(): - try: - pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) - found_client_diagnosis_process = False - for cmd in pinfo["cmdline"]: - if str(cmd).find("client_diagnosis") != -1: - found_client_diagnosis_process = True - - if found_client_diagnosis_process: - # click.echo("find client diagnosis process at {}.".format(process.pid)) - if platform.system() == 'Windows': - os.system("taskkill /PID {} /T /F".format(process.pid)) - else: - os.killpg(os.getpgid(process.pid), signal.SIGKILL) - except Exception as e: - pass + try: + for process in psutil.process_iter(): + try: + pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) + found_client_diagnosis_process = False + for cmd in pinfo["cmdline"]: + if str(cmd).find("client_diagnosis") != -1: + found_client_diagnosis_process = True + + if found_client_diagnosis_process: + # click.echo("find client diagnosis process at {}.".format(process.pid)) + if platform.system() == 'Windows': + os.system("taskkill /PID {} /T /F".format(process.pid)) + else: + os.killpg(os.getpgid(process.pid), signal.SIGKILL) + except Exception as e: + print(f"Failed to cleanup the client diagnosis process due to {e}.") + pass + except Exception as e: + print(f"Failed to cleanup the client diagnosis process due to {e}.") + pass def cleanup_all_fedml_client_login_processes(login_program, clean_process_group=True): # Cleanup all fedml client login processes. 
- for process in psutil.process_iter(): - try: - pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) - for cmd in pinfo["cmdline"]: - if str(cmd).find(login_program) != -1: - if os.path.basename(cmd) == login_program: - # click.echo("find client login process at {}.".format(process.pid)) - if platform.system() == "Windows": - os.system("taskkill /PID {} /T /F".format(process.pid)) - else: - os.kill(process.pid, signal.SIGKILL) - if clean_process_group: - os.killpg(os.getpgid(process.pid), signal.SIGKILL) - except Exception as e: - pass + try: + for process in psutil.process_iter(): + try: + pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) + for cmd in pinfo["cmdline"]: + if str(cmd).find(login_program) != -1: + if os.path.basename(cmd) == login_program: + # click.echo("find client login process at {}.".format(process.pid)) + if platform.system() == "Windows": + os.system("taskkill /PID {} /T /F".format(process.pid)) + else: + os.kill(process.pid, signal.SIGKILL) + if clean_process_group: + os.killpg(os.getpgid(process.pid), signal.SIGKILL) + except Exception as e: + print(f"Failed to cleanup the client login process due to {e}.") + pass + except Exception as e: + print(f"Failed to cleanup the client login process since psutil.process_iter() failed.") + pass def cleanup_all_fedml_server_learning_processes(): # Cleanup all fedml server learning processes. - for process in psutil.process_iter(): - try: - pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) - found_learning_process = False - found_server_process = False - for cmd in pinfo["cmdline"]: - if str(cmd).find("fedml_config.yaml") != -1: - found_learning_process = True - - if str(cmd).find("server") != -1: - found_server_process = True - - if found_learning_process and found_server_process: - # click.echo("find server learning process at {}.".format(process.pid)) - if platform.system() == 'Windows': - os.system("taskkill /PID {} /T /F".format(process.pid)) - else: - os.killpg(os.getpgid(process.pid), signal.SIGKILL) - except Exception as e: - pass + try: + for process in psutil.process_iter(): + try: + pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) + found_learning_process = False + found_server_process = False + for cmd in pinfo["cmdline"]: + if str(cmd).find("fedml_config.yaml") != -1: + found_learning_process = True + + if str(cmd).find("server") != -1: + found_server_process = True + + if found_learning_process and found_server_process: + # click.echo("find server learning process at {}.".format(process.pid)) + if platform.system() == 'Windows': + os.system("taskkill /PID {} /T /F".format(process.pid)) + else: + os.killpg(os.getpgid(process.pid), signal.SIGKILL) + except Exception as e: + print(f"Failed to cleanup the server learning process due to {e}.") + pass + except Exception as e: + print(f"Failed to cleanup the server learning process due to {e}.") + pass def cleanup_all_fedml_client_api_processes(kill_all=False, is_model_device=False): # Cleanup all fedml client api processes. 
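+    # Depending on is_model_device, this matches either the model scheduler client API
+    # ("model_scheduler.device_client_api:api") or the slave client API
+    # ("slave.client_api:api"); kill_all decides whether the whole process group or
+    # only the matched pid is killed on non-Windows systems.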
- for process in psutil.process_iter(): - try: - pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) - find_api_process = False - for cmd in pinfo["cmdline"]: - if is_model_device: - if str(cmd).find("model_scheduler.device_client_api:api") != -1: - find_api_process = True - else: - if str(cmd).find("slave.client_api:api") != -1: - find_api_process = True + try: + for process in psutil.process_iter(): + try: + pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) + find_api_process = False + for cmd in pinfo["cmdline"]: + if is_model_device: + if str(cmd).find("model_scheduler.device_client_api:api") != -1: + find_api_process = True + else: + if str(cmd).find("slave.client_api:api") != -1: + find_api_process = True - if find_api_process: - # click.echo("find client api process at {}.".format(process.pid)) - if platform.system() == 'Windows': - os.system("taskkill /PID {} /T /F".format(process.pid)) - else: - if kill_all: - os.killpg(os.getpgid(process.pid), signal.SIGKILL) + if find_api_process: + # click.echo("find client api process at {}.".format(process.pid)) + if platform.system() == 'Windows': + os.system("taskkill /PID {} /T /F".format(process.pid)) else: - os.kill(process.pid, signal.SIGKILL) - except Exception as e: - pass + if kill_all: + os.killpg(os.getpgid(process.pid), signal.SIGKILL) + else: + os.kill(process.pid, signal.SIGKILL) + except Exception as e: + print(f"Failed to cleanup the client api process due to {e}.") + pass + except Exception as e: + print(f"Failed to cleanup the client api process due to {e}.") + pass def cleanup_all_fedml_server_api_processes(kill_all=False, is_model_device=False): # Cleanup all fedml server api processes. - for process in psutil.process_iter(): - try: - pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) - find_api_process = False - for cmd in pinfo["cmdline"]: - if is_model_device: - if str(cmd).find("model_scheduler.device_server_api:api") != -1: - find_api_process = True - - if str(cmd).find("model_scheduler.device_model_inference:api") != -1: - find_api_process = True - else: - if str(cmd).find("master.server_api:api") != -1: - find_api_process = True - - if find_api_process: - # click.echo("find server api process at {}.".format(process.pid)) - if platform.system() == 'Windows': - os.system("taskkill /PID {} /T /F".format(process.pid)) - else: - if kill_all: - os.killpg(os.getpgid(process.pid), signal.SIGKILL) + try: + for process in psutil.process_iter(): + try: + pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) + find_api_process = False + for cmd in pinfo["cmdline"]: + if is_model_device: + if str(cmd).find("model_scheduler.device_server_api:api") != -1: + find_api_process = True + + if str(cmd).find("model_scheduler.device_model_inference:api") != -1: + find_api_process = True else: - os.kill(process.pid, signal.SIGKILL) - except Exception as e: - pass + if str(cmd).find("master.server_api:api") != -1: + find_api_process = True + if find_api_process: + # click.echo("find server api process at {}.".format(process.pid)) + if platform.system() == 'Windows': + os.system("taskkill /PID {} /T /F".format(process.pid)) + else: + if kill_all: + os.killpg(os.getpgid(process.pid), signal.SIGKILL) + else: + os.kill(process.pid, signal.SIGKILL) + except Exception as e: + print(f"Failed to cleanup the server api process due to {e}.") + pass + except Exception as e: + print(f"Failed to cleanup the server api process due to {e}.") + pass def cleanup_all_fedml_server_login_processes(login_program, 
clean_process_group=False): # Cleanup all fedml client login processes. - for process in psutil.process_iter(): - try: - pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) - for cmd in pinfo["cmdline"]: - if str(cmd).find(login_program) != -1: - if os.path.basename(cmd) == login_program: - # click.echo("find server login process at {}.".format(process.pid)) - if platform.system() == 'Windows': - os.system("taskkill /PID {} /T /F".format(process.pid)) - else: - os.kill(process.pid, signal.SIGKILL) - if clean_process_group: - os.killpg(os.getpgid(process.pid), signal.SIGKILL) - except Exception as e: - pass + try: + for process in psutil.process_iter(): + try: + pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) + for cmd in pinfo["cmdline"]: + if str(cmd).find(login_program) != -1: + if os.path.basename(cmd) == login_program: + # click.echo("find server login process at {}.".format(process.pid)) + if platform.system() == 'Windows': + os.system("taskkill /PID {} /T /F".format(process.pid)) + else: + os.kill(process.pid, signal.SIGKILL) + if clean_process_group: + os.killpg(os.getpgid(process.pid), signal.SIGKILL) + except Exception as e: + print(f"Failed to cleanup the server login process due to {e}.") + pass + except Exception as e: + print(f"Failed to cleanup the server login process due to {e}.") + pass def cleanup_all_bootstrap_processes(bootstrap_program, clean_process_group=False): # Cleanup all fedml bootstrap processes. - for process in psutil.process_iter(): - try: - pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) - for cmd in pinfo["cmdline"]: - if str(cmd).find(bootstrap_program) != -1: - if os.path.basename(cmd) == bootstrap_program: - # click.echo("find server login process at {}.".format(process.pid)) - if platform.system() == 'Windows': - os.system("taskkill /PID {} /T /F".format(process.pid)) - else: - os.kill(process.pid, signal.SIGKILL) - if clean_process_group: - os.killpg(os.getpgid(process.pid), signal.SIGKILL) - except Exception as e: - pass + try: + for process in psutil.process_iter(): + try: + pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) + for cmd in pinfo["cmdline"]: + if str(cmd).find(bootstrap_program) != -1: + if os.path.basename(cmd) == bootstrap_program: + # click.echo("find server login process at {}.".format(process.pid)) + if platform.system() == 'Windows': + os.system("taskkill /PID {} /T /F".format(process.pid)) + else: + os.kill(process.pid, signal.SIGKILL) + if clean_process_group: + os.killpg(os.getpgid(process.pid), signal.SIGKILL) + except Exception as e: + print(f"Failed to cleanup the bootstrap process due to {e}.") + pass + except Exception as e: + print(f"Failed to cleanup the bootstrap process due to {e}.") + pass def cleanup_model_monitor_processes(run_id, end_point_name, model_id, model_name, model_version): # Cleanup all fedml server api processes. 
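+    # The monitor process is matched on its command line: the script name must end with
+    # "device_model_monitor.py", the "-ep" flag must be present, and a later argument
+    # must equal the given run id.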
- for process in psutil.process_iter(): - try: - pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) - find_monitor_process = False - find_monitor_name_arg = False - find_endpoint_id_name_arg = False - for cmd in pinfo["cmdline"]: - if str(cmd).endswith("device_model_monitor.py"): - find_monitor_name_arg = True - - if find_monitor_name_arg and str(cmd) == f"-ep": - find_endpoint_id_name_arg = True - - if find_monitor_name_arg and find_endpoint_id_name_arg and str(cmd) == f"{run_id}": - find_monitor_process = True - break + try: + for process in psutil.process_iter(): + try: + pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) + find_monitor_process = False + find_monitor_name_arg = False + find_endpoint_id_name_arg = False + for cmd in pinfo["cmdline"]: + if str(cmd).endswith("device_model_monitor.py"): + find_monitor_name_arg = True + + if find_monitor_name_arg and str(cmd) == f"-ep": + find_endpoint_id_name_arg = True + + if find_monitor_name_arg and find_endpoint_id_name_arg and str(cmd) == f"{run_id}": + find_monitor_process = True + break - if find_monitor_process: - # click.echo("find the monitor process at {}.".format(process.pid)) - if platform.system() == 'Windows': - os.system("taskkill /PID {} /T /F".format(process.pid)) - else: - os.kill(process.pid, signal.SIGKILL) - break - except Exception as e: - pass + if find_monitor_process: + # click.echo("find the monitor process at {}.".format(process.pid)) + if platform.system() == 'Windows': + os.system("taskkill /PID {} /T /F".format(process.pid)) + else: + os.kill(process.pid, signal.SIGKILL) + break + except Exception as e: + logging.error(f"Failed to cleanup the model monitor process due to {e}.") + pass + except Exception as e: + logging.error(f"For loop failed to stop the model inference monitor due to {e}.") + pass def get_process_running_count(process_name): count = 0 - for process in psutil.process_iter(): - try: - pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) - for cmd in pinfo["cmdline"]: - if str(cmd).find(process_name) != -1: - if os.path.basename(cmd) == process_name: - count += 1 - except Exception as e: - pass + try: + for process in psutil.process_iter(): + try: + pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) + for cmd in pinfo["cmdline"]: + if str(cmd).find(process_name) != -1: + if os.path.basename(cmd) == process_name: + count += 1 + except Exception as e: + print(f"Error in get_process_running_count: {e}") + pass + except Exception as e: + print(f"Error in get_process_running_count: {e}") + pass return count def edge_simulator_has_login(login_program="client_login.py"): - for process in psutil.process_iter(): - try: - pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) - found_login_process = False - found_simulator_process = False - for cmd in pinfo["cmdline"]: - if str(cmd).find(login_program) != -1: - if os.path.basename(cmd) == login_program: - found_login_process = True - - if str(cmd).find("edge_simulator") != -1: - found_simulator_process = True - - if found_login_process and found_simulator_process: - return True - except Exception as e: - pass + try: + for process in psutil.process_iter(): + try: + pinfo = process.as_dict(attrs=["pid", "name", "cmdline"]) + found_login_process = False + found_simulator_process = False + for cmd in pinfo["cmdline"]: + if str(cmd).find(login_program) != -1: + if os.path.basename(cmd) == login_program: + found_login_process = True + + if str(cmd).find("edge_simulator") != -1: + found_simulator_process = True + + if 
found_login_process and found_simulator_process: + return True + except Exception as e: + print(f"Error in edge_simulator_has_login: {e}") + pass + except Exception as e: + print(f"Error in edge_simulator_has_login: {e}") + pass return False @@ -813,6 +869,8 @@ def daemon_ota_upgrade_with_version(in_version="release"): def run_cmd(command, show_local_console=False): + # Had to import ClientConstans here because otherwise it was raising circular import errors. + from fedml.computing.scheduler.slave.client_constants import ClientConstants process = ClientConstants.exec_console_with_script(command, should_capture_stdout=True, should_capture_stderr=True) ret_code, out, err = ClientConstants.get_console_pipe_out_err_results(process) diff --git a/python/fedml/computing/scheduler/env/__init__.py b/python/fedml/computing/scheduler/env/__init__.py index e69de29bb2..0f71de6038 100644 --- a/python/fedml/computing/scheduler/env/__init__.py +++ b/python/fedml/computing/scheduler/env/__init__.py @@ -0,0 +1 @@ +from .collect_env import load_env, set_env_kv diff --git a/python/fedml/computing/scheduler/env/collect_env.py b/python/fedml/computing/scheduler/env/collect_env.py index dcece6a720..39654eac6c 100644 --- a/python/fedml/computing/scheduler/env/collect_env.py +++ b/python/fedml/computing/scheduler/env/collect_env.py @@ -1,18 +1,19 @@ import os import traceback -import GPUtil - import fedml +import dotenv +from fedml.computing.scheduler.comm_utils.hardware_utils import HardwareUtil from fedml.computing.scheduler.slave.client_diagnosis import ClientDiagnosis +from ..slave.client_constants import ClientConstants def collect_env(): - print("\n======== FedML (https://fedml.ai) ========") + print("\n======== FedML (https://tensoropera.ai) ========") print("FedML version: " + str(fedml.__version__)) env_version = fedml.get_env_version() print("FedML ENV version: " + str(env_version)) - + print("Execution path:" + str(os.path.abspath(fedml.__file__))) print("\n======== Running Environment ========") @@ -59,31 +60,29 @@ def collect_env(): try: print("\n======== GPU Configuration ========") - import GPUtil - gpus = GPUtil.getGPUs() + gpus = HardwareUtil.get_gpus() memory_total = 0.0 memory_free = 0.0 gpu_name = "" + vendor = "" for gpu in gpus: memory_total += gpu.memoryTotal memory_free += gpu.memoryFree gpu_name = gpu.name + vendor = gpu.vendor - print("NVIDIA GPU Info: " + gpu_name) + print(f"{vendor} GPU Info: " + gpu_name) print("Available GPU memory: {:.1f} G / {:.1f}G".format( memory_free / 1024.0, memory_total / 1024.0)) + device_count = len(gpus) + print("device_count = {}".format(device_count)) + import torch torch_is_available = torch.cuda.is_available() print("torch_is_available = {}".format(torch_is_available)) - device_count = torch.cuda.device_count() - print("device_count = {}".format(device_count)) - - device_name = torch.cuda.get_device_name(0) - print("device_name = {}".format(device_name)) - except: print("No GPU devices") @@ -110,4 +109,26 @@ def collect_env(): print(f"You can not connect to {mqtt_url}.\n") except Exception as e: print(f"The connection exception: {traceback.format_exc()}") - pass \ No newline at end of file + pass + + +def get_env_file(): + global_services_dir = ClientConstants.get_global_services_dir() + env_config_file = os.path.join(global_services_dir, ".env") + # Create file if not exists + if not os.path.exists(env_config_file): + with open(env_config_file, 'w') as f: + f.write("") + return env_config_file + + +def load_env(): + env_config_file = get_env_file() + 
dotenv.load_dotenv(dotenv_path=env_config_file, override=True) + + +def set_env_kv(key, value): + os.environ[key] = value + env_config_file = get_env_file() + dotenv.set_key(env_config_file, key, value) + load_env() diff --git a/python/fedml/computing/scheduler/master/base_master_agent.py b/python/fedml/computing/scheduler/master/base_master_agent.py new file mode 100755 index 0000000000..3aff523c24 --- /dev/null +++ b/python/fedml/computing/scheduler/master/base_master_agent.py @@ -0,0 +1,126 @@ + +from multiprocessing import Process +from ..comm_utils import sys_utils +from ..comm_utils.job_cleanup import JobCleanup +from ....core.mlops import MLOpsRuntimeLog, MLOpsMetrics +from ..scheduler_core.master_api_daemon import MasterApiDaemon +from ..scheduler_core.account_manager import FedMLAccountManager +from ..scheduler_core.general_constants import GeneralConstants +from abc import ABC, abstractmethod + + +class FedMLBaseMasterAgent(ABC): + + def __init__(self): + self.agent_args = None + self.master_api_daemon = None + self.master_api_process = None + self.mlops_metrics = MLOpsMetrics() + self.status_reporter = None + self.enable_simulation_cloud_agent = False + self.use_local_process_as_cloud_server = False + self.protocol_mgr = None + + def login( + self, user_id, api_key=None, device_id=None, + os_name=None, role=None, runner_cmd=None + ): + # Login account + login_result = FedMLAccountManager.get_instance().login( + user_id, api_key=api_key, device_id=device_id, + os_name=os_name, role=role, runner_cmd=runner_cmd + ) + if login_result is not None: + self.agent_args = login_result + else: + return None + + # Save the bound info + self._save_agent_info( + login_result.current_device_id + "." + login_result.os_name, login_result.edge_id) + + # Init the logs for protocol manager + self._init_logs(login_result, login_result.edge_id) + + # Create the protocol manager to communicate with the slave agents and MLOps. + self._create_protocol_manager(role, login_result) + + # Initialize the protocol manager + # noinspection PyBoardException + try: + self._initialize_protocol_manager() + except Exception as e: + FedMLAccountManager.write_login_failed_file(is_client=False) + self.protocol_mgr.stop() + raise e + + # Start the protocol manager to process the messages from MLOps and slave agents. 
+ self.protocol_mgr.start() + + @staticmethod + def logout(): + GeneralConstants.cleanup_run_process(None, is_master=True) + sys_utils.cleanup_all_fedml_server_api_processes() + + def _create_protocol_manager(self, role, login_result): + if self.protocol_mgr is not None: + return + self.protocol_mgr = self._generate_protocol_manager_instance( + login_result, agent_config=login_result.agent_config) + self.protocol_mgr.run_as_edge_server_and_agent = True \ + if role == FedMLAccountManager.ROLE_EDGE_SERVER else False + self.protocol_mgr.run_as_cloud_agent = True if role == FedMLAccountManager.ROLE_CLOUD_AGENT else False + self.protocol_mgr.run_as_cloud_server = True if role == FedMLAccountManager.ROLE_CLOUD_SERVER else False + self.protocol_mgr.args = login_result + self.protocol_mgr.edge_id = login_result.edge_id + self.protocol_mgr.unique_device_id = login_result.unique_device_id + self.protocol_mgr.user_name = login_result.user_name + self.protocol_mgr.agent_config = login_result.agent_config + self.protocol_mgr.enable_simulation_cloud_agent = self.enable_simulation_cloud_agent + self.protocol_mgr.use_local_process_as_cloud_server = self.use_local_process_as_cloud_server + + def _initialize_protocol_manager(self): + # Init local database + self._init_database() + + # Initialize the master protocol + self.protocol_mgr.initialize() + + # Report the IDLE status to MLOps + self.mlops_metrics.report_server_training_status( + None, GeneralConstants.MSG_MLOPS_SERVER_STATUS_IDLE, edge_id=self.agent_args.edge_id) + + # Cleanup data when startup + JobCleanup.get_instance().sync_data_on_startup(self.agent_args.edge_id, is_client=False) + + # Start the API server on master agent + self.master_api_daemon = MasterApiDaemon() + self.master_api_process = Process(target=self.master_api_daemon.run) + self.master_api_process.start() + + def _init_logs(self, agent_args, edge_id): + # Init runtime logs + in_args = agent_args + in_args.log_file_dir = self._get_log_file_dir() + in_args.run_id = 0 + in_args.role = "server" + in_args.edge_id = edge_id + in_args.using_mlops = True + in_args.server_agent_id = edge_id + MLOpsRuntimeLog.get_instance(in_args).init_logs() + + @abstractmethod + def _get_log_file_dir(self): + pass + + @abstractmethod + def _save_agent_info(self, unique_device_id, edge_id): + pass + + @abstractmethod + def _init_database(self): + pass + + @abstractmethod + def _generate_protocol_manager_instance(self, args, agent_config=None): + return None diff --git a/python/fedml/computing/scheduler/master/base_master_job_runner.py b/python/fedml/computing/scheduler/master/base_master_job_runner.py new file mode 100755 index 0000000000..9ebab258bb --- /dev/null +++ b/python/fedml/computing/scheduler/master/base_master_job_runner.py @@ -0,0 +1,715 @@ + +import json +import logging +import multiprocessing +import platform +import queue +import os +import time +import traceback +from ..scheduler_entry.constants import Constants +from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog +from ..master.server_constants import ServerConstants +from ....core.mlops.mlops_runtime_log_daemon import MLOpsRuntimeLogDaemon +from ..comm_utils import sys_utils +from .server_data_interface import FedMLServerDataInterface +from ....core.mlops.mlops_utils import MLOpsUtils +from ..scheduler_core.log_manager import LogsManager +from ..scheduler_core.metrics_manager import MetricsManager +from fedml.utils.debugging import debug +from ..scheduler_core.status_center import JobStatus +from 
..scheduler_core.compute_cache_manager import ComputeCacheManager +from multiprocessing import Process, Queue +from ..scheduler_core.general_constants import GeneralConstants +from ..scheduler_core.scheduler_base_job_runner import FedMLSchedulerBaseJobRunner, RunnerError, RunnerCompletedError +from abc import ABC, abstractmethod +from ..scheduler_core.scheduler_matcher import SchedulerMatcher +import fedml + + +class FedMLBaseMasterJobRunner(FedMLSchedulerBaseJobRunner, ABC): + debug_cloud_server = False + + def __init__(self, args, run_id=0, request_json=None, agent_config=None, edge_id=0, + cuda_visible_gpu_ids_str=None, + agent_data_dir=None, agent_package_download_dir=None, + agent_package_unzip_dir=None, agent_log_file_dir=None): + FedMLSchedulerBaseJobRunner.__init__( + self, args, edge_id=edge_id, request_json=request_json, agent_config=agent_config, run_id=run_id, + cuda_visible_gpu_ids_str=cuda_visible_gpu_ids_str, agent_data_dir=agent_data_dir, + agent_package_download_dir=agent_package_download_dir, + agent_package_unzip_dir=agent_package_unzip_dir, + agent_log_file_dir=agent_package_unzip_dir, + is_master_runner=True + ) + + self.run_edge_id_status_queue = Queue() + self.run_metrics_queue = Queue() + self.run_events_queue = Queue() + self.run_artifacts_queue = Queue() + self.run_logs_queue = Queue() + self.run_edge_device_info_queue = Queue() + self.run_edge_device_info_global_queue = Queue() + self.run_extend_queue_list = None + self.async_check_timeout = 0 + self.enable_async_cluster = False + self.origin_fedml_config_object = None + self.server_agent_id = 0 + if request_json is not None: + self.server_agent_id = request_json.get("server_id", 0) + self.fedml_data_base_package_dir = os.path.join("/", "fedml", "data") + self.fedml_data_local_package_dir = os.path.join("/", "fedml", "fedml-package", "fedml", "data") + self.fedml_data_dir = self.fedml_data_base_package_dir + self.fedml_config_dir = os.path.join("/", "fedml", "conf") + + @debug + def run( + self, process_event, completed_event, edge_id_status_queue=None, + edge_device_info_queue=None, run_metrics_queue=None, run_event_queue=None, + run_artifacts_queue=None, run_logs_queue=None, edge_device_info_global_queue=None, + run_extend_queue_list=None, sender_message_center_queue=None, listener_message_queue=None, + status_center_queue=None + ): + print(f"Master job runner process id {os.getpid()}, run id {self.run_id}") + + if platform.system() != "Windows": + os.setsid() + + os.environ['PYTHONWARNINGS'] = 'ignore:semaphore_tracker:UserWarning' + os.environ.setdefault('PYTHONWARNINGS', 'ignore:semaphore_tracker:UserWarning') + + self.run_process_event = process_event + self.run_process_completed_event = completed_event + try: + MLOpsUtils.set_ntp_offset(self.ntp_offset) + + self.rebuild_message_status_center(sender_message_center_queue, listener_message_queue, status_center_queue) + + self.run_impl( + edge_id_status_queue, edge_device_info_queue, run_metrics_queue, + run_event_queue, run_artifacts_queue, run_logs_queue, edge_device_info_global_queue, + run_extend_queue_list=run_extend_queue_list, sender_message_queue=sender_message_center_queue, + listener_message_queue=listener_message_queue, status_center_queue=status_center_queue + ) + except RunnerError: + logging.info("Runner stopped.") + self.status_reporter.report_server_id_status( + self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED, edge_id=self.edge_id, + server_id=self.edge_id, server_agent_id=self.edge_id) + except RunnerCompletedError: + 
logging.info("Runner completed.") + except Exception as e: + logging.error("Runner exits with exceptions. {}".format(traceback.format_exc())) + self.status_reporter.report_server_id_status( + self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id, + server_id=self.edge_id, server_agent_id=self.edge_id) + finally: + logging.info("Release resources.") + self._process_run_metrics_queue(run_metrics_queue) + self._process_run_logs_queue(run_logs_queue) + MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, self.edge_id) + if self.mlops_metrics is not None: + self.mlops_metrics.stop_sys_perf() + time.sleep(3) + self.cleanup_runner_process(self.run_id) + ServerConstants.cleanup_learning_process(self.run_id) + ServerConstants.cleanup_bootstrap_process(self.run_id) + + def cleanup_runner_process(self, run_id): + ServerConstants.cleanup_run_process(run_id) + + @debug + @abstractmethod + def run_impl( + self, edge_id_status_queue, edge_device_info_queue, run_metrics_queue, + run_event_queue, run_artifacts_queue, run_logs_queue, edge_device_info_global_queue, + run_extend_queue_list=None, sender_message_queue=None, listener_message_queue=None, + status_center_queue=None + ): + run_id = self.request_json["runId"] + run_config = self.request_json["run_config"] + data_config = run_config["data_config"] + edge_ids = self.request_json["edgeids"] + + self.check_runner_stop_event() + + self.run_id = run_id + self.args.run_id = self.run_id + MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) + + logging.info("Detect all status of Edge ids: " + str(edge_ids)) + + self.status_reporter.report_server_id_status( + self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_STARTING, edge_id=self.edge_id, + server_id=self.edge_id, server_agent_id=self.edge_id) + + status_ok, active_edge_info_dict, inactivate_edges = self.detect_edges_status( + edge_device_info_queue, edge_device_info_global_queue=edge_device_info_global_queue, + callback_when_edges_ready=self.send_training_request_to_edges) + logging.info(f"Status OK: {status_ok}, Active edge info dict: {active_edge_info_dict}, " + f"inactivate edges: {inactivate_edges}") + if not status_ok: + logging.error(f"Status of edge device is not OK. 
Active edge info dict: {active_edge_info_dict}, " + f"Inactivate edges: {inactivate_edges}") + return + + if not self.should_continue_run_job(run_id): + if FedMLBaseMasterJobRunner.debug_cloud_server: + while True: + time.sleep(30) + # Check if the run status is normal + self.aggregate_run_metrics_logs( + run_id, edge_ids, edge_id_status_queue, edge_device_info_queue, + edge_device_info_global_queue, + run_metrics_queue, run_logs_queue) + return + + # Start the server job + self.start_runner_process( + run_id, self.request_json, edge_id=self.edge_id, is_server_job=True, + sender_message_queue=sender_message_queue, + listener_message_queue=listener_message_queue, + status_center_queue=status_center_queue + ) + + # Check if the run status is normal + self.aggregate_run_metrics_logs( + run_id, edge_ids, edge_id_status_queue, edge_device_info_queue, + edge_device_info_global_queue, + run_metrics_queue, run_logs_queue) + + @abstractmethod + def _generate_extend_queue_list(self): + return list() + + def aggregate_run_metrics_logs( + self, run_id, edge_id_list, edge_id_status_queue, edge_device_info_queue, + edge_device_info_global_queue, run_metrics_queue, run_logs_queue): + + ComputeCacheManager.get_instance().set_redis_params() + + while True: + self.check_runner_stop_event() + + # Process run metrics + self._process_run_metrics_queue(run_metrics_queue) + + # Process run logs + self._process_run_logs_queue(run_logs_queue) + + # Check the job status + job_status = ComputeCacheManager.get_instance().get_status_cache().get_job_status(run_id) + if JobStatus.is_job_completed(job_status): + break + + def _process_run_metrics_queue(self, run_metrics_queue): + # Fetch metrics from the run metrics queue + while True: + try: + metrics_item = run_metrics_queue.get(block=False, timeout=3) + MetricsManager.get_instance().save_metrics(metrics_item) + metric_json = json.loads(metrics_item) + if metric_json.get("is_endpoint", False): + metric_json().pop("is_endpoint") + self.mlops_metrics.report_endpoint_metric({}, payload=json.dumps(metric_json)) + else: + self.mlops_metrics.report_server_training_metric({}, payload=metrics_item) + except queue.Empty as e: # If queue is empty, then break loop + break + + def _process_run_logs_queue(self, run_logs_queue): + # Fetch logs from the run logs queue + while True: + try: + logs_item = run_logs_queue.get(block=False, timeout=3) + LogsManager.save_logs(logs_item) + except queue.Empty as e: # If queue is empty, then break loop + break + + def run_server_job( + self, process_event, completed_event, edge_id_status_queue=None, + edge_device_info_queue=None, run_metrics_queue=None, run_event_queue=None, + run_artifacts_queue=None, run_logs_queue=None, edge_device_info_global_queue=None, + run_extend_queue_list=None, sender_message_center_queue=None, listener_message_queue=None, + status_center_queue=None + ): + print(f"Server runner process id {os.getpid()}, run id {self.run_id}") + + if platform.system() != "Windows": + os.setsid() + + os.environ['PYTHONWARNINGS'] = 'ignore:semaphore_tracker:UserWarning' + os.environ.setdefault('PYTHONWARNINGS', 'ignore:semaphore_tracker:UserWarning') + + self.run_process_event = process_event + self.run_process_completed_event = completed_event + try: + MLOpsUtils.set_ntp_offset(self.ntp_offset) + + self.rebuild_message_status_center(sender_message_center_queue, listener_message_queue, status_center_queue) + + self.run_server_job_impl(process_event, completed_event, + message_center_queue=sender_message_center_queue) + except 
RunnerError: + logging.info("Runner stopped.") + self.status_reporter.report_server_id_status( + self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED, edge_id=self.edge_id, + server_id=self.edge_id, server_agent_id=self.edge_id) + except RunnerCompletedError: + logging.info("Runner completed.") + except Exception as e: + logging.error("Runner exits with exceptions. {}".format(traceback.format_exc())) + self.status_reporter.report_server_id_status( + self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id, + server_id=self.edge_id, server_agent_id=self.edge_id) + finally: + logging.info("Release resources.") + MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, self.edge_id) + if self.mlops_metrics is not None: + self.mlops_metrics.stop_sys_perf() + time.sleep(3) + ServerConstants.cleanup_run_process(self.run_id) + ServerConstants.cleanup_learning_process(self.run_id) + ServerConstants.cleanup_bootstrap_process(self.run_id) + + def run_server_job_impl(self, process_event, completed_event, + message_center_queue=None): + run_id = self.request_json["runId"] + run_config = self.request_json["run_config"] + data_config = run_config["data_config"] + edge_ids = self.request_json["edgeids"] + + self.check_runner_stop_event() + + self.run_id = run_id + self.args.run_id = self.run_id + MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) + + self.status_reporter.report_server_id_status( + run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_RUNNING, edge_id=self.edge_id, + server_id=self.edge_id, server_agent_id=self.edge_id) + + # get training params + private_local_data_dir = data_config.get("privateLocalData", "") + is_using_local_data = 0 + # if private_local_data_dir is not None and len(str(private_local_data_dir).strip(' ')) > 0: + # is_using_local_data = 1 + + # start a run according to the hyper-parameters + # fedml_local_data_dir = self.cur_dir + "/fedml_data/run_" + run_id_str + "_edge_" + str(edge_id) + fedml_local_data_dir = os.path.join(self.cur_dir, "fedml_data") + fedml_local_config_dir = os.path.join(self.cur_dir, "fedml_config") + if is_using_local_data: + fedml_local_data_dir = private_local_data_dir + self.fedml_data_dir = self.fedml_data_local_package_dir + + self.check_runner_stop_event() + + logging.info("download packages and run the bootstrap script...") + + # update local config with real time parameters from server and dynamically replace variables value + unzip_package_path, fedml_config_object = self.update_local_fedml_config(run_id, run_config) + if unzip_package_path is None or fedml_config_object is None: + logging.info("failed to update local fedml config.") + self.check_runner_stop_event() + self.report_exception_status(run_id) + return + + logging.info("cleanup the previous aggregation process and check downloaded packages...") + + entry_file_config = fedml_config_object["entry_config"] + dynamic_args_config = fedml_config_object["dynamic_args"] + entry_file = str(entry_file_config["entry_file"]).replace('\\', os.sep).replace('/', os.sep) + entry_file = os.path.basename(entry_file) + conf_file = entry_file_config["conf_file"] + conf_file = str(conf_file).replace('\\', os.sep).replace('/', os.sep) + ServerConstants.cleanup_learning_process(run_id) + self.check_runner_stop_event() + if not os.path.exists(unzip_package_path): + logging.info("failed to unzip file.") + self.check_runner_stop_event() + self.report_exception_status(run_id) + return + os.chdir(os.path.join(unzip_package_path, 
"fedml")) + + self.check_runner_stop_event() + + logging.info("starting the server user process...") + + entry_file_full_path = os.path.join(unzip_package_path, "fedml", entry_file) + conf_file_full_path = os.path.join(unzip_package_path, "fedml", conf_file) + logging.info(" ") + logging.info(" ") + logging.info("====Your Run Logs Begin===") + + process, is_launch_task, error_list = self.execute_job_task( + unzip_package_path=unzip_package_path, entry_file_full_path=entry_file_full_path, + conf_file_full_path=conf_file_full_path, dynamic_args_config=dynamic_args_config, + fedml_config_object=self.fedml_config_object) + + logging.info("====Your Run Logs End===") + logging.info(" ") + logging.info(" ") + + ret_code, out, err = process.returncode, None, None + is_run_ok = sys_utils.is_runner_finished_normally(process.pid) + if is_launch_task: + is_run_ok = True + if error_list is not None and len(error_list) > 0: + is_run_ok = False + if ret_code is None or ret_code <= 0: + self.check_runner_stop_event() + + if is_run_ok: + if out is not None: + out_str = sys_utils.decode_our_err_result(out) + if out_str != "": + logging.info("{}".format(out_str)) + + self.status_reporter.report_server_id_status( + run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED, edge_id=self.edge_id, + server_id=self.edge_id, server_agent_id=self.edge_id) + + if is_launch_task: + sys_utils.log_return_info(f"job {run_id}", 0) + else: + sys_utils.log_return_info(entry_file, 0) + else: + is_run_ok = False + + if not is_run_ok: + # If the run status is killed or finished, then return with the normal state. + current_job = FedMLServerDataInterface.get_instance().get_job_by_id(run_id) + if current_job is not None and (current_job.status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED or + current_job.status == ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED): + return + + self.check_runner_stop_event() + + logging.error("failed to run the aggregation process...") + + if err is not None: + err_str = sys_utils.decode_our_err_result(err) + if err_str != "": + logging.error("{}".format(err_str)) + + if is_launch_task: + sys_utils.log_return_info(f"job {run_id}", ret_code) + else: + sys_utils.log_return_info(entry_file, ret_code) + + self.report_exception_status(run_id) + + @abstractmethod + def _generate_job_runner_instance(self, args, run_id=None, request_json=None, agent_config=None, edge_id=None): + return None + + def start_runner_process( + self, run_id, request_json, edge_id=None, is_server_job=False, + sender_message_queue=None, listener_message_queue=None, + status_center_queue=None, + ): + server_runner = self._generate_job_runner_instance( + self.args, run_id=run_id, request_json=request_json, + agent_config=self.agent_config, edge_id=edge_id + ) + + run_id_str = str(run_id) + server_runner.edge_id = self.edge_id + server_runner.server_agent_id = self.server_agent_id + server_runner.start_request_json = json.dumps(request_json) + self.run_process_event = multiprocessing.Event() + server_runner.run_process_event = self.run_process_event + self.run_process_completed_event = multiprocessing.Event() + server_runner.run_process_completed_event = self.run_process_completed_event + server_runner.edge_id_status_queue = self.run_edge_id_status_queue + server_runner.edge_device_info_queue = self.run_edge_device_info_queue + self.run_extend_queue_list = self._generate_extend_queue_list() + self.run_process = Process( + target=server_runner.run if not is_server_job else server_runner.run_server_job, args=( + 
self.run_process_event, self.run_process_completed_event, self.run_edge_id_status_queue, + self.run_edge_device_info_queue, self.run_metrics_queue, self.run_events_queue, + self.run_artifacts_queue, self.run_logs_queue, self.run_edge_device_info_global_queue, + self.run_extend_queue_list, sender_message_queue, listener_message_queue, status_center_queue + ) + ) + self.run_process.start() + ServerConstants.save_run_process(run_id, self.run_process.pid) + return self.run_process + + def put_run_edge_device_info_to_queue(self, run_id, edge_id, device_info): + edge_ids = self.request_json.get("edgeids", None) + if edge_ids is None: + return + if int(edge_id) in edge_ids or str(edge_id) in edge_ids: + run_id_str = str(run_id) + if self.run_edge_device_info_queue is None: + self.run_edge_device_info_queue = Queue() + self.run_edge_device_info_queue.put(device_info) + + def should_continue_run_job(self, run_id): + run_config = self.request_json["run_config"] + run_params = run_config.get("parameters", {}) + job_yaml = run_params.get("job_yaml", {}) + job_yaml_default_none = run_params.get("job_yaml", None) + framework_type = job_yaml.get("framework_type", None) + job_type = job_yaml.get("job_type", None) + job_type = job_yaml.get("task_type", Constants.JOB_TASK_TYPE_TRAIN) if job_type is None else job_type + if job_yaml_default_none is not None: + if job_type == Constants.JOB_TASK_TYPE_FEDERATE: + return True + + if framework_type is None or framework_type != Constants.JOB_FRAMEWORK_TYPE_FEDML: + self.status_reporter.report_server_id_status( + run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_RUNNING, edge_id=self.edge_id, + server_id=self.edge_id, server_agent_id=self.edge_id) + return False + + return True + + @debug + def detect_edges_status( + self, edge_device_info_queue, edge_device_info_global_queue=None, callback_when_edges_ready=None, + status_timeout=None, + need_to_trigger_exception=True, status_check_context=None, given_edge_ids=None, + callback_when_detecting=None, args_for_callback_when_detecting=None + ): + run_id = self.request_json["runId"] + run_id_str = str(run_id) + edge_id_list = self.request_json["edgeids"] + if given_edge_ids is not None: + edge_id_list = given_edge_ids + + # Init realtime status of all edges + run_edges_realtime_status = dict() + run_edges_realtime_status[run_id_str] = dict() + + total_sleep_seconds = 0 + status_check_sleep_seconds = 10 + allowed_status_check_sleep_seconds = 60 * 2 if status_timeout is None else status_timeout + allowed_status_check_sleep_seconds_for_async = 30 + inactivate_edges = list() + active_edge_info_dict = dict() + while True: + if callback_when_detecting is not None: + callback_when_detecting(args_for_callback_when_detecting) + + # Fetch edge info from the edge status queue, which will be added to realtime status map + while True: + self.check_runner_stop_event() + + try: + edge_info = edge_device_info_queue.get(block=False, timeout=1) + if edge_info is not None: + edge_id = edge_info.get("edge_id", None) + if edge_id is not None: + run_edges_realtime_status[run_id_str][edge_id] = edge_info + except queue.Empty as e: # If queue is empty, then break loop + break + + self.check_runner_stop_event() + + # Check all edges which don't send response status successfully + # and retry to send the status checking message. 
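+            # An edge counts as active once its device info has appeared in the realtime
+            # status map populated from edge_device_info_queue above; edges that have not
+            # reported yet are collected into inactivate_edges for this round.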
+ active_edges_count = 0 + inactivate_edges.clear() + active_edge_info_dict.clear() + for edge_id in edge_id_list: + edge_info_dict = run_edges_realtime_status.get(run_id_str, {}) + edge_info = edge_info_dict.get(edge_id, None) + edge_info = edge_info_dict.get(str(edge_id), None) if edge_info is None else edge_info + if edge_info is not None: + active_edges_count += 1 + active_edge_info_dict[str(edge_id)] = edge_info + else: + inactivate_edges.append(edge_id) + + # If all edges are ready then send the starting job message to them + if active_edges_count == len(edge_id_list): + logging.info(f"All edges are ready. Active edge id list is as follows. {active_edge_info_dict}") + if callback_when_edges_ready is not None: + logging.info("All edges are ready. Start to process the callback function.") + callback_when_edges_ready(self.request_json, active_edge_info_dict=active_edge_info_dict) + else: + logging.info("All edges are ready. No callback function to process.") + break + else: + logging.info(f"All edges are not ready. Active edge id list: {active_edge_info_dict}, " + f"Inactive edge id list: {inactivate_edges}") + + # Check if runner needs to stop and sleep specific time + self.check_runner_stop_event() + time.sleep(status_check_sleep_seconds) + total_sleep_seconds += status_check_sleep_seconds + + # Check if the status response message has timed out to receive + if total_sleep_seconds >= allowed_status_check_sleep_seconds: + # If so, send failed message to MLOps and send exception message to all edges. + logging.error(f"There are inactive edge devices. " + f"Inactivate edge id list is as follows. {inactivate_edges}") + if need_to_trigger_exception: + self.status_reporter.report_server_id_status( + run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id, + server_id=self.edge_id, server_agent_id=self.server_agent_id) + self.report_exception_status(run_id) + return False, active_edge_info_dict, inactivate_edges + + # If we enable the mode for async cluster, then sleep some time and send messages to all clients. 
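+            # (In async-cluster mode the run proceeds with whichever edges are already
+            # active once the configured async waiting window has elapsed.)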
+ if callback_when_edges_ready is not None and self.should_process_async_cluster is not None: + should_async, async_timeout = self.should_process_async_cluster() + if should_async and total_sleep_seconds >= allowed_status_check_sleep_seconds_for_async: + if async_timeout > allowed_status_check_sleep_seconds_for_async: + time.sleep(async_timeout - allowed_status_check_sleep_seconds_for_async) + self.send_training_request_to_edges(self.request_json, active_edge_info_dict) + return True, active_edge_info_dict, inactivate_edges + + return True, active_edge_info_dict, inactivate_edges + + def report_exception_status(self, run_id): + self.mlops_metrics.report_job_status(run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_EXCEPTION) + + def callback_run_logs(self, topic, payload): + run_id = str(topic).split('/')[-1] + run_id_str = str(run_id) + if self.run_logs_queue is None: + self.run_logs_queue = Queue() + self.run_logs_queue.put(payload) + + def callback_run_metrics(self, topic, payload): + print(f"callback_run_metrics topic {topic}, payload {payload}") + run_id = str(topic).split('/')[-1] + run_id_str = str(run_id) + if self.run_metrics_queue is None: + self.run_metrics_queue = Queue() + self.run_metrics_queue.put(payload) + + # def send_training_request_to_edges(self, active_edge_info_dict): + # topic = GeneralConstants.MSG_TOPIC_SEND_TRAINING_REQUEST_TO_EDGES + # payload = json.dumps(active_edge_info_dict) + # self.message_center.receive_message(topic, payload) + def send_training_request_to_edges(self, request_json, active_edge_info_dict=None): + run_id = request_json["runId"] + edge_id_list = request_json["edgeids"] + run_config = request_json.get("run_config", {}) + run_params = run_config.get("parameters", {}) + job_yaml = run_params.get("job_yaml", {}) + job_yaml_default_none = run_params.get("job_yaml", None) + computing = job_yaml.get("computing", {}) + request_num_gpus = computing.get("minimum_num_gpus", None) + job_gpu_id_list = request_json.get("job_gpu_id_list", None) + assigned_gpu_num_dict = dict() + assigned_gpu_ids_dict = dict() + master_node_addr = "" + master_node_port = 0 + + logging.info(f"Send training request to Edge ids: {edge_id_list}, run_id {run_id}") + + should_match_gpu = False + if job_yaml_default_none is not None and request_num_gpus is not None and \ + int(request_num_gpus) > 0 and active_edge_info_dict is not None: + should_match_gpu = True + SchedulerMatcher.parse_and_print_gpu_info_for_all_edges(active_edge_info_dict, show_gpu_list=True) + + # Match and assign gpus to each device + assigned_gpu_num_dict, assigned_gpu_ids_dict = SchedulerMatcher.match_and_assign_gpu_resources_to_devices( + request_num_gpus, edge_id_list, active_edge_info_dict, job_gpu_id_list=job_gpu_id_list) + if assigned_gpu_num_dict is None or assigned_gpu_ids_dict is None: + # If no resources available, send failed message to MLOps and send exception message to all edges. + gpu_count, gpu_available_count = SchedulerMatcher.parse_and_print_gpu_info_for_all_edges( + active_edge_info_dict, should_print=True) + err_info = f"No resources available." 
\ + f"Total available GPU count {gpu_available_count} is less than " \ + f"request GPU count {request_num_gpus}" + logging.error(err_info) + + self.status_reporter.report_server_id_status( + run_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id, + server_id=self.edge_id, server_agent_id=self.server_agent_id) + self.report_exception_status(run_id) + + serving_args = job_yaml.get("serving_args", {}) + endpoint_id = serving_args.get("endpoint_id", None) + if endpoint_id is not None: + fedml.mlops.log_endpoint_status( + endpoint_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_FAILED) + fedml.mlops.log_run_log_lines( + endpoint_id, 0, [err_info], + log_source=GeneralConstants.FEDML_LOG_SOURCE_TYPE_MODEL_END_POINT + ) + return + + # Generate master node addr and port + master_node_addr, master_node_port = SchedulerMatcher.get_master_node_info(edge_id_list, + active_edge_info_dict) + + # Generate new edge id list after matched + edge_id_list = SchedulerMatcher.generate_new_edge_list_for_gpu_matching(assigned_gpu_num_dict) + if len(edge_id_list) <= 0: + gpu_count, gpu_available_count = SchedulerMatcher.parse_and_print_gpu_info_for_all_edges( + active_edge_info_dict, should_print=True) + logging.error(f"Request parameter for GPU num is invalid." + f"Total available GPU count {gpu_available_count}." + f"Request GPU num {request_num_gpus}") + self.status_reporter.report_server_id_status( + run_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id, + server_id=self.edge_id, server_agent_id=self.server_agent_id) + self.report_exception_status(run_id) + return + + if should_match_gpu: + # Report gpu num and related infos to MLOps. + serving_args = job_yaml.get("serving_args", {}) + endpoint_id = serving_args.get("endpoint_id", None) + if endpoint_id is not None: + endpoint_info = list() + for edge_id_item, gpu_num in assigned_gpu_num_dict.items(): + edge_info = active_edge_info_dict.get(str(edge_id_item), {}) + endpoint_info.append({ + "machine_id": edge_id_item, "endpoint_gpu_count": gpu_num, + "master_deploy_id": edge_info.get("master_device_id", 0), + "slave_deploy_id": edge_info.get("slave_device_id", 0)}) + topic_name = f"compute/mlops/endpoint" + endpoint_info_json = {"endpoint_id": endpoint_id, "endpoint_info": endpoint_info} + print(f"endpoint_info_json {endpoint_info_json}") + self.message_center.send_message(topic_name, json.dumps(endpoint_info_json)) + + client_rank = 1 + for edge_id in edge_id_list: + topic_start_train = "flserver_agent/" + str(edge_id) + "/start_train" + logging.info("start_train: send topic " + topic_start_train + " to client...") + request_json["client_rank"] = client_rank + client_rank += 1 + + if active_edge_info_dict is not None: + edge_info = active_edge_info_dict.get(str(edge_id), {}) + model_master_device_id = edge_info.get("master_device_id", None) + model_slave_device_id = edge_info.get("slave_device_id", None) + model_slave_device_id_list = edge_info.get("slave_device_id_list", None) + + if should_match_gpu: + request_json["scheduler_match_info"] = SchedulerMatcher.generate_match_info_for_scheduler( + edge_id, edge_id_list, master_node_addr, master_node_port, + assigned_gpu_num_dict, assigned_gpu_ids_dict, + model_master_device_id=model_master_device_id, + model_slave_device_id=model_slave_device_id, + model_slave_device_id_list=model_slave_device_id_list + ) + + self.message_center.send_message(topic_start_train, json.dumps(request_json)) + + def should_process_async_cluster(self): + run_config = 
self.request_json.get("run_config", {}) + run_params = run_config.get("parameters", {}) + common_args = run_params.get("common_args", {}) + self.enable_async_cluster = common_args.get("enable_async_cluster", False) + self.async_check_timeout = common_args.get("async_check_timeout", 0) + if self.enable_async_cluster: + return True, self.async_check_timeout + + return False, self.async_check_timeout + + def get_client_id_list(self, server_edge_id_list): + return server_edge_id_list + + + diff --git a/python/fedml/computing/scheduler/master/base_master_job_runner_manager.py b/python/fedml/computing/scheduler/master/base_master_job_runner_manager.py new file mode 100755 index 0000000000..6831c9d034 --- /dev/null +++ b/python/fedml/computing/scheduler/master/base_master_job_runner_manager.py @@ -0,0 +1,95 @@ +import base64 +import json +import logging +import time +from abc import ABC +from multiprocessing import Process +from .cloud_server_manager import FedMLCloudServerManager +from ..scheduler_core.scheduler_base_job_runner_manager import FedMLSchedulerBaseJobRunnerManager + + +class FedMLBaseMasterJobRunnerManager(FedMLSchedulerBaseJobRunnerManager, ABC): + def __init__(self): + FedMLSchedulerBaseJobRunnerManager.__init__(self) + + # Override + def start_job_runner( + self, run_id, request_json, args=None, edge_id=None, is_server_job=False, + sender_message_queue=None, listener_message_queue=None, status_center_queue=None, + should_start_cloud_server=False, use_local_process_as_cloud_server=False, + cuda_visible_gpu_ids_str=None + ): + if should_start_cloud_server: + self._start_cloud_server(args, run_id, request_json, edge_id=edge_id, + use_local_process_as_cloud_server=use_local_process_as_cloud_server) + return + + run_id_str = str(run_id) + self.job_runners[run_id_str] = self._generate_job_runner_instance( + args, run_id=run_id, request_json=request_json, + agent_config=args.agent_config, edge_id=edge_id, + ) + self.job_runners[run_id_str].start_runner_process( + run_id, request_json, edge_id=edge_id, is_server_job=is_server_job, + sender_message_queue=sender_message_queue, + listener_message_queue=listener_message_queue, + status_center_queue=status_center_queue + ) + + def stop_job_runner( + self, run_id, args=None, server_id=None, request_json=None, + run_as_cloud_agent=False, run_as_cloud_server=False + ): + super().stop_job_runner(run_id) + + if run_as_cloud_agent or run_as_cloud_server: + stopping_process = Process( + target=FedMLCloudServerManager.stop_cloud_server, args=(run_id, server_id, args.agent_config)) + stopping_process.start() + + def complete_job_runner( + self, run_id, args=None, server_id=None, request_json=None, + run_as_cloud_agent=False, run_as_cloud_server=False + ): + super().complete_job_runner(run_id) + + if run_as_cloud_agent or run_as_cloud_server: + stopping_process = Process( + target=FedMLCloudServerManager.stop_cloud_server, args=(run_id, server_id, args.agent_config)) + stopping_process.start() + + def _start_cloud_server( + self, args, run_id, request_json, edge_id=None, + use_local_process_as_cloud_server=False + ): + run_id_str = str(run_id) + cloud_server_mgr = FedMLCloudServerManager( + args, run_id=run_id, edge_id=edge_id, request_json=request_json, + agent_config=args.agent_config + ) + if not use_local_process_as_cloud_server: + self.cloud_run_process_map[run_id_str] = Process(target=cloud_server_mgr.start_cloud_server_process_entry) + self.cloud_run_process_map[run_id_str].start() + else: + message_bytes = 
json.dumps(request_json).encode("ascii")
+            base64_bytes = base64.b64encode(message_bytes)
+            runner_cmd_encoded = base64_bytes.decode("ascii")
+            cloud_device_id = request_json.get("cloudServerDeviceId", "0")
+
+            logging.info("runner_cmd_encoded: {}".format(runner_cmd_encoded))
+
+            self.cloud_run_process_map[run_id_str] = Process(
+                target=cloud_server_mgr.start_local_cloud_server,
+                args=(args.account_id, args.version, cloud_device_id, runner_cmd_encoded))
+            self.cloud_run_process_map[run_id_str].start()
+            time.sleep(1)
+
+    def callback_run_logs(self, run_id, topic, payload):
+        run_id_str = str(run_id)
+        if self.job_runners.get(run_id_str, None) is not None:
+            self.job_runners[run_id_str].callback_run_logs(topic, payload)
+
+    def callback_run_metrics(self, run_id, topic, payload):
+        run_id_str = str(run_id)
+        if self.job_runners.get(run_id_str, None) is not None:
+            self.job_runners[run_id_str].callback_run_metrics(topic, payload)
diff --git a/python/fedml/computing/scheduler/master/base_master_protocol_manager.py b/python/fedml/computing/scheduler/master/base_master_protocol_manager.py
new file mode 100755
index 0000000000..1c4cbba4f4
--- /dev/null
+++ b/python/fedml/computing/scheduler/master/base_master_protocol_manager.py
@@ -0,0 +1,556 @@
+
+import base64
+import json
+import logging
+import fedml
+from ..comm_utils.constants import SchedulerConstants
+from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog
+from ....core.mlops.mlops_configs import MLOpsConfigs
+from ....core.mlops.mlops_runtime_log_daemon import MLOpsRuntimeLogDaemon
+from ..scheduler_core.compute_cache_manager import ComputeCacheManager
+from ..scheduler_core.ota_upgrade import FedMLOtaUpgrade
+from .deploy_job_launcher import FedMLDeployJobLauncher
+from ..scheduler_core.general_constants import GeneralConstants
+from ..scheduler_core.scheduler_base_protocol_manager import FedMLSchedulerBaseProtocolManager
+from abc import ABC, abstractmethod
+
+
+class FedMLBaseMasterProtocolManager(FedMLSchedulerBaseProtocolManager, ABC):
+    def __init__(self, args, agent_config=None):
+        FedMLSchedulerBaseProtocolManager.__init__(self, args, agent_config=agent_config, is_master=True)
+
+        self.async_check_timeout = 0
+        self.enable_async_cluster = False
+        self.request_json = None
+        self.run_edge_ids = dict()
+        self.version = fedml.get_env_version()
+        self.args = args
+        self.run_id = None
+        self.edge_id = args.edge_id
+        self.server_agent_id = args.edge_id
+        self.current_device_id = args.current_device_id
+        self.unique_device_id = args.unique_device_id
+        self.agent_config = agent_config
+        self.topic_start_train = None
+        self.topic_stop_train = None
+        self.topic_complete_job = None
+        self.topic_report_status = None
+        self.topic_ota_msg = None
+        self.topic_response_device_info = None
+        self.topic_request_device_info_from_mlops = None
+        self.topic_requesst_job_status = None
+        self.topic_requesst_device_status_in_job = None
+        self.topic_send_training_request_to_edges = None
+        self.run_as_cloud_agent = False
+        self.run_as_cloud_server = False
+        self.run_as_edge_server_and_agent = False
+        self.run_as_cloud_server_and_agent = False
+        self.enable_simulation_cloud_agent = False
+        self.use_local_process_as_cloud_server = False
+        self.ota_upgrade = FedMLOtaUpgrade(edge_id=args.edge_id)
+        self.running_request_json = dict()
+        self.start_request_json = None
+        self.deploy_job_launcher = FedMLDeployJobLauncher()
+
+    @abstractmethod
+    def generate_topics(self):
+        # The MQTT message topic format is as follows: //
+
+        # The topic for starting training
+        self.topic_start_train = "mlops/flserver_agent_" + str(self.edge_id) + "/start_train"
+
+        # The topic for stopping training
+        self.topic_stop_train = "mlops/flserver_agent_" + str(self.edge_id) + "/stop_train"
+
+        # The topic for completing the job
+        self.topic_complete_job = GeneralConstants.get_topic_complete_job(self.edge_id)
+
+        # The topic for reporting current device status.
+        self.topic_report_status = "mlops/report_device_status"
+
+        # The topic for OTA messages from MLOps.
+        self.topic_ota_msg = "mlops/flserver_agent_" + str(self.edge_id) + "/ota"
+
+        # The topic for requesting device info from the client.
+        self.topic_response_device_info = "client/server/response_device_info/" + str(self.edge_id)
+
+        # The topic for requesting device info from MLOps.
+        self.topic_request_device_info_from_mlops = f"deploy/mlops/master_agent/request_device_info/{self.edge_id}"
+
+        # The topic for getting job status from the status center.
+        self.topic_requesst_job_status = f"anywhere/master_agent/request_job_status/{self.edge_id}"
+
+        # The topic for getting the device status of a job from the status center.
+        self.topic_requesst_device_status_in_job = f"anywhere/master_agent/request_device_status_in_job/{self.edge_id}"
+
+        # The topic for reporting online status.
+        self.topic_active = "flserver_agent/active"
+
+        # The topic for last-will messages.
+        self.topic_last_will = "flserver_agent/last_will_msg"
+
+        # Subscribe to the topics for starting training, stopping training, and fetching client status.
+        self.subscribed_topics.clear()
+        self.add_subscribe_topic(self.topic_start_train)
+        self.add_subscribe_topic(self.topic_stop_train)
+        self.add_subscribe_topic(self.topic_complete_job)
+        self.add_subscribe_topic(self.topic_report_status)
+        self.add_subscribe_topic(self.topic_ota_msg)
+        self.add_subscribe_topic(self.topic_response_device_info)
+        self.add_subscribe_topic(self.topic_request_device_info_from_mlops)
+        self.add_subscribe_topic(self.topic_requesst_job_status)
+        self.add_subscribe_topic(self.topic_requesst_device_status_in_job)
+
+    @abstractmethod
+    def add_protocol_handler(self):
+        # Add the message listeners for all topics, the following is an example.
+ # self.add_message_listener(self.topic_start_train, self.callback_start_train) + # Add the message listeners for all topics + self.add_message_listener(self.topic_start_train, self.callback_start_train) + self.add_message_listener(self.topic_stop_train, self.callback_stop_train) + self.add_message_listener(self.topic_complete_job, self.callback_complete_job) + self.add_message_listener(self.topic_ota_msg, FedMLBaseMasterProtocolManager.callback_server_ota_msg) + self.add_message_listener(self.topic_report_status, self.callback_report_current_status) + self.add_message_listener(self.topic_response_device_info, self.callback_response_device_info) + self.add_message_listener(self.topic_request_device_info_from_mlops, + self.callback_request_device_info_from_mlops) + self.add_message_listener(self.topic_requesst_job_status, self.callback_request_job_status) + self.add_message_listener(self.topic_requesst_device_status_in_job, self.callback_request_device_status_in_job) + + @abstractmethod + def _get_job_runner_manager(self): + return None + + @abstractmethod + def _init_extra_items(self): + pass + + def add_subscribe_topic(self, topic): + self.subscribed_topics.append(topic) + + def on_agent_communication_connected(self, mqtt_client_object): + super().on_agent_communication_connected(mqtt_client_object) + + if self.run_as_cloud_server: + # Start the FedML cloud server + message_bytes = self.args.runner_cmd.encode("ascii") + base64_bytes = base64.b64decode(message_bytes) + payload = base64_bytes.decode("ascii") + self.receive_message_json(self.topic_start_train, payload) + + def callback_start_train(self, topic=None, payload=None): + # Fetch config from MLOps + # noinspection PyBroadException + try: + MLOpsConfigs.fetch_all_configs() + except Exception: + pass + + # Parse the parameters + # [NOTES] Example Request JSON: + # https://fedml-inc.larksuite.com/wiki/ScnIwUif9iupbjkYS0LuBrd6sod#WjbEdhYrvogmlGxKTOGu98C6sSb + request_json = json.loads(payload) + is_retain = request_json.get("is_retain", False) + if is_retain: + return + run_id = request_json["runId"] + run_id_str = str(run_id) + + # Process the log when running in the edge server mode. + if self.run_as_edge_server_and_agent or self.enable_simulation_cloud_agent: + # Start log processor for current run + self.args.run_id = run_id + self.args.edge_id = self.edge_id + MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) + MLOpsRuntimeLogDaemon.get_instance(self.args).start_log_processor( + run_id, self.edge_id, SchedulerConstants.get_log_source(request_json)) + # Process the log when running in the cloud agent mode. + elif self.run_as_cloud_agent: + # Start log processor for current run + MLOpsRuntimeLogDaemon.get_instance(self.args).start_log_processor( + run_id, request_json.get("server_id", "0"), SchedulerConstants.get_log_source(request_json) + ) + # Process the log when running in the cloud server mode. + elif self.run_as_cloud_server: + # Parse the parameters. + self.server_agent_id = request_json.get("cloud_agent_id", self.edge_id) + run_id = request_json["runId"] + run_id_str = str(run_id) + + # Start log processor for current run. 
+ self.args.run_id = run_id + MLOpsRuntimeLogDaemon.get_instance(self.args).start_log_processor( + run_id, self.edge_id, SchedulerConstants.get_log_source(request_json)) + + # Print the payload + logging.info("callback_start_train payload: {}".format(payload)) + logging.info( + f"FedMLDebug - run id {run_id}, Receive at callback_start_train: topic ({topic}), payload ({payload})" + ) + + # Save the parameters + self.start_request_json = payload + self.run_id = run_id + self.request_json = request_json + self.running_request_json[run_id_str] = request_json + edge_id_list = request_json.get("edgeids", list()) + self.run_edge_ids[run_id_str] = edge_id_list + + # report server running status to master agent + if not self.run_as_cloud_server: + self.mlops_metrics.report_server_id_status( + run_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_STARTING, edge_id=self.edge_id, + server_id=self.edge_id, server_agent_id=self.edge_id, running_json=payload) + + # Start server with multiprocessing mode + if self.run_as_edge_server_and_agent or self.enable_simulation_cloud_agent: + self.init_job_task(request_json) + + self.args.run_id = run_id + + self._get_job_runner_manager().start_job_runner( + run_id, request_json, args=self.args, edge_id=self.edge_id, + sender_message_queue=self.message_center.get_sender_message_queue(), + listener_message_queue=self.get_listener_message_queue(), + status_center_queue=self.get_status_queue() + ) + + process = self._get_job_runner_manager().get_runner_process(run_id) + if process is not None: + GeneralConstants.save_run_process(run_id, process.pid, is_master=True) + + self.send_status_msg_to_edges(edge_id_list, run_id, self.edge_id) + elif self.run_as_cloud_agent: + self.init_job_task(request_json) + + self._get_job_runner_manager().start_job_runner( + run_id, request_json, args=self.args, edge_id=self.edge_id, + sender_message_queue=self.message_center.get_sender_message_queue(), + listener_message_queue=self.get_listener_message_queue(), + status_center_queue=self.get_status_queue(), should_start_cloud_server=True, + use_local_process_as_cloud_server=self.use_local_process_as_cloud_server + ) + + process = self._get_job_runner_manager().get_runner_process(run_id, is_cloud_server=True) + if process is not None: + GeneralConstants.save_run_process(run_id, process.pid, is_master=True) + elif self.run_as_cloud_server: + self.server_agent_id = request_json.get("cloud_agent_id", self.edge_id) + self.start_request_json = json.dumps(request_json) + run_id = request_json["runId"] + run_id_str = str(run_id) + + self.init_job_task(request_json) + + self.args.run_id = run_id + + self._get_job_runner_manager().start_job_runner( + run_id, request_json, args=self.args, edge_id=self.edge_id, + sender_message_queue=self.message_center.get_sender_message_queue(), + listener_message_queue=self.get_listener_message_queue(), + status_center_queue=self.get_status_queue() + ) + + self.send_status_msg_to_edges(edge_id_list, run_id, self.edge_id) + + def callback_stop_train(self, topic, payload, use_payload=None): + # Print the payload + logging.info( + f"FedMLDebug - Receive: topic ({topic}), payload ({payload})" + ) + + # Parse the parameters. 
+ request_json = json.loads(payload) + run_id = request_json.get("runId", None) + run_id = request_json.get("id", None) if run_id is None else run_id + run_id_str = str(run_id) + edge_ids = request_json.get("edgeids", None) + server_id = request_json.get("serverId", None) + if server_id is None: + server_id = request_json.get("server_id", None) + server_agent_id = server_id + + # Cleanup the cached object + if self.running_request_json.get(run_id_str, None) is not None: + self.running_request_json.pop(run_id_str) + + # If it is the cloud agent, then forward the stopping request to the corresponding cloud server. + if self.run_as_cloud_agent: + server_agent_id = self.edge_id + topic_stop_train_to_cloud_server = f"mlops/flserver_agent_{server_id}/stop_train" + self.message_center.send_message(topic_stop_train_to_cloud_server, payload) + return + + # Reset all edge status and server status + for iter_edge_id in edge_ids: + self.generate_status_report(run_id, iter_edge_id, server_agent_id=server_agent_id).\ + report_client_id_status(iter_edge_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_KILLED, + run_id=run_id, server_id=server_id) + + # To be compatible to the previous version of edge devices, we just send the stopping train message to edges. + # Currently, the latest version of edge devices don't need to process the stopping train message. + self.send_training_stop_request_to_edges(edge_ids, payload=payload, run_id=run_id) + + def callback_complete_job(self, topic, payload): + # Parse the parameters. + request_json = json.loads(payload) + run_id = request_json.get("runId", None) + run_id = request_json.get("id", None) if run_id is None else run_id + run_id_str = str(run_id) + server_id = request_json.get("serverId", None) + if server_id is None: + server_id = request_json.get("server_id", None) + + self._process_job_complete_status(run_id, server_id, request_json) + + def _process_job_complete_status(self, run_id, server_id, complete_payload): + pass + + def callback_run_logs(self, topic, payload): + run_id = str(topic).split('/')[-1] + run_id_str = str(run_id) + self._get_job_runner_manager().callback_run_logs(run_id, topic, payload) + + def callback_run_metrics(self, topic, payload): + run_id = str(topic).split('/')[-1] + run_id_str = str(run_id) + self._get_job_runner_manager().callback_run_metrics(run_id, topic, payload) + + def callback_edge_status(self, topic, payload): + self.send_status_message(topic, payload) + + def callback_report_current_status(self, topic, payload): + logging.info( + f"FedMLDebug - Receive: topic ({topic}), payload ({payload})" + ) + + if self.run_as_edge_server_and_agent: + self.send_agent_active_msg(self.edge_id) + elif self.run_as_cloud_agent: + self.send_agent_active_msg(self.edge_id) + elif self.run_as_cloud_server: + pass + + @staticmethod + def callback_server_ota_msg(topic, payload): + logging.info( + f"FedMLDebug - Receive: topic ({topic}), payload ({payload})" + ) + + request_json = json.loads(payload) + cmd = request_json["cmd"] + + if cmd == GeneralConstants.FEDML_OTA_CMD_UPGRADE: + # noinspection PyBroadException + try: + FedMLOtaUpgrade.process_ota_upgrade_msg() + # Process(target=FedMLServerRunner.process_ota_upgrade_msg).start() + raise Exception("After upgraded, restart runner...") + except Exception as e: + pass + elif cmd == GeneralConstants.FEDML_OTA_CMD_RESTART: + raise Exception("Restart runner...") + + def callback_response_device_info(self, topic, payload): + # Parse payload + payload_json = json.loads(payload) + run_id = 
payload_json.get("run_id", 0) + context = payload_json.get("context", None) + master_device_id = payload_json.get("master_device_id", 0) + slave_device_id = payload_json.get("slave_device_id", 0) + slave_device_id_list = payload_json.get("slave_device_id_list", 0) + edge_id = payload_json.get("edge_id", 0) + device_info = payload_json.get("edge_info", 0) + device_info["master_device_id"] = master_device_id + device_info["slave_device_id"] = slave_device_id + device_info["slave_device_id_list"] = slave_device_id_list + run_id_str = str(run_id) + + # Put device info into a multiprocessing queue so master runner checks if all edges are ready + if context is None: + self._get_job_runner_manager().put_run_edge_device_info_to_queue(run_id, edge_id, device_info) + + # if self.run_edge_device_info_global_queue is None: + # self.run_edge_device_info_global_queue = Array('i', list()) + # + # self.run_edge_device_info_global_queue[len(self.run_edge_device_info_global_queue)] = \ + # {"timestamp": time.time(), "edge_id": edge_id, "device_info": device_info} + + request_json = self.running_request_json.get(str(run_id), None) + if request_json is not None: + self.deploy_job_launcher.check_model_device_ready_and_deploy( + request_json, run_id, master_device_id, slave_device_id, run_edge_ids=self.run_edge_ids) + + def callback_request_device_info_from_mlops(self, topic, payload): + self.response_device_info_to_mlops(topic, payload) + + def callback_request_job_status(self, topic, payload): + self.response_job_status(topic, payload) + + def callback_request_device_status_in_job(self, topic, payload): + self.response_device_status_in_job(topic, payload) + + def generate_protocol_manager(self): + message_status_runner = self._generate_protocol_manager_instance( + self.args, agent_config=self.agent_config + ) + message_status_runner.async_check_timeout = self.async_check_timeout + message_status_runner.enable_async_cluster = self.enable_async_cluster + message_status_runner.request_json = self.request_json + message_status_runner.run_edge_ids = self.run_edge_ids + message_status_runner.version = self.version + message_status_runner.message_center_name = self.message_center_name + message_status_runner.run_id = self.run_id + message_status_runner.edge_id = self.edge_id + message_status_runner.server_agent_id = self.server_agent_id + message_status_runner.current_device_id = self.current_device_id + message_status_runner.unique_device_id = self.unique_device_id + message_status_runner.subscribed_topics = self.subscribed_topics + message_status_runner.run_as_cloud_agent = self.run_as_cloud_agent + message_status_runner.run_as_cloud_server = self.run_as_cloud_server + message_status_runner.run_as_edge_server_and_agent = self.run_as_edge_server_and_agent + message_status_runner.run_as_cloud_server_and_agent = self.run_as_cloud_server_and_agent + message_status_runner.enable_simulation_cloud_agent = self.enable_simulation_cloud_agent + message_status_runner.use_local_process_as_cloud_server = self.use_local_process_as_cloud_server + message_status_runner.running_request_json = self.running_request_json + message_status_runner.start_request_json = self.start_request_json + message_status_runner.user_name = self.user_name + message_status_runner.status_queue = self.get_status_queue() + + return message_status_runner + + def response_job_status(self, topic, payload): + payload_json = json.loads(payload) + if self.mlops_metrics is not None: + run_id = payload_json.get("run_id", None) + edge_id = 
payload_json.get("edge_id", None) + if run_id is None or edge_id is None: + return + response_topic = f"master_agent/somewhere/response_job_status/{edge_id}" + response_payload = { + "run_id": run_id, + "master_agent": self.edge_id, + "edge_id": edge_id, + "job_status": ComputeCacheManager.get_instance().get_status_cache().get_job_status(), + "fedml_version": fedml.__version__ + } + self.mlops_metrics.report_json_message(response_topic, json.dumps(response_payload)) + + def response_device_status_in_job(self, topic, payload): + payload_json = json.loads(payload) + if self.mlops_metrics is not None: + run_id = payload_json.get("run_id", None) + edge_id = payload_json.get("edge_id", None) + if run_id is None or edge_id is None: + return + response_topic = f"master_agent/somewhere/response_device_status_in_job/{edge_id}" + response_payload = { + "run_id": run_id, + "master_agent": self.edge_id, + "edge_id": edge_id, + "device_status_in_job": + ComputeCacheManager.get_instance().get_status_cache().get_device_status_in_job(run_id, edge_id), + "fedml_version": fedml.__version__ + } + self.mlops_metrics.report_json_message(response_topic, json.dumps(response_payload)) + + def response_device_info_to_mlops(self, topic, payload): + response_topic = f"deploy/master_agent/mlops/response_device_info" + if self.mlops_metrics is not None: + response_payload = {"run_id": self.run_id, "master_agent_device_id": self.edge_id, + "fedml_version": fedml.__version__, "edge_id": self.edge_id} + self.mlops_metrics.report_json_message(response_topic, json.dumps(response_payload)) + + def init_job_task(self, request_json): + run_id = request_json["runId"] + run_config = request_json["run_config"] + edge_ids = request_json["edgeids"] + run_params = run_config.get("parameters", {}) + job_yaml = run_params.get("job_yaml", None) + server_id = request_json["server_id"] + if self.run_as_cloud_agent: + server_id = self.edge_id + + self.setup_listeners_for_edge_status(run_id, edge_ids, server_id) + self.setup_listener_for_run_metrics(run_id) + self.setup_listener_for_run_logs(run_id) + + def setup_listeners_for_edge_status(self, run_id, edge_ids, server_id): + edge_status_topic = "fl_client/flclient_agent_" + str(server_id) + "/status" + payload = {"run_id": run_id, "init_all_edge_id_list": edge_ids, "init_server_id": server_id} + self.callback_edge_status(edge_status_topic, json.dumps(payload)) + + for edge_id in edge_ids: + edge_status_topic = "fl_client/flclient_agent_" + str(edge_id) + "/status" + self.add_message_listener(edge_status_topic, self.callback_edge_status) + self.subscribe_msg(edge_status_topic) + + def remove_listeners_for_edge_status(self, edge_ids=None): + if edge_ids is None: + edge_ids = self.request_json["edgeids"] + + for edge_id in edge_ids: + edge_status_topic = "fl_client/flclient_agent_" + str(edge_id) + "/status" + self.unsubscribe_msg(edge_status_topic) + + def setup_listener_for_run_metrics(self, run_id): + metric_topic = f"fedml_slave/fedml_master/metrics/{run_id}" + self.add_message_listener(metric_topic, self.callback_run_metrics) + self.subscribe_msg(metric_topic) + + def remove_listener_for_run_metrics(self, run_id): + metric_topic = f"fedml_slave/fedml_master/metrics/{run_id}" + self.unsubscribe_msg(metric_topic) + + def setup_listener_for_run_logs(self, run_id): + logs_topic = f"fedml_slave/fedml_master/logs/{run_id}" + self.add_message_listener(logs_topic, self.callback_run_logs) + self.subscribe_msg(logs_topic) + + def remove_listener_for_run_logs(self, run_id): + logs_topic = 
f"fedml_slave/fedml_master/logs/{run_id}" + self.unsubscribe_msg(logs_topic) + + def send_training_stop_request_to_edges( + self, edge_id_list, payload=None, run_id=0): + if payload is None: + payload_obj = {"runId": run_id, "edgeids": edge_id_list} + payload = json.dumps(payload_obj) + + for edge_id in edge_id_list: + topic_stop_train = "flserver_agent/" + str(edge_id) + "/stop_train" + logging.info("stop_train: send topic " + topic_stop_train) + self.message_center.send_message(topic_stop_train, payload) + + def send_training_stop_request_to_specific_edge(self, edge_id, payload): + topic_stop_train = "flserver_agent/" + str(edge_id) + "/stop_train" + logging.info("stop_train: send topic " + topic_stop_train) + self.message_center.send_message(topic_stop_train, payload) + + def send_training_stop_request_to_cloud_server(self, edge_id, payload): + topic_stop_train = "mlops/flserver_agent_" + str(edge_id) + "/stop_train" + logging.info("stop_train: send topic " + topic_stop_train) + self.message_center.send_message(topic_stop_train, payload) + + def send_status_check_msg(self, run_id, edge_id, server_id, context=None): + topic_status_check = f"server/client/request_device_info/{edge_id}" + payload = {"server_id": server_id, "run_id": run_id} + if context is not None: + payload["context"] = context + self.message_center.send_message(topic_status_check, json.dumps(payload)) + + def send_status_msg_to_edges(self, edge_id_list, run_id, server_id, context=None): + # Send status message to all edges + for edge_id in edge_id_list: + self.send_status_check_msg(run_id, edge_id, self.edge_id, context=context) + + def report_exception_status(self, run_id): + self.mlops_metrics.report_job_status(run_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_EXCEPTION) + + @staticmethod + def get_start_train_topic_with_edge_id(edge_id): + return "mlops/flserver_agent_" + str(edge_id) + "/start_train" + + @abstractmethod + def _generate_protocol_manager_instance(self, args, agent_config=None): + return None diff --git a/python/fedml/computing/scheduler/master/cloud_server_manager.py b/python/fedml/computing/scheduler/master/cloud_server_manager.py new file mode 100755 index 0000000000..040a0f38a3 --- /dev/null +++ b/python/fedml/computing/scheduler/master/cloud_server_manager.py @@ -0,0 +1,177 @@ +import base64 +import json +import logging +import os +import traceback + +import fedml +from fedml.computing.scheduler.comm_utils.sys_utils import get_python_program + + +class FedMLCloudServerManager: + FEDML_CLOUD_SERVER_PREFIX = "fedml-server-run-" + LOCAL_RUNNER_INFO_DIR_NAME = 'runner_infos' + STATUS_IDLE = "IDLE" + FEDML_SERVER_BASE_IMAGE = "/fedml-device-image:" + + def __init__(self, args, run_id=None, edge_id=None, request_json=None, agent_config=None, version=None): + self.server_docker_image = None + self.args = args + self.run_id = run_id + self.edge_id = edge_id + self.request_json = request_json + self.agent_config = agent_config + if version is None: + version = fedml.get_env_version() + self.version = version + image_version = self.version + if image_version == "local": + image_version = "test" + self.server_docker_base_image = FedMLCloudServerManager._get_server_base_image(image_version) + self.cloud_server_name = None + + @staticmethod + def start_local_cloud_server(user, version, cloud_device_id, runner_cmd_encoded): + print(f"start cloud server, device id {cloud_device_id}, runner cmd {runner_cmd_encoded}") + pip_source_dir = os.path.dirname(__file__) + login_cmd = os.path.join(pip_source_dir, 
"server_login.py") + run_cmd = f"{get_python_program()} -W ignore {login_cmd} -t login -r cloud_server -u {str(user)} " \ + f"-v {version} -id {cloud_device_id} -rc {runner_cmd_encoded}" + os.system(run_cmd) + + def start_cloud_server_process_entry(self): + try: + self.start_cloud_server_process() + except Exception as e: + logging.info(f"Failed to start the cloud server. {traceback.format_exc()}") + + def start_cloud_server_process(self): + run_config = self.request_json["run_config"] + packages_config = run_config["packages_config"] + self.start_cloud_server(packages_config) + + def start_cloud_server(self, packages_config): + server_id = self.request_json["server_id"] + self.cloud_server_name = f"{FedMLCloudServerManager.FEDML_CLOUD_SERVER_PREFIX}{self.run_id}-{server_id}" + self.server_docker_image = ( + self.agent_config["docker_config"]["registry_server"] + + self.agent_config["docker_config"]["registry_dir"] + + self.server_docker_base_image + ) + + logging.info("docker image {}".format(self.server_docker_image)) + # logging.info("file_sys_driver {}".format(self.agent_config["docker_config"]["file_sys_driver"])) + + registry_secret_cmd = ( + "kubectl create namespace fedml-devops-aggregator-" + + self.version + + ";kubectl -n fedml-devops-aggregator-" + + self.version + + " delete secret secret-" + + self.cloud_server_name + + " ;kubectl create secret docker-registry secret-" + + self.cloud_server_name + + " --docker-server=" + + self.agent_config["docker_config"]["registry_server"] + + " --docker-username=" + + self.agent_config["docker_config"]["user_name"] + + " --docker-password=$(aws ecr-public get-login-password --region " + + self.agent_config["docker_config"]["public_cloud_region"] + + ")" + + " --docker-email=fedml@fedml.ai -n fedml-devops-aggregator-" + + self.version + ) + logging.info("Create secret cmd: " + registry_secret_cmd) + os.system(registry_secret_cmd) + + message_bytes = json.dumps(self.request_json).encode("ascii") + base64_bytes = base64.b64encode(message_bytes) + runner_cmd_encoded = base64_bytes.decode("ascii") + logging.info("runner_cmd_encoded: {}".format(runner_cmd_encoded)) + # logging.info("runner_cmd_decoded: {}".format(base64.b64decode(runner_cmd_encoded).decode())) + cur_dir = os.path.dirname(__file__) + run_deployment_cmd = ( + "export FEDML_AGGREGATOR_NAME=" + + self.cloud_server_name + + ";export FEDML_AGGREGATOR_SVC=" + + self.cloud_server_name + + ";export FEDML_AGGREGATOR_VERSION=" + + self.version + + ';export FEDML_AGGREGATOR_IMAGE_PATH="' + + self.server_docker_image + + '"' + + ";export FEDML_CONF_ID=" + + self.cloud_server_name + + ";export FEDML_DATA_PV_ID=" + + self.cloud_server_name + + ";export FEDML_DATA_PVC_ID=" + + self.cloud_server_name + + ";export FEDML_REGISTRY_SECRET_SUFFIX=" + + self.cloud_server_name + + ";export FEDML_ACCOUNT_ID=0" + + ";export FEDML_SERVER_DEVICE_ID=" + + self.request_json.get("cloudServerDeviceId", "0") + + ";export FEDML_VERSION=" + + self.version + + ";export FEDML_PACKAGE_NAME=" + + packages_config.get("server", "") + + ";export FEDML_PACKAGE_URL=" + + packages_config.get("serverUrl", "") + + ";export FEDML_RUNNER_CMD=" + + runner_cmd_encoded + + ";envsubst < " + + os.path.join(cur_dir, "templates", "fedml-server-deployment.yaml") + + " | kubectl apply -f - " + ) + logging.info("start run with k8s: " + run_deployment_cmd) + os.system(run_deployment_cmd) + + @staticmethod + def stop_cloud_server(run_id, server_id, agent_config): + cloud_server_name = 
FedMLCloudServerManager._get_cloud_server_name(run_id, server_id) + server_docker_image = ( + agent_config["docker_config"]["registry_server"] + + agent_config["docker_config"]["registry_dir"] + + FedMLCloudServerManager._get_server_base_image(fedml.get_env_version()) + ) + delete_deployment_cmd = ( + "export FEDML_AGGREGATOR_NAME=" + + cloud_server_name + + ";export FEDML_AGGREGATOR_SVC=" + + cloud_server_name + + ";export FEDML_AGGREGATOR_VERSION=" + + fedml.get_env_version() + + ';export FEDML_AGGREGATOR_IMAGE_PATH="' + + server_docker_image + + '"' + + ";export FEDML_CONF_ID=" + + cloud_server_name + + ";export FEDML_DATA_PV_ID=" + + cloud_server_name + + ";export FEDML_DATA_PVC_ID=" + + cloud_server_name + + ";export FEDML_REGISTRY_SECRET_SUFFIX=" + + cloud_server_name + + ";kubectl -n fedml-devops-aggregator-" + + fedml.get_env_version() + + " delete deployment " + + cloud_server_name + + ";kubectl -n fedml-devops-aggregator-" + + fedml.get_env_version() + + " delete svc " + + cloud_server_name + + ";kubectl -n fedml-devops-aggregator-" + + fedml.get_env_version() + + " delete secret secret-" + + cloud_server_name + ) + logging.info("stop run with k8s: " + delete_deployment_cmd) + os.system(delete_deployment_cmd) + + @staticmethod + def _get_server_base_image(version): + return f"{FedMLCloudServerManager.FEDML_SERVER_BASE_IMAGE}{version}" + + @staticmethod + def _get_cloud_server_name(run_id, server_id): + return f"{FedMLCloudServerManager.FEDML_CLOUD_SERVER_PREFIX}{run_id}-{server_id}" diff --git a/python/fedml/computing/scheduler/master/deploy_job_launcher.py b/python/fedml/computing/scheduler/master/deploy_job_launcher.py new file mode 100755 index 0000000000..50e4517547 --- /dev/null +++ b/python/fedml/computing/scheduler/master/deploy_job_launcher.py @@ -0,0 +1,95 @@ +import json +from fedml.computing.scheduler.comm_utils import sys_utils +from fedml.computing.scheduler.model_scheduler import device_client_constants +from fedml.computing.scheduler.model_scheduler.device_model_cards import FedMLModelCards +from fedml.computing.scheduler.scheduler_entry.constants import Constants + + +class FedMLDeployJobLauncher: + LOCAL_RUNNER_INFO_DIR_NAME = 'runner_infos' + STATUS_IDLE = "IDLE" + + def __init__(self, edge_id=None): + self.edge_id = edge_id + self.run_model_device_ids = dict() + + @staticmethod + def deploy_model(serving_devices, request_json, run_id): + run_config = request_json["run_config"] + run_params = run_config.get("parameters", {}) + job_yaml = run_params.get("job_yaml", {}) + job_type = job_yaml.get("job_type", None) + job_type = job_yaml.get("task_type", Constants.JOB_TASK_TYPE_TRAIN) if job_type is None else job_type + if job_type == Constants.JOB_TASK_TYPE_DEPLOY or job_type == Constants.JOB_TASK_TYPE_SERVE: + # computing = job_yaml.get("computing", {}) + # num_gpus = computing.get("minimum_num_gpus", 1) + serving_args = run_params.get("serving_args", {}) + model_id = serving_args.get("model_id", None) + model_name = serving_args.get("model_name", None) + model_version = serving_args.get("model_version", None) + # model_storage_url = serving_args.get("model_storage_url", None) + endpoint_name = serving_args.get("endpoint_name", None) + endpoint_id = serving_args.get("endpoint_id", None) + random = serving_args.get("random", "") + random_out = sys_utils.random2(random, "FEDML@9999GREAT") + random_list = random_out.split("FEDML@") + device_type = device_client_constants.ClientConstants.login_role_list[ + 
device_client_constants.ClientConstants.LOGIN_MODE_FEDML_CLOUD_INDEX] + FedMLModelCards.get_instance().deploy_model( + model_name, device_type, json.dumps(serving_devices), + "", random_list[1], None, + in_model_id=model_id, in_model_version=model_version, + endpoint_name=endpoint_name, endpoint_id=endpoint_id, run_id=run_id) + return endpoint_id + return None + + def check_model_device_ready_and_deploy(self, request_json, run_id, master_device_id, + slave_device_id, run_edge_ids=None): + run_config = request_json["run_config"] + run_params = run_config.get("parameters", {}) + job_yaml = run_params.get("job_yaml", {}) + job_type = job_yaml.get("job_type", None) + job_type = job_yaml.get("task_type", Constants.JOB_TASK_TYPE_TRAIN) if job_type is None else job_type + if job_type != Constants.JOB_TASK_TYPE_DEPLOY and job_type != Constants.JOB_TASK_TYPE_SERVE: + return + + # Init model device ids for each run + run_id_str = str(run_id) + if self.run_model_device_ids.get(run_id_str, None) is None: + self.run_model_device_ids[run_id_str] = list() + + # Append master device and slave devices to the model devices map + self.run_model_device_ids[run_id_str].append({"master_device_id": master_device_id, + "slave_device_id": slave_device_id}) + model_device_ids = self.run_model_device_ids.get(run_id_str, None) + if model_device_ids is None: + return + if run_edge_ids is None: + return + + # Check if all model devices are ready + if len(model_device_ids) != len(run_edge_ids.get(run_id_str, list())): + return + + # Generate model master ids and model slave device ids + device_master_ids = list() + device_slave_ids = list() + for device_ids in model_device_ids: + model_master_id = device_ids.get("master_device_id") + model_slave_id = device_ids.get("slave_device_id") + device_master_ids.append(model_master_id) + device_slave_ids.append(model_slave_id) + + if len(device_master_ids) <= 0: + return + + # Generate serving devices for deploying + serving_devices = list() + serving_devices.append(device_master_ids[0]) + serving_devices.extend(device_slave_ids) + + # Start to deploy the model + FedMLDeployJobLauncher.deploy_model(serving_devices, request_json, run_id=run_id) + + + diff --git a/python/fedml/computing/scheduler/master/launch_job_runner.py b/python/fedml/computing/scheduler/master/launch_job_runner.py new file mode 100755 index 0000000000..3f26da1ef7 --- /dev/null +++ b/python/fedml/computing/scheduler/master/launch_job_runner.py @@ -0,0 +1,44 @@ + +from ..master.server_constants import ServerConstants +from ..scheduler_core.general_constants import GeneralConstants +from .base_master_job_runner import FedMLBaseMasterJobRunner + + +class FedMLLaunchMasterJobRunner(FedMLBaseMasterJobRunner): + + def __init__(self, args, run_id=0, request_json=None, agent_config=None, edge_id=0, + cuda_visible_gpu_ids_str=None): + FedMLBaseMasterJobRunner.__init__( + self, args, edge_id=edge_id, request_json=request_json, agent_config=agent_config, run_id=run_id, + cuda_visible_gpu_ids_str=cuda_visible_gpu_ids_str, agent_data_dir=ServerConstants.get_data_dir(), + agent_package_download_dir=ServerConstants.get_package_download_dir(), + agent_package_unzip_dir=GeneralConstants.get_package_unzip_dir(ServerConstants.get_package_download_dir()), + agent_log_file_dir=ServerConstants.get_log_file_dir() + ) + + # Override + def _generate_job_runner_instance(self, args, run_id=None, request_json=None, agent_config=None, edge_id=None,): + return FedMLLaunchMasterJobRunner( + args, run_id=run_id, request_json=request_json, 
agent_config=agent_config, edge_id=edge_id + ) + + # Override + def _generate_extend_queue_list(self): + return None + + # Override + def get_download_package_info(self, packages_config=None): + return super().get_download_package_info(packages_config) + + # Override + def run_impl( + self, edge_id_status_queue, edge_device_info_queue, run_metrics_queue, + run_event_queue, run_artifacts_queue, run_logs_queue, edge_device_info_global_queue, + run_extend_queue_list=None, sender_message_queue=None, listener_message_queue=None, + status_center_queue=None + ): + super().run_impl( + edge_id_status_queue, edge_device_info_queue, run_metrics_queue, + run_event_queue, run_artifacts_queue, run_logs_queue, edge_device_info_global_queue, + run_extend_queue_list=run_extend_queue_list, sender_message_queue=sender_message_queue, + listener_message_queue=listener_message_queue, status_center_queue=status_center_queue) diff --git a/python/fedml/computing/scheduler/master/launch_job_runner_manager.py b/python/fedml/computing/scheduler/master/launch_job_runner_manager.py new file mode 100755 index 0000000000..9e94b089a3 --- /dev/null +++ b/python/fedml/computing/scheduler/master/launch_job_runner_manager.py @@ -0,0 +1,20 @@ + +from fedml.core.common.singleton import Singleton +from .launch_job_runner import FedMLLaunchMasterJobRunner +from .base_master_job_runner_manager import FedMLBaseMasterJobRunnerManager + + +class FedMLLaunchJobRunnerManager(FedMLBaseMasterJobRunnerManager, Singleton): + def __init__(self): + FedMLBaseMasterJobRunnerManager.__init__(self) + + @staticmethod + def get_instance(): + return FedMLLaunchJobRunnerManager() + + # Override + def _generate_job_runner_instance( + self, args, run_id=None, request_json=None, agent_config=None, edge_id=None + ): + return FedMLLaunchMasterJobRunner( + args, run_id=run_id, request_json=request_json, agent_config=agent_config, edge_id=edge_id) diff --git a/python/fedml/computing/scheduler/master/master_agent.py b/python/fedml/computing/scheduler/master/master_agent.py new file mode 100755 index 0000000000..9bbf6eb982 --- /dev/null +++ b/python/fedml/computing/scheduler/master/master_agent.py @@ -0,0 +1,28 @@ + +from ..master.server_constants import ServerConstants +from .server_data_interface import FedMLServerDataInterface +from .master_protocol_manager import FedMLLaunchMasterProtocolManager +from .base_master_agent import FedMLBaseMasterAgent + + +class FedMLLaunchMasterAgent(FedMLBaseMasterAgent): + + def __init__(self): + FedMLBaseMasterAgent.__init__(self) + + # Override + def _get_log_file_dir(self): + return ServerConstants.get_log_file_dir() + + # Override + def _save_agent_info(self, unique_device_id, edge_id): + ServerConstants.save_runner_infos(unique_device_id, edge_id) + + # Override + def _init_database(self): + FedMLServerDataInterface.get_instance().create_job_table() + + # Override + def _generate_protocol_manager_instance(self, args, agent_config=None): + return FedMLLaunchMasterProtocolManager(args, agent_config=agent_config) + diff --git a/python/fedml/computing/scheduler/master/master_protocol_manager.py b/python/fedml/computing/scheduler/master/master_protocol_manager.py new file mode 100755 index 0000000000..ca9621e41d --- /dev/null +++ b/python/fedml/computing/scheduler/master/master_protocol_manager.py @@ -0,0 +1,43 @@ +from abc import ABC + +from .base_master_protocol_manager import FedMLBaseMasterProtocolManager +from .launch_job_runner_manager import FedMLLaunchJobRunnerManager + + +class 
FedMLLaunchMasterProtocolManager(FedMLBaseMasterProtocolManager, ABC): + def __init__(self, args, agent_config=None): + FedMLBaseMasterProtocolManager.__init__(self, args, agent_config=agent_config) + + # Override + def generate_topics(self): + super().generate_topics() + + # Override + def add_protocol_handler(self): + super().add_protocol_handler() + + # Override + def _generate_protocol_manager_instance(self, args, agent_config=None): + return FedMLLaunchMasterProtocolManager(args, agent_config=agent_config) + + # Override + def _get_job_runner_manager(self): + return FedMLLaunchJobRunnerManager.get_instance() + + # Override + def _init_extra_items(self): + # Start the monitor process + self.mlops_metrics.stop_device_realtime_perf() + self.mlops_metrics.report_device_realtime_perf( + self.args, self.args.agent_config["mqtt_config"], is_client=False) + + # Override + def print_connected_info(self): + super().print_connected_info() + + # Override + def _process_job_complete_status(self, run_id, server_id, complete_payload): + # Complete the job runner + self._get_job_runner_manager().complete_job_runner( + run_id, args=self.args, server_id=server_id, request_json=complete_payload, + run_as_cloud_agent=self.run_as_cloud_agent, run_as_cloud_server=self.run_as_cloud_server) diff --git a/python/fedml/computing/scheduler/master/server_constants.py b/python/fedml/computing/scheduler/master/server_constants.py index 058c57e199..ebd8b2aef6 100644 --- a/python/fedml/computing/scheduler/master/server_constants.py +++ b/python/fedml/computing/scheduler/master/server_constants.py @@ -255,6 +255,20 @@ def get_dataset_url(): ServerConstants.get_mlops_url()) return create_dataset_url + @staticmethod + def get_presigned_multi_part_url(): + get_presigned_multi_part_url = "{}/system/api/v1/cli/oss/multipart/presigned-url".format( + ServerConstants.get_mlops_url() + ) + return get_presigned_multi_part_url + + @staticmethod + def get_complete_multipart_upload_url(): + complete_multipart_upload_url = "{}/system/api/v1/cli/oss/multipart/upload/complete".format( + ServerConstants.get_mlops_url() + ) + return complete_multipart_upload_url + @staticmethod def list_dataset_url(): list_dataset_url = "{}/fedmlOpsServer/api/v1/cli/dataset/list".format( @@ -268,9 +282,10 @@ def get_dataset_metadata_url(): return get_dataset_metadata_url @staticmethod - def cleanup_run_process(run_id): + def cleanup_run_process(run_id, not_kill_subprocess=False): RunProcessUtils.cleanup_run_process( - run_id, ServerConstants.get_data_dir(), ServerConstants.LOCAL_RUNNER_INFO_DIR_NAME) + run_id, ServerConstants.get_data_dir(), ServerConstants.LOCAL_RUNNER_INFO_DIR_NAME, + not_kill_subprocess=not_kill_subprocess) @staticmethod def save_run_process(run_id, process_id): diff --git a/python/fedml/computing/scheduler/master/server_login.py b/python/fedml/computing/scheduler/master/server_login.py index dee2c83236..8dd0696bc8 100755 --- a/python/fedml/computing/scheduler/master/server_login.py +++ b/python/fedml/computing/scheduler/master/server_login.py @@ -1,407 +1,11 @@ import argparse -import logging import os -import platform -import time -import traceback - -import click import fedml -from fedml.computing.scheduler.comm_utils import sys_utils -from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants -from fedml.computing.scheduler.master.server_runner import FedMLServerRunner -from fedml.computing.scheduler.master.server_constants import ServerConstants -from fedml.core.mlops.mlops_runtime_log import MLOpsRuntimeLog 
-from fedml.core.mlops.mlops_runtime_log_daemon import MLOpsRuntimeLogDaemon - - -def __login_as_edge_server_and_agent(args, userid, version, api_key="", use_extra_device_id_suffix=None, role=None): - setattr(args, "account_id", userid) - setattr(args, "current_running_dir", ServerConstants.get_fedml_home_dir()) - - sys_name = platform.system() - if sys_name == "Darwin": - sys_name = "MacOS" - if hasattr(args, "os_name") and args.os_name is not None and args.os_name != "": - pass - else: - setattr(args, "os_name", sys_name) - setattr(args, "version", version) - setattr(args, "log_file_dir", ServerConstants.get_log_file_dir()) - is_from_docker = False - if hasattr(args, "device_id") and args.device_id is not None and args.device_id != "0": - setattr(args, "current_device_id", args.device_id) - is_from_docker = True - else: - setattr(args, "current_device_id", FedMLServerRunner.get_device_id()) - setattr(args, "config_version", version) - setattr(args, "cloud_region", "") - - # Create server runner for communication with the FedML client. - runner = FedMLServerRunner(args) - runner.run_as_edge_server_and_agent = True - - # Fetch configs from the MLOps config server. - service_config = dict() - config_try_count = 0 - edge_id = 0 - while config_try_count < 5: - try: - mqtt_config, s3_config, mlops_config, docker_config = runner.fetch_configs() - service_config["mqtt_config"] = mqtt_config - service_config["s3_config"] = s3_config - service_config["ml_ops_config"] = mlops_config - service_config["docker_config"] = docker_config - runner.agent_config = service_config - log_server_url = mlops_config.get("LOG_SERVER_URL", None) - if log_server_url is not None: - setattr(args, "log_server_url", log_server_url) - setattr(runner.args, "log_server_url", log_server_url) - break - except Exception as e: - click.echo("{}\n{}".format(SchedulerConstants.ERR_MSG_BINDING_EXCEPTION_1, traceback.format_exc())) - click.echo(SchedulerConstants.ERR_MSG_BINDING_EXIT_RETRYING) - config_try_count += 1 - time.sleep(3) - continue - - if config_try_count >= 5: - click.echo("") - click.echo("[5] Oops, you failed to login the FedML MLOps platform.") - click.echo("Please check whether your network is normal!") - return - - # Judge whether running from fedml docker hub - is_from_fedml_docker_hub = False - dock_loc_file = ServerConstants.get_docker_location_file() - if os.path.exists(dock_loc_file): - is_from_fedml_docker_hub = True - - # Build unique device id - if is_from_docker: - unique_device_id = args.current_device_id + "@" + args.os_name + ".Docker.Edge.Server" - else: - unique_device_id = args.current_device_id + "@" + args.os_name + ".Edge.Server" - setattr(args, "is_from_docker", is_from_docker) - - if is_from_fedml_docker_hub: - unique_device_id = args.current_device_id + "@" + args.os_name + ".DockerHub.Edge.Server" - - if use_extra_device_id_suffix is not None: - unique_device_id = args.current_device_id + "@" + args.os_name + use_extra_device_id_suffix - - # Bind account id to FedML® Nexus AI Platform - register_try_count = 0 - edge_id = -1 - user_name = None - while register_try_count < 5: - try: - edge_id, user_name, extra_url = runner.bind_account_and_device_id( - service_config["ml_ops_config"]["EDGE_BINDING_URL"], args.account_id, unique_device_id, args.os_name, - api_key=api_key, role=role - ) - if edge_id > 0: - runner.edge_id = edge_id - break - except SystemExit as e: - click.echo("Your account does not exist. 
Please make sure your account correct.") - os.system("fedml logout -s") - return - except Exception as e: - click.echo("{}\n{}".format(SchedulerConstants.ERR_MSG_BINDING_EXCEPTION_2, traceback.format_exc())) - click.echo(SchedulerConstants.ERR_MSG_BINDING_EXIT_RETRYING) - register_try_count += 1 - time.sleep(3) - continue - - if edge_id <= 0: - click.echo("") - click.echo("[6] Oops, you failed to login the FedML MLOps platform.") - click.echo("Please check whether your network is normal!") - return - setattr(args, "server_id", edge_id) - runner.args = args - runner.edge_id = edge_id - init_logs(args, edge_id) - - # Log arguments and binding results. - # logging.info("login: unique_device_id = %s" % str(unique_device_id)) - # logging.info("login: server_id = %s" % str(edge_id)) - runner.unique_device_id = unique_device_id - runner.user_name = user_name - ServerConstants.save_runner_infos(args.current_device_id + "." + args.os_name, edge_id) - - # Setup MQTT connection for communication with the FedML server. - try: - runner.setup_agent_mqtt_connection(service_config) - except Exception as e: - login_exit_file = os.path.join(ServerConstants.get_log_file_dir(), "exited.log") - with open(login_exit_file, "w") as f: - f.writelines(f"{os.getpid()}.") - runner.stop_agent() - raise e - - # Start mqtt looper - runner.start_agent_mqtt_loop() - - -def __login_as_cloud_agent(args, userid, version): - setattr(args, "account_id", userid) - setattr(args, "current_running_dir", ServerConstants.get_fedml_home_dir()) - - sys_name = platform.system() - if sys_name == "Darwin": - sys_name = "MacOS" - setattr(args, "os_name", sys_name) - setattr(args, "version", version) - setattr(args, "log_file_dir", ServerConstants.get_log_file_dir()) - if hasattr(args, "device_id") and args.device_id is not None and args.device_id != "0": - setattr(args, "current_device_id", args.device_id) - else: - setattr(args, "current_device_id", FedMLServerRunner.get_device_id()) - setattr(args, "config_version", version) - setattr(args, "cloud_region", "") - - # Create server runner for communication with the FedML client. - runner = FedMLServerRunner(args) - runner.run_as_cloud_agent = True - - # Fetch configs from the MLOps config server. 
- service_config = dict() - config_try_count = 0 - edge_id = 0 - while config_try_count < 5: - try: - mqtt_config, s3_config, mlops_config, docker_config = runner.fetch_configs() - service_config["mqtt_config"] = mqtt_config - service_config["s3_config"] = s3_config - service_config["ml_ops_config"] = mlops_config - service_config["docker_config"] = docker_config - runner.agent_config = service_config - log_server_url = mlops_config.get("LOG_SERVER_URL", None) - if log_server_url is not None: - setattr(args, "log_server_url", log_server_url) - setattr(runner.args, "log_server_url", log_server_url) - break - except Exception as e: - click.echo("{}\n{}".format(SchedulerConstants.ERR_MSG_BINDING_EXCEPTION_1, traceback.format_exc())) - click.echo(SchedulerConstants.ERR_MSG_BINDING_EXIT_RETRYING) - config_try_count += 1 - time.sleep(3) - continue - - if config_try_count >= 5: - click.echo("") - click.echo("[7] Oops, you failed to login the FedML MLOps platform.") - click.echo("Please check whether your network is normal!") - return - - # Build unique device id - if args.current_device_id is not None and len(str(args.current_device_id)) > 0: - unique_device_id = args.current_device_id + "@" + args.os_name + ".Public.Cloud" - - # Bind account id to FedML® Nexus AI Platform - register_try_count = 0 - if hasattr(args, "server_agent_id") and args.server_agent_id is not None: - edge_id = args.server_agent_id - else: - edge_id = -1 - user_name = None - while register_try_count < 5: - try: - edge_id, user_name, extra_url = runner.bind_account_and_device_id( - service_config["ml_ops_config"]["EDGE_BINDING_URL"], args.account_id, unique_device_id, args.os_name - ) - if edge_id > 0: - runner.edge_id = edge_id - break - except SystemExit as e: - click.echo("Your account does not exist. Please make sure your account correct.") - os.system("fedml logout -s") - return - except Exception as e: - click.echo("{}\n{}".format(SchedulerConstants.ERR_MSG_BINDING_EXCEPTION_2, traceback.format_exc())) - click.echo(SchedulerConstants.ERR_MSG_BINDING_EXIT_RETRYING) - register_try_count += 1 - time.sleep(3) - continue - - if edge_id <= 0: - click.echo("") - click.echo("[8] Oops, you failed to login the FedML MLOps platform.") - click.echo("Please check whether your network is normal!") - return - setattr(args, "server_id", edge_id) - runner.args = args - runner.edge_id = edge_id - init_logs(args, edge_id) - logging.info("args {}".format(args)) - - # Log arguments and binding results. - logging.info("login: unique_device_id = %s" % str(unique_device_id)) - logging.info("login: server_id = %s" % str(edge_id)) - runner.unique_device_id = unique_device_id - runner.user_name = "cloud_agent" if user_name is None else user_name - ServerConstants.save_runner_infos(args.current_device_id + "." + args.os_name, edge_id) - - # Setup MQTT connection for communication with the FedML server. 
- try: - runner.setup_agent_mqtt_connection(service_config) - except Exception as e: - login_exit_file = os.path.join(ServerConstants.get_log_file_dir(), "exited.log") - with open(login_exit_file, "w") as f: - f.writelines(f"{os.getpid()}.") - runner.stop_agent() - raise e - - # Start mqtt looper - runner.start_agent_mqtt_loop() - - -def __login_as_cloud_server(args, userid, version): - setattr(args, "account_id", userid) - setattr(args, "current_running_dir", ServerConstants.get_fedml_home_dir()) - - sys_name = platform.system() - if sys_name == "Darwin": - sys_name = "MacOS" - setattr(args, "os_name", sys_name) - setattr(args, "version", version) - setattr(args, "log_file_dir", ServerConstants.get_log_file_dir()) - if hasattr(args, "device_id") and args.device_id is not None and args.device_id != "0": - setattr(args, "current_device_id", args.device_id) - else: - setattr(args, "current_device_id", FedMLServerRunner.get_device_id()) - setattr(args, "config_version", version) - setattr(args, "cloud_region", "") - - # Create server runner for communication with the FedML client. - runner = FedMLServerRunner(args) - runner.run_as_cloud_server = True - - # Fetch configs from the MLOps config server. - service_config = dict() - config_try_count = 0 - edge_id = 0 - while config_try_count < 5: - try: - mqtt_config, s3_config, mlops_config, docker_config = runner.fetch_configs() - service_config["mqtt_config"] = mqtt_config - service_config["s3_config"] = s3_config - service_config["ml_ops_config"] = mlops_config - service_config["docker_config"] = docker_config - runner.agent_config = service_config - log_server_url = mlops_config.get("LOG_SERVER_URL", None) - if log_server_url is not None: - setattr(args, "log_server_url", log_server_url) - setattr(runner.args, "log_server_url", log_server_url) - break - except Exception as e: - click.echo("{}\n{}".format(SchedulerConstants.ERR_MSG_BINDING_EXCEPTION_1, traceback.format_exc())) - click.echo(SchedulerConstants.ERR_MSG_BINDING_EXIT_RETRYING) - config_try_count += 1 - time.sleep(3) - continue - - if config_try_count >= 5: - click.echo("") - click.echo("[9] Oops, you failed to login the FedML MLOps platform.") - click.echo("Please check whether your network is normal!") - return - - # Build unique device id - if hasattr(args, "device_id") and args.device_id is not None and args.device_id != "0": - unique_device_id = args.current_device_id - else: - unique_device_id = args.current_device_id + "@" + args.os_name + ".Public.Server" - - # Bind account id to FedML® Nexus AI Platform - register_try_count = 0 - edge_id = -1 - user_name = None - while register_try_count < 5: - try: - edge_id, user_name, extra_url = runner.bind_account_and_device_id( - service_config["ml_ops_config"]["EDGE_BINDING_URL"], args.account_id, unique_device_id, args.os_name - ) - if edge_id > 0: - runner.edge_id = edge_id - break - except SystemExit as e: - click.echo("Your account does not exist. 
Please make sure your account correct.") - os.system("fedml logout -s") - return - except Exception as e: - click.echo("{}\n{}".format(SchedulerConstants.ERR_MSG_BINDING_EXCEPTION_2, traceback.format_exc())) - click.echo(SchedulerConstants.ERR_MSG_BINDING_EXIT_RETRYING) - register_try_count += 1 - time.sleep(3) - continue - - if edge_id <= 0: - click.echo("") - click.echo("[10] Oops, you failed to login the FedML MLOps platform.") - click.echo("Please check whether your network is normal!") - return - setattr(args, "server_id", edge_id) - runner.args = args - runner.edge_id = edge_id - runner.user_name = "cloud_server" if user_name is None else user_name - init_logs(args, edge_id) - - # Log arguments and binding results. - logging.info("login: unique_device_id = %s" % str(unique_device_id)) - logging.info("login: server_id = %s" % str(edge_id)) - ServerConstants.save_runner_infos(args.current_device_id + "." + args.os_name, edge_id) - - # Echo results - print("\n\nCongratulations, your device is connected to the FedML MLOps platform successfully!") - print( - "Your unique device ID is " - + str(unique_device_id) - + "\n" - ) - - # Setup MQTT connection for communication with the FedML server. - try: - runner.setup_agent_mqtt_connection(service_config) - except Exception as e: - login_exit_file = os.path.join(ServerConstants.get_log_file_dir(), "exited.log") - with open(login_exit_file, "w") as f: - f.writelines(f"{os.getpid()}.") - runner.stop_agent() - raise e - - # Start mqtt looper - runner.start_agent_mqtt_loop() - - -def init_logs(args, edge_id): - # Init runtime logs - args.log_file_dir = ServerConstants.get_log_file_dir() - args.run_id = 0 - args.role = "server" - args.edge_id = edge_id - setattr(args, "using_mlops", True) - setattr(args, "server_agent_id", edge_id) - MLOpsRuntimeLog.get_instance(args).init_logs() - - -def login(args): - if args.role == ServerConstants.login_role_list[ServerConstants.LOGIN_MODE_LOCAL_INDEX]: - __login_as_edge_server_and_agent(args, args.user, args.version, api_key=args.api_key) - elif args.role == ServerConstants.login_role_list[ServerConstants.LOGIN_MODE_CLOUD_AGENT_INDEX]: - __login_as_cloud_agent(args, args.user, args.version) - elif args.role == ServerConstants.login_role_list[ServerConstants.LOGIN_MODE_CLOUD_SERVER_INDEX]: - __login_as_cloud_server(args, args.user, args.version) - elif args.role == ServerConstants.login_role_list[ServerConstants.LOGIN_MODE_GPU_MASTER_SERVER_INDEX]: - __login_as_edge_server_and_agent(args, args.user, args.version, api_key=args.api_key, - use_extra_device_id_suffix=".Edge.GPU.MasterServer", role=args.role) +from fedml.computing.scheduler.master.master_agent import FedMLLaunchMasterAgent def logout(): - ServerConstants.cleanup_run_process(None) - sys_utils.cleanup_all_fedml_server_api_processes() + FedMLLaunchMasterAgent.logout() if __name__ == "__main__": @@ -432,7 +36,9 @@ def logout(): fedml.set_local_on_premise_platform_port(args.local_on_premise_platform_port) fedml.set_env_version(args.version) + master_agent = FedMLLaunchMasterAgent() if args.type == 'login': - login(args) + master_agent.login(args.api_key, api_key=args.api_key, device_id=args.device_id, + os_name=args.os_name, role=args.role, runner_cmd=args.runner_cmd) else: - logout() + master_agent.logout() diff --git a/python/fedml/computing/scheduler/master/server_runner.py b/python/fedml/computing/scheduler/master/server_runner.py deleted file mode 100755 index 238349a3e4..0000000000 --- a/python/fedml/computing/scheduler/master/server_runner.py +++ 
/dev/null @@ -1,2775 +0,0 @@ -import base64 -import copy -import json -import logging -import platform -import queue -import sys - -import multiprocessing -from multiprocessing import Process, Queue, Value, Array -import os -import shutil -import stat -import subprocess -import threading - -import time -import traceback -import urllib -import uuid -import zipfile -from os import listdir -from urllib.parse import urljoin, urlparse - -import requests - -import fedml -from ..comm_utils.job_cleanup import JobCleanup -from ..scheduler_core.scheduler_matcher import SchedulerMatcher -from ..comm_utils.constants import SchedulerConstants -from ..comm_utils.job_utils import JobRunnerUtils -from ..comm_utils.run_process_utils import RunProcessUtils -from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog - -from ....core.distributed.communication.mqtt.mqtt_manager import MqttManager -from ..comm_utils.yaml_utils import load_yaml_config -from ..slave.client_constants import ClientConstants -from ..master.server_constants import ServerConstants - -from ....core.mlops.mlops_metrics import MLOpsMetrics - -from ....core.mlops.mlops_configs import MLOpsConfigs -from ....core.mlops.mlops_runtime_log_daemon import MLOpsRuntimeLogDaemon -from ....core.mlops.mlops_status import MLOpsStatus -from ..comm_utils.sys_utils import get_sys_runner_info, get_python_program -from ..comm_utils import sys_utils -from .server_data_interface import FedMLServerDataInterface -from ....core.mlops.mlops_utils import MLOpsUtils -from ..scheduler_entry.constants import Constants -from ..model_scheduler.model_device_server import FedMLModelDeviceServerRunner -from ..model_scheduler.device_model_cards import FedMLModelCards -from ..model_scheduler import device_client_constants -from ..scheduler_core.log_manager import LogsManager -from ..scheduler_core.metrics_manager import MetricsManager -from ..scheduler_core.master_api_daemon import MasterApiDaemon -from fedml.utils.debugging import debug -from ..scheduler_core.message_center import FedMLMessageCenter -import ssl - - -class RunnerError(Exception): - """ Runner stopped. """ - pass - - -class RunnerCompletedError(Exception): - """ Runner completed. 
""" - pass - - -class FedMLServerRunner(FedMLMessageCenter): - FEDML_CLOUD_SERVER_PREFIX = "fedml-server-run-" - debug_cloud_server = False - - def __init__(self, args, run_id=0, request_json=None, agent_config=None, edge_id=0): - super().__init__() - self.master_api_daemon = None - self.run_stop_process = None - self.run_stop_process_map = dict() - self.run_edge_id_status_queue_map = dict() - self.run_metrics_queue_map = dict() - self.run_events_queue_map = dict() - self.run_artifacts_queue_map = dict() - self.run_logs_queue_map = dict() - self.async_check_timeout = 0 - self.enable_async_cluster = False - self.origin_fedml_config_object = None - self.package_type = SchedulerConstants.JOB_PACKAGE_TYPE_DEFAULT - self.local_api_process = None - self.run_process_event = None - self.run_process_event_map = dict() - self.run_process_completed_event = None - self.run_process_completed_event_map = dict() - self.run_process_event_map_for_stop = dict() - self.edge_device_info_queue = None - self.run_edge_device_info_queue_map = dict() - self.run_edge_device_info_queue_map_for_stop = dict() - self.run_edge_device_info_global_queue = None - self.run_edge_device_info_global_queue_for_stop = None - self.run_process = None - self.run_process_map = dict() - self.start_request_json = None - self.server_docker_image = None - self.cloud_server_name = None - self.run_as_cloud_agent = False - self.run_as_cloud_server = False - self.run_as_edge_server_and_agent = False - self.run_as_cloud_server_and_agent = False - self.fedml_packages_base_dir = None - self.fedml_packages_unzip_dir = None - self.mqtt_mgr = None - self.running_request_json = dict() - self.run_id = run_id - self.unique_device_id = None - self.edge_id = edge_id - self.server_agent_id = 0 - if request_json is not None: - self.server_agent_id = request_json.get("server_id", 0) - self.process = None - self.args = args - self.request_json = copy.deepcopy(request_json) - self.version = args.version - self.device_id = args.device_id - self.cur_dir = os.path.split(os.path.realpath(__file__))[0] - if args.current_running_dir is not None: - self.cur_dir = args.current_running_dir - - image_version = self.version - if image_version == "local": - image_version = "dev" - self.server_docker_base_image = "/fedml-device-image:" + image_version - - self.agent_config = agent_config - self.fedml_data_base_package_dir = os.path.join("/", "fedml", "data") - self.fedml_data_local_package_dir = os.path.join("/", "fedml", "fedml-package", "fedml", "data") - self.fedml_data_dir = self.fedml_data_base_package_dir - self.fedml_config_dir = os.path.join("/", "fedml", "conf") - - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES = { - "${FEDSYS.RUN_ID}": "", - "${FEDSYS.PRIVATE_LOCAL_DATA}": "", - "${FEDSYS.CLIENT_ID_LIST}": "", - "${FEDSYS.SYNTHETIC_DATA_URL}": "", - "${FEDSYS.IS_USING_LOCAL_DATA}": "", - "${FEDSYS.CLIENT_NUM}": "", - "${FEDSYS.CLIENT_INDEX}": "", - "${FEDSYS.CLIENT_OBJECT_LIST}": "", - "${FEDSYS.LOG_SERVER_URL}": "", - } - - self.mlops_metrics = None - self.client_agent_active_list = dict() - self.server_active_list = dict() - self.run_status = None - self.ntp_offset = MLOpsUtils.get_ntp_offset() - self.runner_list = dict() - self.enable_simulation_cloud_agent = False - self.use_local_process_as_cloud_server = False - - self.model_device_server = None - self.run_model_device_ids = dict() - self.run_edge_ids = dict() - self.master_api_process = None - - self.subscribed_topics = list() - self.user_name = None - self.message_center = None - - def 
build_dynamic_constrain_variables(self, run_id, run_config): - data_config = run_config.get("data_config", {}) - server_edge_id_list = self.request_json["edgeids"] - is_using_local_data = 0 - private_data_dir = data_config.get("privateLocalData", "") - synthetic_data_url = data_config.get("syntheticDataUrl", "") - edges = self.request_json["edges"] - # if private_data_dir is not None \ - # and len(str(private_data_dir).strip(' ')) > 0: - # is_using_local_data = 1 - if private_data_dir is None or len(str(private_data_dir).strip(" ")) <= 0: - params_config = run_config.get("parameters", None) - private_data_dir = ServerConstants.get_data_dir() - if synthetic_data_url is None or len(str(synthetic_data_url)) <= 0: - synthetic_data_url = private_data_dir - - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.RUN_ID}"] = run_id - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.PRIVATE_LOCAL_DATA}"] = private_data_dir.replace(" ", "") - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.CLIENT_ID_LIST}"] = str(server_edge_id_list).replace(" ", "") - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.SYNTHETIC_DATA_URL}"] = synthetic_data_url.replace(" ", "") - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.IS_USING_LOCAL_DATA}"] = str(is_using_local_data) - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.CLIENT_NUM}"] = len(server_edge_id_list) - client_objects = str(json.dumps(edges)) - client_objects = client_objects.replace(" ", "").replace("\n", "").replace('"', '\\"') - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.CLIENT_OBJECT_LIST}"] = client_objects - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.LOG_SERVER_URL}"] = self.agent_config["ml_ops_config"][ - "LOG_SERVER_URL" - ] - - def unzip_file(self, zip_file, unzip_file_path) -> str: - unziped_file_name = "" - if zipfile.is_zipfile(zip_file): - with zipfile.ZipFile(zip_file, "r") as zipf: - zipf.extractall(unzip_file_path) - unziped_file_name = zipf.namelist()[0] - else: - raise Exception("Invalid zip file {}".format(zip_file)) - - return unziped_file_name - - def package_download_progress(self, count, blksize, filesize): - self.check_runner_stop_event() - - downloaded = count * blksize - downloaded = filesize if downloaded > filesize else downloaded - progress = (downloaded / filesize * 100) if filesize != 0 else 0 - progress_int = int(progress) - downloaded_kb = format(downloaded / 1024, '.2f') - - # since this hook funtion is stateless, we need a state to avoid printing progress repeatly - if count == 0: - self.prev_download_progress = 0 - if progress_int != self.prev_download_progress and progress_int % 5 == 0: - self.prev_download_progress = progress_int - logging.info("package downloaded size {} KB, progress {}%".format(downloaded_kb, progress_int)) - - def retrieve_and_unzip_package(self, package_name, package_url): - local_package_path = ServerConstants.get_package_download_dir() - os.makedirs(local_package_path, exist_ok=True) - filename, filename_without_extension, file_extension = ServerConstants.get_filename_and_extension(package_url) - local_package_file = os.path.join(local_package_path, f"fedml_run_{self.run_id}_{filename_without_extension}") - if os.path.exists(local_package_file): - os.remove(local_package_file) - ssl._create_default_https_context = ssl._create_unverified_context - urllib.request.urlretrieve(package_url, local_package_file, - reporthook=self.package_download_progress) - unzip_package_path = os.path.join(ClientConstants.get_package_unzip_dir(), - 
f"unzip_fedml_run_{self.run_id}_{filename_without_extension}") - try: - shutil.rmtree(unzip_package_path, ignore_errors=True) - except Exception as e: - pass - - package_dir_name = self.unzip_file(local_package_file, unzip_package_path) # Using unziped folder name - unzip_package_full_path = os.path.join(unzip_package_path, package_dir_name) - - logging.info("local_package_file {}, unzip_package_path {}, unzip file full path {}".format( - local_package_file, unzip_package_path, unzip_package_full_path)) - - return unzip_package_full_path - - def update_local_fedml_config(self, run_id, run_config): - packages_config = run_config["packages_config"] - - # Copy config file from the client - server_package_name = packages_config.get("server", None) - server_package_url = packages_config.get("serverUrl", None) - unzip_package_path = self.retrieve_and_unzip_package(server_package_name, server_package_url) - self.fedml_packages_unzip_dir = unzip_package_path - fedml_local_config_file = os.path.join(unzip_package_path, "conf", "fedml.yaml") - - # Load the above config to memory - config_from_container = load_yaml_config(fedml_local_config_file) - container_entry_file_config = config_from_container["entry_config"] - container_dynamic_args_config = config_from_container["dynamic_args"] - entry_file = container_entry_file_config["entry_file"] - conf_file = container_entry_file_config["conf_file"] - self.package_type = container_entry_file_config.get("package_type", SchedulerConstants.JOB_PACKAGE_TYPE_DEFAULT) - full_conf_path = os.path.join(unzip_package_path, "fedml", "config", os.path.basename(conf_file)) - - # Dynamically build constrain variable with realtime parameters from server - self.build_dynamic_constrain_variables(run_id, run_config) - - # Update entry arguments value with constrain variable values with realtime parameters from server - # currently we support the following constrain variables: - # ${FEDSYS_RUN_ID}: a run id represented one entire Federated Learning flow - # ${FEDSYS_PRIVATE_LOCAL_DATA}: private local data path in the Federated Learning client - # ${FEDSYS_CLIENT_ID_LIST}: client list in one entire Federated Learning flow - # ${FEDSYS_SYNTHETIC_DATA_URL}: synthetic data url from server, - # if this value is not null, the client will download data from this URL to use it as - # federated training data set - # ${FEDSYS_IS_USING_LOCAL_DATA}: whether use private local data as federated training data set - # container_dynamic_args_config["data_cache_dir"] = "${FEDSYS.PRIVATE_LOCAL_DATA}" - for constrain_variable_key, constrain_variable_value in self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES.items(): - for argument_key, argument_value in container_dynamic_args_config.items(): - if argument_value is not None and str(argument_value).find(constrain_variable_key) == 0: - replaced_argument_value = str(argument_value).replace( - constrain_variable_key, str(constrain_variable_value) - ) - container_dynamic_args_config[argument_key] = replaced_argument_value - - # Merge all container new config sections as new config dictionary - package_conf_object = dict() - package_conf_object["entry_config"] = container_entry_file_config - package_conf_object["dynamic_args"] = container_dynamic_args_config - package_conf_object["dynamic_args"]["config_version"] = self.args.config_version - container_dynamic_args_config["mqtt_config_path"] = os.path.join( - unzip_package_path, "fedml", "config", os.path.basename(container_dynamic_args_config["mqtt_config_path"]) - ) - 
container_dynamic_args_config["s3_config_path"] = os.path.join( - unzip_package_path, "fedml", "config", os.path.basename(container_dynamic_args_config["s3_config_path"]) - ) - log_file_dir = ServerConstants.get_log_file_dir() - os.makedirs(log_file_dir, exist_ok=True) - package_conf_object["dynamic_args"]["log_file_dir"] = log_file_dir - - # Save new config dictionary to local file - fedml_updated_config_file = os.path.join(unzip_package_path, "conf", "fedml.yaml") - ServerConstants.generate_yaml_doc(package_conf_object, fedml_updated_config_file) - - # Build dynamic arguments and set arguments to fedml config object - if not self.build_dynamic_args(run_id, run_config, package_conf_object, unzip_package_path): - return None, None - - return unzip_package_path, package_conf_object - - def build_dynamic_args(self, run_id, run_config, package_conf_object, base_dir): - fedml_conf_file = package_conf_object["entry_config"]["conf_file"] - fedml_conf_file_processed = str(fedml_conf_file).replace('\\', os.sep).replace('/', os.sep) - fedml_conf_path = os.path.join(base_dir, "fedml", "config", - os.path.basename(fedml_conf_file_processed)) - fedml_conf_object = load_yaml_config(fedml_conf_path) - self.origin_fedml_config_object = fedml_conf_object.copy() - run_params = run_config.get("parameters", {}) - job_yaml = run_params.get("job_yaml", {}) - - # Replace local fedml config objects with parameters from MLOps web - parameters_object = run_config.get("parameters", None) - if parameters_object is not None: - for config_k, config_v in fedml_conf_object.items(): - parameter_v = parameters_object.get(config_k, None) - if parameter_v is not None: - fedml_conf_object[config_k] = parameter_v - parameters_object.pop(config_k) - - for config_k, config_v in parameters_object.items(): - fedml_conf_object[config_k] = config_v - - package_dynamic_args = package_conf_object["dynamic_args"] - if fedml_conf_object.get("comm_args", None) is not None: - fedml_conf_object["comm_args"]["mqtt_config_path"] = package_dynamic_args["mqtt_config_path"] - fedml_conf_object["comm_args"]["s3_config_path"] = package_dynamic_args["s3_config_path"] - fedml_conf_object["common_args"]["using_mlops"] = True - if fedml_conf_object.get("train_args", None) is not None: - fedml_conf_object["train_args"]["run_id"] = package_dynamic_args["run_id"] - fedml_conf_object["train_args"]["client_id_list"] = package_dynamic_args["client_id_list"] - fedml_conf_object["train_args"]["client_num_in_total"] = int(package_dynamic_args["client_num_in_total"]) - fedml_conf_object["train_args"]["client_num_per_round"] = int(package_dynamic_args["client_num_in_total"]) - fedml_conf_object["train_args"]["server_id"] = self.edge_id - fedml_conf_object["train_args"]["server_agent_id"] = self.request_json.get("cloud_agent_id", self.edge_id) - fedml_conf_object["train_args"]["group_server_id_list"] = self.request_json.get("group_server_id_list", - list()) - if fedml_conf_object.get("device_args", None) is not None: - fedml_conf_object["device_args"]["worker_num"] = int(package_dynamic_args["client_num_in_total"]) - # fedml_conf_object["data_args"]["data_cache_dir"] = package_dynamic_args["data_cache_dir"] - if fedml_conf_object.get("tracking_args", None) is not None: - fedml_conf_object["tracking_args"]["log_file_dir"] = package_dynamic_args["log_file_dir"] - fedml_conf_object["tracking_args"]["log_server_url"] = package_dynamic_args["log_server_url"] - - bootstrap_script_path = None - env_args = fedml_conf_object.get("environment_args", None) - if env_args 
is not None: - bootstrap_script_file = env_args.get("bootstrap", None) - if bootstrap_script_file is not None: - bootstrap_script_file = str(bootstrap_script_file).replace('\\', os.sep).replace('/', os.sep) - if platform.system() == 'Windows': - bootstrap_script_file = bootstrap_script_file.rstrip('.sh') + '.bat' - if bootstrap_script_file is not None: - bootstrap_script_dir = os.path.join(base_dir, "fedml", os.path.dirname(bootstrap_script_file)) - bootstrap_script_path = os.path.join( - bootstrap_script_dir, bootstrap_script_dir, os.path.basename(bootstrap_script_file) - ) - # try: - # os.makedirs(package_dynamic_args["data_cache_dir"], exist_ok=True) - # except Exception as e: - # pass - fedml_conf_object["dynamic_args"] = package_dynamic_args - - ServerConstants.generate_yaml_doc(fedml_conf_object, fedml_conf_path) - - is_bootstrap_run_ok = True - try: - if bootstrap_script_path is not None: - if os.path.exists(bootstrap_script_path): - bootstrap_stat = os.stat(bootstrap_script_path) - if platform.system() == 'Windows': - os.chmod(bootstrap_script_path, - bootstrap_stat.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) - bootstrap_scripts = "{}".format(bootstrap_script_path) - else: - os.chmod(bootstrap_script_path, - bootstrap_stat.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) - bootstrap_scripts = "cd {}; ./{}".format(bootstrap_script_dir, - os.path.basename(bootstrap_script_file)) - bootstrap_scripts = str(bootstrap_scripts).replace('\\', os.sep).replace('/', os.sep) - logging.info("Bootstrap scripts are being executed...") - shell_cmd_list = list() - shell_cmd_list.append(bootstrap_scripts) - process, error_list = ServerConstants.execute_commands_with_live_logs( - shell_cmd_list, callback=self.callback_run_bootstrap) - - ret_code, out, err = process.returncode, None, None - if ret_code is None or ret_code <= 0: - if error_list is not None and len(error_list) > 0: - is_bootstrap_run_ok = False - else: - if out is not None: - out_str = sys_utils.decode_our_err_result(out) - if out_str != "": - logging.info("{}".format(out_str)) - - sys_utils.log_return_info(bootstrap_script_file, 0) - - is_bootstrap_run_ok = True - else: - if err is not None: - err_str = sys_utils.decode_our_err_result(err) - if err_str != "": - logging.error("{}".format(err_str)) - - sys_utils.log_return_info(bootstrap_script_file, ret_code) - - is_bootstrap_run_ok = False - except Exception as e: - logging.error("Bootstrap scripts error: {}".format(traceback.format_exc())) - - is_bootstrap_run_ok = False - - return is_bootstrap_run_ok - - def callback_run_bootstrap(self, job_pid): - ServerConstants.save_bootstrap_process(self.run_id, job_pid) - - @debug - def run( - self, process_event, completed_event, edge_id_status_queue=None, - edge_device_info_queue=None, run_metrics_queue=None, - run_event_queue=None, run_artifacts_queue=None, run_logs_queue=None, - message_center_queue=None, edge_device_info_global_queue=None - ): - print(f"Server runner process id {os.getpid()}, run id {self.run_id}") - - if platform.system() != "Windows": - os.setsid() - - os.environ['PYTHONWARNINGS'] = 'ignore:semaphore_tracker:UserWarning' - os.environ.setdefault('PYTHONWARNINGS', 'ignore:semaphore_tracker:UserWarning') - - self.run_process_event = process_event - self.run_process_completed_event = completed_event - try: - MLOpsUtils.set_ntp_offset(self.ntp_offset) - - self.rebuild_message_center(message_center_queue) - - self.run_impl(edge_id_status_queue, edge_device_info_queue, run_metrics_queue, - run_event_queue, 
run_artifacts_queue, run_logs_queue, edge_device_info_global_queue) - except RunnerError: - logging.info("Runner stopped.") - self.mlops_metrics.report_server_id_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - except RunnerCompletedError: - logging.info("Runner completed.") - except Exception as e: - logging.error("Runner exits with exceptions. {}".format(traceback.format_exc())) - self.mlops_metrics.report_server_id_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - finally: - logging.info("Release resources.") - self._process_run_metrics_queue(run_metrics_queue) - self._process_run_logs_queue(run_logs_queue) - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, self.edge_id) - if self.mlops_metrics is not None: - self.mlops_metrics.stop_sys_perf() - time.sleep(3) - ServerConstants.cleanup_run_process(self.run_id) - ServerConstants.cleanup_learning_process(self.run_id) - ServerConstants.cleanup_bootstrap_process(self.run_id) - - def check_runner_stop_event(self): - if self.run_process_event is not None and self.run_process_event.is_set(): - logging.info("Received stopping event.") - raise RunnerError("Runner stopped") - - if self.run_process_completed_event is not None and self.run_process_completed_event.is_set(): - logging.info("Received completed event.") - raise RunnerCompletedError("Runner completed") - - def deploy_model(self, serving_devices, request_json, run_id): - run_config = request_json["run_config"] - run_params = run_config.get("parameters", {}) - job_yaml = run_params.get("job_yaml", {}) - job_type = job_yaml.get("job_type", None) - job_type = job_yaml.get("task_type", Constants.JOB_TASK_TYPE_TRAIN) if job_type is None else job_type - if job_type == Constants.JOB_TASK_TYPE_DEPLOY or job_type == Constants.JOB_TASK_TYPE_SERVE: - computing = job_yaml.get("computing", {}) - num_gpus = computing.get("minimum_num_gpus", 1) - serving_args = run_params.get("serving_args", {}) - model_id = serving_args.get("model_id", None) - model_name = serving_args.get("model_name", None) - model_version = serving_args.get("model_version", None) - model_storage_url = serving_args.get("model_storage_url", None) - endpoint_name = serving_args.get("endpoint_name", None) - endpoint_id = serving_args.get("endpoint_id", None) - random = serving_args.get("random", "") - random_out = sys_utils.random2(random, "FEDML@9999GREAT") - random_list = random_out.split("FEDML@") - device_type = device_client_constants.ClientConstants.login_role_list[ - device_client_constants.ClientConstants.LOGIN_MODE_FEDML_CLOUD_INDEX] - FedMLModelCards.get_instance().deploy_model( - model_name, device_type, json.dumps(serving_devices), - "", random_list[1], None, - in_model_id=model_id, in_model_version=model_version, - endpoint_name=endpoint_name, endpoint_id=endpoint_id, run_id=run_id) - - @debug - def run_impl( - self, edge_id_status_queue, edge_device_info_queue, run_metrics_queue, - run_event_queue, run_artifacts_queue, run_logs_queue, edge_device_info_global_queue - ): - run_id = self.request_json["runId"] - run_config = self.request_json["run_config"] - data_config = run_config["data_config"] - edge_ids = self.request_json["edgeids"] - - self.check_runner_stop_event() - - self.run_id = run_id - self.args.run_id = self.run_id - 
MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) - - # report server running status - self.mlops_metrics.report_server_id_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_STARTING, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - - logging.info("Detect all status of Edge ids: " + str(edge_ids)) - - status_ok, active_edge_info_dict, inactivate_edges = self.detect_edges_status( - edge_device_info_queue, edge_device_info_global_queue=edge_device_info_global_queue, - callback_when_edges_ready=self.send_training_request_to_edges) - logging.info(f"Status OK: {status_ok}, Active edge info dict: {active_edge_info_dict}, " - f"inactivate edges: {inactivate_edges}") - if not status_ok: - logging.error(f"Status of edge device is not OK. Active edge info dict: {active_edge_info_dict}, " - f"Inactivate edges: {inactivate_edges}") - return - - if not self.should_continue_run_job(run_id): - if FedMLServerRunner.debug_cloud_server: - while True: - time.sleep(30) - # Check if the run status is normal - self.aggregate_run_status_metrics_logs( - run_id, edge_ids, edge_id_status_queue, edge_device_info_queue, - edge_device_info_global_queue, - run_metrics_queue, run_logs_queue) - return - - # Start the server job - self._start_runner_process(run_id, self.request_json, is_server_job=True) - - # Check if the run status is normal - self.aggregate_run_status_metrics_logs( - run_id, edge_ids, edge_id_status_queue, edge_device_info_queue, - edge_device_info_global_queue, - run_metrics_queue, run_logs_queue) - - def aggregate_run_status_metrics_logs( - self, run_id, edge_id_list, edge_id_status_queue, edge_device_info_queue, - edge_device_info_global_queue, run_metrics_queue, run_logs_queue): - total_sleep_seconds = 0 - sleep_seconds = 3 - allowed_status_check_sleep_seconds = 60 * 25 - server_id = self.edge_id - normal_response_status_list = [ - ClientConstants.MSG_MLOPS_CLIENT_STATUS_OFFLINE, ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE, - ClientConstants.MSG_MLOPS_CLIENT_STATUS_TRAINING, ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED, - ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED, - ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION, ClientConstants.MSG_MLOPS_CLIENT_STATUS_RUNNING - ] - edges_id_status_timeout_map = dict() - number_of_failed_edges = 0 - number_of_finished_edges = 0 - number_of_killed_edges = 0 - running_edges_list = list() - inactivate_edge_list = list() - current_edge_id_status_map = dict() - - while True: - self.check_runner_stop_event() - - # Process run metrics - self._process_run_metrics_queue(run_metrics_queue) - - # Process run logs - self._process_run_logs_queue(run_logs_queue) - - # Fetch edge id and status from the edge id status queue - while True: - try: - queue_item = edge_id_status_queue.get(block=False, timeout=3) - if queue_item is not None: - current_edge_id_status_map.update(queue_item) - except queue.Empty as e: # If queue is empty, then break loop - break - - # Calc the total completed device number - server_id = current_edge_id_status_map.get("server", 0) - running_edges_list.clear() - number_of_failed_edges = 0 - number_of_finished_edges = 0 - number_of_killed_edges = 0 - for edge_id_item, status_item in current_edge_id_status_map.items(): - if edge_id_item == "server": - continue - - if status_item is None or status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED or \ - status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_EXCEPTION: - 
number_of_failed_edges += 1 - continue - - if status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED: - number_of_finished_edges += 1 - continue - - if status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED: - number_of_killed_edges += 1 - continue - - if status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE or \ - status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE: - continue - - running_edges_list.append(edge_id_item) - - # Process the no response edges and accumulate the counter. - for edge_id_item in edge_id_list: - status_dict = edges_id_status_timeout_map.get(str(edge_id_item)) - status_item = current_edge_id_status_map.get(str(edge_id_item)) - if status_item is None: - continue - if status_dict is None: - status_dict = {"status": status_item, "count": 0} - else: - if status_item in normal_response_status_list: - status_dict["count"] = 0 - else: - status_dict["count"] += 1 - edges_id_status_timeout_map[str(edge_id_item)] = status_dict - - # If the completed device number is equal total device number, then break - if len(running_edges_list) <= 0 and len(current_edge_id_status_map.keys()) == len(edge_id_list) + 1: - break - - # Calc the timeout value to wait to device killed. - self.check_runner_stop_event() - time.sleep(sleep_seconds) - total_sleep_seconds += sleep_seconds - no_response_edge_ids = list() - for no_res_edge, no_res_status in edges_id_status_timeout_map.items(): - if no_res_status.get("count") * sleep_seconds > allowed_status_check_sleep_seconds: - no_response_edge_ids.append(no_res_edge) - - # If timeout, then report killed device status - if len(no_response_edge_ids) > 0: - for edge_id_item in no_response_edge_ids: - self.mlops_metrics.report_client_id_status( - edge_id_item, ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED, - server_id=self.edge_id, run_id=self.run_id) - - # Check if we can get the response device info from edge devices - # and set the inactive edges to killed status. - self.check_runner_stop_event() - given_edge_ids = list(set(edge_id_list) - set(inactivate_edge_list)) - status_ok, active_edge_info_dict, inactivate_edges = self.detect_edges_status( - edge_device_info_queue, edge_device_info_global_queue=edge_device_info_global_queue, - need_to_trigger_exception=False, status_timeout=60, - given_edge_ids=given_edge_ids, callback_when_detecting=self.callback_when_detecting_on_aggregation, - args_for_callback_when_detecting=(run_metrics_queue, run_logs_queue) - ) - if not status_ok: - inactivate_edge_list.extend(inactivate_edges) - for edge_id_item in inactivate_edges: - self.mlops_metrics.report_client_id_status( - edge_id_item, ClientConstants.MSG_MLOPS_CLIENT_STATUS_OFFLINE, - server_id=self.edge_id, run_id=self.run_id) - - # Calc the final run status based on the completed device numbers and fault tolerance parameters. 
- enable_fault_tolerance, fault_tolerance_rate = self.parse_fault_tolerance_params(run_id) - running_edges_list = list(set(running_edges_list)) - status_to_report = self.calculate_server_status( - run_id, len(edge_id_list), number_of_failed_edges, number_of_finished_edges, - number_of_killed_edges, running_edges_list, enable_fault_tolerance=enable_fault_tolerance, - fault_tolerance_rate=fault_tolerance_rate) - if status_to_report is not None: - logging.info( - f"Run completed when aggregating status, metrics and logs, will report status {status_to_report}") - self.mlops_metrics.report_server_id_status( - self.run_id, status_to_report, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - - def callback_when_detecting_on_aggregation(self, detecting_args): - # Process run metrics - self._process_run_metrics_queue(detecting_args[0]) - - # Process run logs - self._process_run_logs_queue(detecting_args[1]) - - def _process_run_metrics_queue(self, run_metrics_queue): - # Fetch metrics from the run metrics queue - while True: - try: - metrics_item = run_metrics_queue.get(block=False, timeout=3) - MetricsManager.get_instance().save_metrics(metrics_item) - metric_json = json.loads(metrics_item) - if metric_json.get("is_endpoint", False): - metric_json().pop("is_endpoint") - self.mlops_metrics.report_endpoint_metric({}, payload=json.dumps(metric_json)) - else: - self.mlops_metrics.report_server_training_metric({}, payload=metrics_item) - except queue.Empty as e: # If queue is empty, then break loop - break - - def _process_run_logs_queue(self, run_logs_queue): - # Fetch logs from the run logs queue - while True: - try: - logs_item = run_logs_queue.get(block=False, timeout=3) - LogsManager.save_logs(logs_item) - except queue.Empty as e: # If queue is empty, then break loop - break - - def run_server_job_impl(self, process_event, completed_event, edge_id_status_queue=None, - edge_device_info_queue=None, run_metrics_queue=None, - run_event_queue=None, run_artifacts_queue=None, run_logs_queue=None, - message_center_queue=None, edge_device_info_global_queue=None): - print(f"Server runner process id {os.getpid()}, run id {self.run_id}") - - if platform.system() != "Windows": - os.setsid() - - os.environ['PYTHONWARNINGS'] = 'ignore:semaphore_tracker:UserWarning' - os.environ.setdefault('PYTHONWARNINGS', 'ignore:semaphore_tracker:UserWarning') - - self.run_process_event = process_event - self.run_process_completed_event = completed_event - - MLOpsUtils.set_ntp_offset(self.ntp_offset) - - self.rebuild_message_center(message_center_queue) - - run_id = self.request_json["runId"] - run_config = self.request_json["run_config"] - data_config = run_config["data_config"] - edge_ids = self.request_json["edgeids"] - - self.check_runner_stop_event() - - # get training params - private_local_data_dir = data_config.get("privateLocalData", "") - is_using_local_data = 0 - # if private_local_data_dir is not None and len(str(private_local_data_dir).strip(' ')) > 0: - # is_using_local_data = 1 - - # start a run according to the hyper-parameters - # fedml_local_data_dir = self.cur_dir + "/fedml_data/run_" + run_id_str + "_edge_" + str(edge_id) - fedml_local_data_dir = os.path.join(self.cur_dir, "fedml_data") - fedml_local_config_dir = os.path.join(self.cur_dir, "fedml_config") - if is_using_local_data: - fedml_local_data_dir = private_local_data_dir - self.fedml_data_dir = self.fedml_data_local_package_dir - - self.check_runner_stop_event() - - logging.info("download packages and run the bootstrap 
script...") - - # update local config with real time parameters from server and dynamically replace variables value - unzip_package_path, fedml_config_object = self.update_local_fedml_config(run_id, run_config) - if unzip_package_path is None or fedml_config_object is None: - logging.info("failed to update local fedml config.") - self.check_runner_stop_event() - self.cleanup_run_when_starting_failed() - self.send_training_stop_request_to_edges_when_exception(edge_ids, payload=self.start_request_json, - run_id=run_id) - return - - logging.info("cleanup the previous aggregation process and check downloaded packages...") - - entry_file_config = fedml_config_object["entry_config"] - dynamic_args_config = fedml_config_object["dynamic_args"] - entry_file = str(entry_file_config["entry_file"]).replace('\\', os.sep).replace('/', os.sep) - entry_file = os.path.basename(entry_file) - conf_file = entry_file_config["conf_file"] - conf_file = str(conf_file).replace('\\', os.sep).replace('/', os.sep) - ServerConstants.cleanup_learning_process(run_id) - self.check_runner_stop_event() - if not os.path.exists(unzip_package_path): - logging.info("failed to unzip file.") - self.check_runner_stop_event() - self.cleanup_run_when_starting_failed() - self.send_training_stop_request_to_edges_when_exception(edge_ids, payload=self.start_request_json, - run_id=run_id) - return - os.chdir(os.path.join(unzip_package_path, "fedml")) - - self.check_runner_stop_event() - - logging.info("starting the server user process...") - - entry_file_full_path = os.path.join(unzip_package_path, "fedml", entry_file) - conf_file_full_path = os.path.join(unzip_package_path, "fedml", conf_file) - logging.info(" ") - logging.info(" ") - logging.info("====Your Run Logs Begin===") - process, is_launch_task, error_list = self.execute_job_task(entry_file_full_path, conf_file_full_path, run_id) - logging.info("====Your Run Logs End===") - logging.info(" ") - logging.info(" ") - - ret_code, out, err = process.returncode, None, None - is_run_ok = sys_utils.is_runner_finished_normally(process.pid) - if is_launch_task: - is_run_ok = True - if error_list is not None and len(error_list) > 0: - is_run_ok = False - if ret_code is None or ret_code <= 0: - self.check_runner_stop_event() - - if is_run_ok: - if out is not None: - out_str = sys_utils.decode_our_err_result(out) - if out_str != "": - logging.info("{}".format(out_str)) - - self.mlops_metrics.report_server_id_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - - if is_launch_task: - sys_utils.log_return_info(f"job {run_id}", 0) - else: - sys_utils.log_return_info(entry_file, 0) - else: - is_run_ok = False - - if not is_run_ok: - # If the run status is killed or finished, then return with the normal state. 
- current_job = FedMLServerDataInterface.get_instance().get_job_by_id(run_id) - if current_job is not None and (current_job.status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED or - current_job.status == ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED): - return - - self.check_runner_stop_event() - - logging.error("failed to run the aggregation process...") - - if err is not None: - err_str = sys_utils.decode_our_err_result(err) - if err_str != "": - logging.error("{}".format(err_str)) - - if is_launch_task: - sys_utils.log_return_info(f"job {run_id}", ret_code) - else: - sys_utils.log_return_info(entry_file, ret_code) - - self.send_training_stop_request_to_edges_when_exception(edge_ids, run_id=run_id) - - def init_job_task(self, request_json): - run_id = request_json["runId"] - run_config = request_json["run_config"] - edge_ids = request_json["edgeids"] - run_params = run_config.get("parameters", {}) - job_yaml = run_params.get("job_yaml", None) - server_id = request_json["server_id"] - if self.run_as_cloud_agent: - server_id = self.edge_id - - self.setup_listeners_for_edge_status(run_id, edge_ids, server_id) - self.setup_listener_for_run_metrics(run_id) - self.setup_listener_for_run_logs(run_id) - - def should_continue_run_job(self, run_id): - run_config = self.request_json["run_config"] - run_params = run_config.get("parameters", {}) - job_yaml = run_params.get("job_yaml", {}) - job_yaml_default_none = run_params.get("job_yaml", None) - framework_type = job_yaml.get("framework_type", None) - job_type = job_yaml.get("job_type", None) - job_type = job_yaml.get("task_type", Constants.JOB_TASK_TYPE_TRAIN) if job_type is None else job_type - if job_yaml_default_none is not None: - if job_type == Constants.JOB_TASK_TYPE_FEDERATE: - return True - - if framework_type is None or framework_type != Constants.JOB_FRAMEWORK_TYPE_FEDML: - self.mlops_metrics.report_server_id_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_RUNNING, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - return False - - return True - - def execute_job_task(self, entry_file_full_path, conf_file_full_path, run_id): - run_config = self.request_json["run_config"] - run_params = run_config.get("parameters", {}) - job_yaml = run_params.get("job_yaml", {}) - job_yaml_default_none = run_params.get("job_yaml", None) - job_api_key = job_yaml.get("run_api_key", None) - job_api_key = job_yaml.get("fedml_run_dynamic_params", None) if job_api_key is None else job_api_key - assigned_gpu_ids = run_params.get("gpu_ids", None) - framework_type = job_yaml.get("framework_type", None) - job_type = job_yaml.get("job_type", None) - job_type = job_yaml.get("task_type", Constants.JOB_TASK_TYPE_TRAIN) if job_type is None else job_type - conf_file_object = load_yaml_config(conf_file_full_path) - entry_args_dict = conf_file_object.get("fedml_entry_args", {}) - entry_args = entry_args_dict.get("arg_items", None) - - executable_interpreter = ClientConstants.CLIENT_SHELL_PS \ - if platform.system() == ClientConstants.PLATFORM_WINDOWS else ClientConstants.CLIENT_SHELL_BASH - - if job_yaml_default_none is None: - # Generate the job executing commands for previous federated learning (Compatibility) - python_program = get_python_program() - logging.info("Run the server: {} {} --cf {} --rank 0 --role server".format( - python_program, entry_file_full_path, conf_file_full_path)) - entry_command = f"{python_program} {entry_file_full_path} --cf " \ - f"{conf_file_full_path} --rank 0 --role server" - shell_cmd_list = 
[entry_command] - - # Run the job executing commands for previous federated learning (Compatibility) - process, error_list = ClientConstants.execute_commands_with_live_logs( - shell_cmd_list, callback=self.callback_start_fl_job, should_write_log_file=False) - is_launch_task = False - else: - self.check_runner_stop_event() - - self.mlops_metrics.report_server_id_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_RUNNING, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - - # Generate the job executing commands - job_executing_commands = JobRunnerUtils.generate_job_execute_commands( - run_id=self.run_id, edge_id=self.edge_id, version=self.version, package_type=self.package_type, - executable_interpreter=executable_interpreter, entry_file_full_path=entry_file_full_path, - conf_file_object=conf_file_object, entry_args=entry_args, assigned_gpu_ids=assigned_gpu_ids, - job_api_key=job_api_key, client_rank=0) - - # Run the job executing commands - logging.info(f"Run the server job with job id {self.run_id}, device id {self.edge_id}.") - process, error_list = ServerConstants.execute_commands_with_live_logs( - job_executing_commands, callback=self.start_job_perf, error_processor=self.job_error_processor) - is_launch_task = True - - return process, is_launch_task, error_list - - def callback_start_fl_job(self, job_pid): - ServerConstants.save_learning_process(self.run_id, job_pid) - self.mlops_metrics.report_sys_perf( - self.args, self.agent_config["mqtt_config"], job_process_id=job_pid) - - def start_job_perf(self, job_pid): - ServerConstants.save_learning_process(self.run_id, job_pid) - self.mlops_metrics.report_job_perf(self.args, self.agent_config["mqtt_config"], job_pid) - - def job_error_processor(self, error_list): - self.check_runner_stop_event() - - error_str = "\n".join(error_list) - raise Exception(f"Error occurs when running the job... 
{error_str}") - - def process_job_status(self, run_id, edge_id, status): - number_of_failed_edges = 0 - number_of_finished_edges = 0 - number_of_killed_edges = 0 - edge_id_status_dict = self.client_agent_active_list.get(f"{run_id}", {}) - server_id = edge_id_status_dict.get("server", 0) - enable_fault_tolerance, fault_tolerance_rate = self.parse_fault_tolerance_params(run_id) - running_edges_list = list() - for edge_id_item, status_item in edge_id_status_dict.items(): - if edge_id_item == "server": - continue - - if status_item is None or status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED or \ - status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_EXCEPTION: - number_of_failed_edges += 1 - continue - - if status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED: - number_of_finished_edges += 1 - continue - - if status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED: - number_of_killed_edges += 1 - continue - - if status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE or \ - status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE: - continue - - running_edges_list.append(edge_id_item) - - # Report client status - edge_status = ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED if status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION else status - self.mlops_metrics.report_client_training_status(edge_id, edge_status, run_id=run_id) - self.mlops_metrics.report_client_device_status_to_web_ui(edge_id, edge_status, run_id=run_id) - - # Report server status based on the fault tolerance model and parameters - edge_nums = len(edge_id_status_dict.keys()) - 1 - status_to_report = self.calculate_server_status( - run_id, edge_nums, number_of_failed_edges, number_of_finished_edges, number_of_killed_edges, - running_edges_list, enable_fault_tolerance=enable_fault_tolerance, - fault_tolerance_rate=fault_tolerance_rate) - if status_to_report is not None: - logging.info(f"Run completed when processing edge status, will report status {status_to_report}") - self.report_server_status(run_id, server_id, status_to_report) - - def calculate_server_status( - self, run_id, total_edge_nums, number_of_failed_edges, number_of_finished_edges, - number_of_killed_edges, running_edges_list, enable_fault_tolerance=False, - fault_tolerance_rate=0.8 - ): - # Report server status based on the fault tolerance model and parameters - actual_failed_rate = number_of_failed_edges / total_edge_nums - all_edges_run_completed = True if len(running_edges_list) <= 0 else False - if all_edges_run_completed: - status_to_report = None - if enable_fault_tolerance: - if actual_failed_rate >= fault_tolerance_rate: - status_to_report = ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED - self.send_training_stop_request_to_edges_when_exception( - running_edges_list, run_id=run_id, status=status_to_report) - return status_to_report - else: - if number_of_killed_edges == total_edge_nums: - status_to_report = ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED - else: - status_to_report = ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED - else: - if number_of_failed_edges > 0: - status_to_report = ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED - elif number_of_finished_edges == total_edge_nums: - status_to_report = ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED - elif number_of_killed_edges == total_edge_nums: - status_to_report = ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED - - return status_to_report - - def parse_fault_tolerance_params(self, run_id): - run_json = self.running_request_json.get(str(run_id), 
None) - if run_json is None: - run_json = self.request_json - run_config = run_json.get("run_config", {}) - run_params = run_config.get("parameters", {}) - common_args = run_params.get("common_args", {}) - enable_fault_tolerance = common_args.get("enable_fault_tolerance", False) - fault_tolerance_rate = common_args.get("fault_tolerance_rate", 0) - return enable_fault_tolerance, fault_tolerance_rate - - def report_server_status(self, run_id, server_id, status): - self.mlops_metrics.report_server_id_status(run_id, status, edge_id=self.edge_id, - server_id=server_id, server_agent_id=self.edge_id) - - def stop_run_when_starting_failed(self): - edge_id_list = self.request_json["edgeids"] - run_id = self.request_json.get("run_id", 0) - logging.error("edge ids {}".format(str(edge_id_list))) - - payload = self.running_request_json.get(str(run_id)) - if payload is not None: - self.send_training_stop_request_to_edges(edge_id_list, payload=json.dumps(payload), run_id=run_id) - - # logging.info("Stop run successfully when starting failed.") - - self.mlops_metrics.report_server_id_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - - def cleanup_run_when_finished(self, should_send_server_id_status=True): - # logging.info("Cleanup run successfully when finished.") - - self.mlops_metrics.report_server_training_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED, edge_id=self.edge_id - ) - - if should_send_server_id_status: - self.mlops_metrics.report_server_id_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - - try: - self.mlops_metrics.stop_sys_perf() - except Exception as ex: - pass - - time.sleep(1) - - ServerConstants.cleanup_learning_process(self.run_id) - ServerConstants.cleanup_bootstrap_process(self.run_id) - - try: - local_package_path = ServerConstants.get_package_download_dir() - for package_file in listdir(local_package_path): - if os.path.basename(package_file).startswith("run_" + str(self.run_id)): - shutil.rmtree(os.path.join(local_package_path, package_file), ignore_errors=True) - except Exception as e: - pass - - def cleanup_run_when_starting_failed( - self, status=ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, should_send_server_id_status=True): - # logging.info("Cleanup run successfully when starting failed.") - - self.mlops_metrics.report_server_training_status( - self.run_id, status, edge_id=self.edge_id) - - if should_send_server_id_status: - self.mlops_metrics.report_server_id_status( - self.run_id, status, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - - try: - self.mlops_metrics.stop_sys_perf() - except Exception as ex: - pass - - time.sleep(1) - - ServerConstants.cleanup_learning_process(self.run_id) - ServerConstants.cleanup_bootstrap_process(self.run_id) - - try: - local_package_path = ServerConstants.get_package_download_dir() - for package_file in listdir(local_package_path): - if os.path.basename(package_file).startswith("run_" + str(self.run_id)): - shutil.rmtree(os.path.join(local_package_path, package_file), ignore_errors=True) - except Exception as e: - pass - - def should_process_async_cluster(self): - run_config = self.request_json.get("run_config", {}) - run_params = run_config.get("parameters", {}) - common_args = run_params.get("common_args", {}) - self.enable_async_cluster = 
common_args.get("enable_async_cluster", False) - self.async_check_timeout = common_args.get("async_check_timeout", 0) - if self.enable_async_cluster: - return True, self.async_check_timeout - - return False, self.async_check_timeout - - @debug - def detect_edges_status( - self, edge_device_info_queue, edge_device_info_global_queue=None, callback_when_edges_ready=None, status_timeout=None, - need_to_trigger_exception=True, status_check_context=None, given_edge_ids=None, - callback_when_detecting=None, args_for_callback_when_detecting=None - ): - run_id = self.request_json["runId"] - run_id_str = str(run_id) - edge_id_list = self.request_json["edgeids"] - if given_edge_ids is not None: - edge_id_list = given_edge_ids - - # Init realtime status of all edges - run_edges_realtime_status = dict() - run_edges_realtime_status[run_id_str] = dict() - - edge_info_global_dict = dict() - if edge_device_info_global_queue is not None: - for edge_info_global in edge_device_info_global_queue: - edge_info_id = edge_info_global.get("edge_id") - edge_info_global_dict[edge_info_id] = edge_info_global - - # Send status message to all edges - allowed_cache_edge_status_time = 60 - for edge_id in edge_id_list: - # Check if the edge status was filled allowed_cache_edge_status_time seconds ago, - # if so no more checking message would be sent. - edge_info = edge_info_global_dict.get(edge_id, None) - if edge_info is not None: - timestamp = edge_info.get("timestamp", None) - time_interval = time.time() - timestamp - if time_interval <= allowed_cache_edge_status_time: - continue - - self.send_status_check_msg(run_id, edge_id, self.edge_id, context=status_check_context) - time.sleep(3) - - total_sleep_seconds = 0 - status_check_sleep_seconds = 10 - allowed_status_check_sleep_seconds = 60 * 2 if status_timeout is None else status_timeout - allowed_status_check_sleep_seconds_for_async = 30 - inactivate_edges = list() - active_edge_info_dict = dict() - log_active_edge_info_flag = True - while True: - if callback_when_detecting is not None: - callback_when_detecting(args_for_callback_when_detecting) - - # Fetch edge info from the edge status queue, which will be added to realtime status map - while True: - self.check_runner_stop_event() - - try: - edge_info = edge_device_info_queue.get(block=False, timeout=1) - if edge_info is not None: - edge_id = edge_info.get("edge_id", None) - if edge_id is not None: - run_edges_realtime_status[run_id_str][edge_id] = edge_info - except queue.Empty as e: # If queue is empty, then break loop - break - - self.check_runner_stop_event() - - # Check all edges which don't send response status successfully - # and retry to send the status checking message. - active_edges_count = 0 - inactivate_edges.clear() - active_edge_info_dict.clear() - for edge_id in edge_id_list: - edge_info_dict = run_edges_realtime_status.get(run_id_str, {}) - edge_info = edge_info_dict.get(edge_id, None) - edge_info = edge_info_dict.get(str(edge_id), None) if edge_info is None else edge_info - if edge_info is not None: - active_edges_count += 1 - active_edge_info_dict[str(edge_id)] = edge_info - else: - # Check if the edge status was filled allowed_cache_edge_status_time seconds ago, - # if so no more checking message would be sent. 
- edge_info = edge_info_global_dict.get(edge_id, None) - if edge_info is not None: - timestamp = edge_info.get("timestamp", None) - time_interval = time.time() - timestamp - if time_interval <= allowed_cache_edge_status_time: - active_edges_count += 1 - active_edge_info_dict[str(edge_id)] = edge_info - continue - - inactivate_edges.append(edge_id) - self.send_status_check_msg(run_id, edge_id, self.edge_id, context=status_check_context) - - # If all edges are ready then send the starting job message to them - if active_edges_count == len(edge_id_list): - if log_active_edge_info_flag: - logging.debug(f"All edges are ready. Active edge id list is as follows. {active_edge_info_dict}") - log_active_edge_info_flag = False - if callback_when_edges_ready is not None: - logging.info("All edges are ready. Start to process the callback function.") - callback_when_edges_ready(active_edge_info_dict=active_edge_info_dict) - else: - logging.debug("All edges are ready. No callback function to process.") - break - else: - logging.info(f"All edges are not ready. Active edge id list: {active_edge_info_dict}, " - f"Inactive edge id list: {inactivate_edges}") - log_active_edge_info_flag = True - - # Check if runner needs to stop and sleep specific time - self.check_runner_stop_event() - time.sleep(status_check_sleep_seconds) - total_sleep_seconds += status_check_sleep_seconds - - # Check if the status response message has timed out to receive - if total_sleep_seconds >= allowed_status_check_sleep_seconds: - # If so, send failed message to MLOps and send exception message to all edges. - logging.error(f"There are inactive edge devices. " - f"Inactivate edge id list is as follows. {inactivate_edges}") - if need_to_trigger_exception: - self.mlops_metrics.report_server_id_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.server_agent_id) - self.send_training_stop_request_to_edges_when_exception(edge_id_list, - payload=json.dumps(self.request_json), - run_id=run_id) - return False, active_edge_info_dict, inactivate_edges - - # If we enable the mode for async cluster, then sleep some time and send messages to all clients. 
- if callback_when_edges_ready is not None: - should_async, async_timeout = self.should_process_async_cluster() - if should_async and total_sleep_seconds >= allowed_status_check_sleep_seconds_for_async: - if async_timeout > allowed_status_check_sleep_seconds_for_async: - time.sleep(async_timeout - allowed_status_check_sleep_seconds_for_async) - self.send_training_request_to_edges() - return True, active_edge_info_dict, inactivate_edges - - return True, active_edge_info_dict, inactivate_edges - - def send_status_check_msg(self, run_id, edge_id, server_id, context=None): - topic_get_model_device_id = "server/client/request_device_info/" + str(edge_id) - payload = {"server_id": server_id, "run_id": run_id} - if context is not None: - payload["context"] = context - self.message_center.send_message(topic_get_model_device_id, json.dumps(payload)) - - @debug - def send_training_request_to_edges(self, active_edge_info_dict=None): - run_id = self.request_json["runId"] - edge_id_list = self.request_json["edgeids"] - run_config = self.request_json.get("run_config", {}) - run_params = run_config.get("parameters", {}) - job_yaml = run_params.get("job_yaml", {}) - job_yaml_default_none = run_params.get("job_yaml", None) - computing = job_yaml.get("computing", {}) - request_num_gpus = computing.get("minimum_num_gpus", None) - job_gpu_id_list = self.request_json.get("job_gpu_id_list", None) - - logging.info("Send training request to Edge ids: " + str(edge_id_list)) - - should_match_gpu = False - if job_yaml_default_none is not None and request_num_gpus is not None and \ - int(request_num_gpus) > 0 and active_edge_info_dict is not None: - should_match_gpu = True - SchedulerMatcher.parse_and_print_gpu_info_for_all_edges(active_edge_info_dict, show_gpu_list=True) - - # Match and assign gpus to each device - assigned_gpu_num_dict, assigned_gpu_ids_dict = SchedulerMatcher.match_and_assign_gpu_resources_to_devices( - request_num_gpus, edge_id_list, active_edge_info_dict, job_gpu_id_list=job_gpu_id_list) - if assigned_gpu_num_dict is None or assigned_gpu_ids_dict is None: - # If no resources available, send failed message to MLOps and send exception message to all edges. - gpu_count, gpu_available_count = SchedulerMatcher.parse_and_print_gpu_info_for_all_edges( - active_edge_info_dict, should_print=True) - err_info = f"No resources available." \ - f"Total available GPU count {gpu_available_count} is less than " \ - f"request GPU count {request_num_gpus}" - logging.error(err_info) - - # Bug fix: This mqtt message needs to be sent so platform can clean up the failed run and change the - # status from running to failed. 
- self.mlops_metrics.report_server_training_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id - ) - - self.mlops_metrics.report_server_id_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.server_agent_id) - self.send_training_stop_request_to_edges_when_exception(edge_id_list, - payload=json.dumps(self.request_json), - run_id=run_id) - - serving_args = job_yaml.get("serving_args", {}) - endpoint_id = serving_args.get("endpoint_id", None) - if endpoint_id is not None: - fedml.mlops.log_endpoint_status( - endpoint_id, device_client_constants.ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED) - fedml.mlops.log_run_log_lines( - endpoint_id, 0, [err_info], - log_source=device_client_constants.ClientConstants.FEDML_LOG_SOURCE_TYPE_MODEL_END_POINT - ) - return - - # Generate master node addr and port - master_node_addr, master_node_port = SchedulerMatcher.get_master_node_info(edge_id_list, - active_edge_info_dict) - - # Generate new edge id list after matched - edge_id_list = SchedulerMatcher.generate_new_edge_list_for_gpu_matching(assigned_gpu_num_dict) - if len(edge_id_list) <= 0: - gpu_count, gpu_available_count = SchedulerMatcher.parse_and_print_gpu_info_for_all_edges( - active_edge_info_dict, should_print=True) - logging.error(f"Request parameter for GPU num is invalid." - f"Total available GPU count {gpu_available_count}." - f"Request GPU num {request_num_gpus}") - self.mlops_metrics.report_server_id_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.server_agent_id) - self.send_training_stop_request_to_edges_when_exception(edge_id_list, - payload=json.dumps(self.request_json), - run_id=run_id) - return - - if should_match_gpu: - # Report gpu num and related infos to MLOps. 
- serving_args = job_yaml.get("serving_args", {}) - endpoint_id = serving_args.get("endpoint_id", None) - if endpoint_id is not None: - endpoint_info = list() - for edge_id_item, gpu_num in assigned_gpu_num_dict.items(): - edge_info = active_edge_info_dict.get(str(edge_id_item), {}) - endpoint_info.append({ - "machine_id": edge_id_item, "endpoint_gpu_count": gpu_num, - "master_deploy_id": edge_info.get("master_device_id", 0), - "slave_deploy_id": edge_info.get("slave_device_id", 0)}) - topic_name = f"compute/mlops/endpoint" - endpoint_info_json = {"endpoint_id": endpoint_id, "endpoint_info": endpoint_info} - print(f"endpoint_info_json {endpoint_info_json}") - self.message_center.send_message(topic_name, json.dumps(endpoint_info_json)) - - client_rank = 1 - for edge_id in edge_id_list: - topic_start_train = "flserver_agent/" + str(edge_id) + "/start_train" - logging.info("start_train: send topic " + topic_start_train + " to client...") - request_json = self.request_json - request_json["client_rank"] = client_rank - client_rank += 1 - - if active_edge_info_dict is not None: - edge_info = active_edge_info_dict.get(str(edge_id), {}) - model_master_device_id = edge_info.get("master_device_id", None) - model_slave_device_id = edge_info.get("slave_device_id", None) - model_slave_device_id_list = edge_info.get("slave_device_id_list", None) - - if should_match_gpu: - request_json["scheduler_match_info"] = SchedulerMatcher.generate_match_info_for_scheduler( - edge_id, edge_id_list, master_node_addr, master_node_port, - assigned_gpu_num_dict, assigned_gpu_ids_dict, - model_master_device_id=model_master_device_id, - model_slave_device_id=model_slave_device_id, - model_slave_device_id_list=model_slave_device_id_list - ) - - self.message_center.send_message(topic_start_train, json.dumps(request_json)) - - def setup_listeners_for_edge_status(self, run_id, edge_ids, server_id): - self.client_agent_active_list[f"{run_id}"] = dict() - self.client_agent_active_list[f"{run_id}"][f"server"] = server_id - for edge_id in edge_ids: - self.client_agent_active_list[f"{run_id}"][f"{edge_id}"] = ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE - edge_status_topic = "fl_client/flclient_agent_" + str(edge_id) + "/status" - self.add_message_listener(edge_status_topic, self.callback_edge_status) - self.subscribe_msg(edge_status_topic) - - def remove_listeners_for_edge_status(self, edge_ids=None): - if edge_ids is None: - edge_ids = self.request_json["edgeids"] - - for edge_id in edge_ids: - edge_status_topic = "fl_client/flclient_agent_" + str(edge_id) + "/status" - self.unsubscribe_msg(edge_status_topic) - - def setup_listener_for_run_metrics(self, run_id): - metric_topic = f"fedml_slave/fedml_master/metrics/{run_id}" - self.add_message_listener(metric_topic, self.callback_run_metrics) - self.subscribe_msg(metric_topic) - - def remove_listener_for_run_metrics(self, run_id): - metric_topic = f"fedml_slave/fedml_master/metrics/{run_id}" - self.unsubscribe_msg(metric_topic) - - def setup_listener_for_run_logs(self, run_id): - logs_topic = f"fedml_slave/fedml_master/logs/{run_id}" - self.add_message_listener(logs_topic, self.callback_run_logs) - self.subscribe_msg(logs_topic) - - def remove_listener_for_run_logs(self, run_id): - logs_topic = f"fedml_slave/fedml_master/logs/{run_id}" - self.unsubscribe_msg(logs_topic) - - def callback_run_logs(self, topic, payload): - run_id = str(topic).split('/')[-1] - run_id_str = str(run_id) - if self.run_logs_queue_map.get(run_id_str) is None: - self.run_logs_queue_map[run_id_str] = Queue() 
- self.run_logs_queue_map[run_id_str].put(payload) - - def callback_run_metrics(self, topic, payload): - print(f"callback_run_metrics topic {topic}, payload {payload}") - run_id = str(topic).split('/')[-1] - run_id_str = str(run_id) - if self.run_metrics_queue_map.get(run_id_str) is None: - self.run_metrics_queue_map[run_id_str] = Queue() - self.run_metrics_queue_map[run_id_str].put(payload) - - def callback_edge_status(self, topic, payload): - payload_json = json.loads(payload) - run_id = payload_json.get("run_id", None) - edge_id = payload_json.get("edge_id", None) - status = payload_json.get("status", None) - if run_id is not None and edge_id is not None: - active_item_dict = self.client_agent_active_list.get(f"{run_id}", None) - if active_item_dict is None: - return - self.client_agent_active_list[f"{run_id}"][f"{edge_id}"] = status - - if self.run_edge_id_status_queue_map.get(f"{run_id}") is None: - self.run_edge_id_status_queue_map[f"{run_id}"] = Queue() - self.run_edge_id_status_queue_map[f"{run_id}"].put(self.client_agent_active_list[f"{run_id}"]) - - self.process_job_status(run_id, edge_id, status) - - def ota_upgrade(self, payload, request_json): - run_id = request_json["runId"] - force_ota = False - ota_version = None - - try: - run_config = request_json.get("run_config", None) - parameters = run_config.get("parameters", None) - common_args = parameters.get("common_args", None) - force_ota = common_args.get("force_ota", False) - ota_version = common_args.get("ota_version", None) - except Exception as e: - pass - - if force_ota and ota_version is not None: - should_upgrade = True if ota_version != fedml.__version__ else False - upgrade_version = ota_version - else: - try: - fedml_is_latest_version, local_ver, remote_ver = sys_utils.check_fedml_is_latest_version(self.version) - except Exception as e: - return - - should_upgrade = False if fedml_is_latest_version else True - upgrade_version = remote_ver - - if should_upgrade: - job_obj = FedMLServerDataInterface.get_instance().get_job_by_id(run_id) - if job_obj is None: - FedMLServerDataInterface.get_instance(). 
\ - save_started_job(run_id, self.edge_id, time.time(), - ServerConstants.MSG_MLOPS_SERVER_STATUS_UPGRADING, - ServerConstants.MSG_MLOPS_SERVER_STATUS_UPGRADING, - payload) - - self.mlops_metrics.report_server_id_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_UPGRADING, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - logging.info(f"Upgrade to version {upgrade_version} ...") - - sys_utils.do_upgrade(self.version, upgrade_version) - - raise Exception("Restarting after upgraded...") - - def callback_start_train(self, topic=None, payload=None): - print("callback_start_train: ") - try: - MLOpsConfigs.fetch_all_configs() - except Exception as e: - pass - - # [NOTES] Example Request JSON: https://fedml-inc.larksuite.com/wiki/ScnIwUif9iupbjkYS0LuBrd6sod#WjbEdhYrvogmlGxKTOGu98C6sSb - request_json = json.loads(payload) - is_retain = request_json.get("is_retain", False) - if is_retain: - return - - # Process the log - run_id = request_json["runId"] - run_id_str = str(run_id) - if self.run_as_edge_server_and_agent or self.enable_simulation_cloud_agent: - # Start log processor for current run - self.args.run_id = run_id - self.args.edge_id = self.edge_id - MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) - MLOpsRuntimeLogDaemon.get_instance(self.args).start_log_processor( - run_id, self.edge_id, SchedulerConstants.get_log_source(request_json)) - logging.info("start the log processor.") - elif self.run_as_cloud_agent: - # Start log processor for current run - MLOpsRuntimeLogDaemon.get_instance(self.args).start_log_processor( - run_id, request_json.get("server_id", "0"), SchedulerConstants.get_log_source(request_json) - ) - elif self.run_as_cloud_server: - self.server_agent_id = request_json.get("cloud_agent_id", self.edge_id) - run_id = request_json["runId"] - run_id_str = str(run_id) - - # Start log processor for current run - self.args.run_id = run_id - MLOpsRuntimeLogDaemon.get_instance(self.args).start_log_processor( - run_id, self.edge_id, SchedulerConstants.get_log_source(request_json)) - - logging.info("callback_start_train payload: {}".format(payload)) - logging.info( - f"FedMLDebug - Receive: topic ({topic}), payload ({payload})" - ) - - # if not self.run_as_cloud_agent and not self.run_as_cloud_server: - # self.ota_upgrade(payload, request_json) - - # report server running status - if not self.run_as_cloud_server: - self.mlops_metrics.report_server_id_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_STARTING, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - - self.start_request_json = payload - self.run_id = run_id - ServerConstants.save_runner_infos(self.args.device_id + "." 
+ self.args.os_name, self.edge_id, run_id=run_id) - - # Start server with multiprocessing mode - self.request_json = request_json - self.running_request_json[run_id_str] = request_json - edge_id_list = request_json.get("edgeids", list()) - self.run_edge_ids[run_id_str] = edge_id_list - - logging.info("subscribe the client exception message.") - - if self.run_as_edge_server_and_agent or self.enable_simulation_cloud_agent: - self.init_job_task(request_json) - - self.args.run_id = run_id - - self._start_runner_process(run_id, request_json) - - ServerConstants.save_run_process(run_id, self.run_process_map[run_id_str].pid) - elif self.run_as_cloud_agent: - self.init_job_task(request_json) - - server_runner = FedMLServerRunner( - self.args, run_id=run_id, request_json=request_json, agent_config=self.agent_config - ) - server_runner.run_as_cloud_agent = self.run_as_cloud_agent - server_runner.start_request_json = json.dumps(request_json) - self.run_process_event_map[run_id_str] = multiprocessing.Event() - self.run_process_event_map[run_id_str].clear() - server_runner.run_process_event = self.run_process_event_map[run_id_str] - - if not self.use_local_process_as_cloud_server: - self.run_process_map[run_id_str] = Process(target=server_runner.start_cloud_server_process_entry) - self.run_process_map[run_id_str].start() - else: - message_bytes = json.dumps(self.request_json).encode("ascii") - base64_bytes = base64.b64encode(message_bytes) - runner_cmd_encoded = base64_bytes.decode("ascii") - logging.info("runner_cmd_encoded: {}".format(runner_cmd_encoded)) - - cloud_device_id = request_json.get("cloudServerDeviceId", "0") - - self.run_process_map[run_id_str] = Process( - target=FedMLServerRunner.start_local_cloud_server, - args=(run_id_str, self.args.user, self.version, cloud_device_id, runner_cmd_encoded)) - self.run_process_map[run_id_str].start() - time.sleep(1) - - ServerConstants.save_run_process(run_id, self.run_process_map[run_id_str].pid) - elif self.run_as_cloud_server: - self.server_agent_id = request_json.get("cloud_agent_id", self.edge_id) - self.start_request_json = json.dumps(request_json) - run_id = request_json["runId"] - run_id_str = str(run_id) - - self.init_job_task(request_json) - - self.args.run_id = run_id - - self._start_runner_process(run_id, request_json) - # ServerConstants.save_run_process(run_id, self.run_process_map[run_id_str].pid) - - @staticmethod - def start_local_cloud_server(run_id, user, version, cloud_device_id, runner_cmd_encoded): - print(f"start cloud server, device id {cloud_device_id}, runner cmd {runner_cmd_encoded}") - if not FedMLServerRunner.debug_cloud_server: - pip_source_dir = os.path.dirname(__file__) - login_cmd = os.path.join(pip_source_dir, "server_login.py") - run_cmd = f"{get_python_program()} -W ignore {login_cmd} -t login -r cloud_server -u {str(user)} " \ - f"-v {version} -id {cloud_device_id} -rc {runner_cmd_encoded}" - os.system(run_cmd) - - def _start_runner_process(self, run_id, request_json, is_server_job=False): - server_runner = FedMLServerRunner( - self.args, run_id=run_id, request_json=request_json, agent_config=self.agent_config - ) - run_id_str = str(run_id) - server_runner.run_as_edge_server_and_agent = self.run_as_edge_server_and_agent - server_runner.edge_id = self.edge_id - server_runner.server_agent_id = self.server_agent_id - server_runner.start_request_json = json.dumps(request_json) - self.run_process_event_map[run_id_str] = multiprocessing.Event() - self.run_process_event_map[run_id_str].clear() - 
server_runner.run_process_event = self.run_process_event_map[run_id_str] - self.run_process_completed_event_map[run_id_str] = multiprocessing.Event() - self.run_process_completed_event_map[run_id_str].clear() - server_runner.run_process_completed_event = self.run_process_completed_event_map[run_id_str] - if self.run_edge_id_status_queue_map.get(run_id_str, None) is None: - self.run_edge_id_status_queue_map[run_id_str] = Queue() - if self.run_edge_device_info_queue_map.get(run_id_str, None) is None: - self.run_edge_device_info_queue_map[run_id_str] = Queue() - if self.run_metrics_queue_map.get(run_id_str, None) is None: - self.run_metrics_queue_map[run_id_str] = Queue() - if self.run_events_queue_map.get(run_id_str, None) is None: - self.run_events_queue_map[run_id_str] = Queue() - if self.run_artifacts_queue_map.get(run_id_str, None) is None: - self.run_artifacts_queue_map[run_id_str] = Queue() - if self.run_logs_queue_map.get(run_id_str, None) is None: - self.run_logs_queue_map[run_id_str] = Queue() - # if self.run_edge_device_info_global_queue is None: - # self.run_edge_device_info_global_queue = Array('i', list()) - server_runner.edge_id_status_queue = self.run_edge_id_status_queue_map[run_id_str] - server_runner.edge_device_info_queue = self.run_edge_device_info_queue_map[run_id_str] - self.run_process_map[run_id_str] = Process( - target=server_runner.run if not is_server_job else server_runner.run_server_job_impl, args=( - self.run_process_event_map[run_id_str], self.run_process_completed_event_map[run_id_str], - self.run_edge_id_status_queue_map[run_id_str], self.run_edge_device_info_queue_map[run_id_str], - self.run_metrics_queue_map[run_id_str], self.run_events_queue_map[run_id_str], - self.run_artifacts_queue_map[run_id_str], self.run_logs_queue_map[run_id_str], - self.message_center.get_message_queue(), - self.run_edge_device_info_global_queue - ) - ) - self.run_process_map[run_id_str].start() - ServerConstants.save_run_process(run_id, self.run_process_map[run_id_str].pid) - - def start_cloud_server_process_entry(self): - try: - self.start_cloud_server_process() - except Exception as e: - pass - - def start_cloud_server_process(self): - run_config = self.request_json["run_config"] - packages_config = run_config["packages_config"] - self.start_cloud_server(packages_config) - - def start_cloud_server(self, packages_config): - server_id = self.request_json["server_id"] - self.cloud_server_name = FedMLServerRunner.FEDML_CLOUD_SERVER_PREFIX + str(self.run_id) + "-" + str(server_id) - self.server_docker_image = ( - self.agent_config["docker_config"]["registry_server"] - + self.agent_config["docker_config"]["registry_dir"] - + self.server_docker_base_image - ) - - logging.info("docker image {}".format(self.server_docker_image)) - # logging.info("file_sys_driver {}".format(self.agent_config["docker_config"]["file_sys_driver"])) - - registry_secret_cmd = ( - "kubectl create namespace fedml-devops-aggregator-" - + self.version - + ";kubectl -n fedml-devops-aggregator-" - + self.version - + " delete secret secret-" - + self.cloud_server_name - + " ;kubectl create secret docker-registry secret-" - + self.cloud_server_name - + " --docker-server=" - + self.agent_config["docker_config"]["registry_server"] - + " --docker-username=" - + self.agent_config["docker_config"]["user_name"] - + " --docker-password=$(aws ecr-public get-login-password --region " - + self.agent_config["docker_config"]["public_cloud_region"] - + ")" - + " --docker-email=fedml@fedml.ai -n fedml-devops-aggregator-" - + 
self.version - ) - logging.info("Create secret cmd: " + registry_secret_cmd) - os.system(registry_secret_cmd) - - message_bytes = json.dumps(self.request_json).encode("ascii") - base64_bytes = base64.b64encode(message_bytes) - runner_cmd_encoded = base64_bytes.decode("ascii") - logging.info("runner_cmd_encoded: {}".format(runner_cmd_encoded)) - # logging.info("runner_cmd_decoded: {}".format(base64.b64decode(runner_cmd_encoded).decode())) - cur_dir = os.path.dirname(__file__) - run_deployment_cmd = ( - "export FEDML_AGGREGATOR_NAME=" - + self.cloud_server_name - + ";export FEDML_AGGREGATOR_SVC=" - + self.cloud_server_name - + ";export FEDML_AGGREGATOR_VERSION=" - + self.version - + ';export FEDML_AGGREGATOR_IMAGE_PATH="' - + self.server_docker_image - + '"' - + ";export FEDML_CONF_ID=" - + self.cloud_server_name - + ";export FEDML_DATA_PV_ID=" - + self.cloud_server_name - + ";export FEDML_DATA_PVC_ID=" - + self.cloud_server_name - + ";export FEDML_REGISTRY_SECRET_SUFFIX=" - + self.cloud_server_name - + ";export FEDML_ACCOUNT_ID=0" - + ";export FEDML_SERVER_DEVICE_ID=" - + self.request_json.get("cloudServerDeviceId", "0") - + ";export FEDML_VERSION=" - + self.version - + ";export FEDML_PACKAGE_NAME=" - + packages_config.get("server", "") - + ";export FEDML_PACKAGE_URL=" - + packages_config.get("serverUrl", "") - + ";export FEDML_RUNNER_CMD=" - + runner_cmd_encoded - + ";envsubst < " - + os.path.join(cur_dir, "templates", "fedml-server-deployment.yaml") - + " | kubectl apply -f - " - ) - logging.info("FedMLServerRunner.run with k8s: " + run_deployment_cmd) - os.system(run_deployment_cmd) - - def stop_cloud_server(self): - self.cloud_server_name = FedMLServerRunner.FEDML_CLOUD_SERVER_PREFIX + str(self.run_id) \ - + "-" + str(self.edge_id) - self.server_docker_image = ( - self.agent_config["docker_config"]["registry_server"] - + self.agent_config["docker_config"]["registry_dir"] - + self.server_docker_base_image - ) - delete_deployment_cmd = ( - "export FEDML_AGGREGATOR_NAME=" - + self.cloud_server_name - + ";export FEDML_AGGREGATOR_SVC=" - + self.cloud_server_name - + ";export FEDML_AGGREGATOR_VERSION=" - + self.version - + ';export FEDML_AGGREGATOR_IMAGE_PATH="' - + self.server_docker_image - + '"' - + ";export FEDML_CONF_ID=" - + self.cloud_server_name - + ";export FEDML_DATA_PV_ID=" - + self.cloud_server_name - + ";export FEDML_DATA_PVC_ID=" - + self.cloud_server_name - + ";export FEDML_REGISTRY_SECRET_SUFFIX=" - + self.cloud_server_name - + ";kubectl -n fedml-devops-aggregator-" - + self.version - + " delete deployment " - + self.cloud_server_name - + ";kubectl -n fedml-devops-aggregator-" - + self.version - + " delete svc " - + self.cloud_server_name - + ";kubectl -n fedml-devops-aggregator-" - + self.version - + " delete secret secret-" - + self.cloud_server_name - ) - logging.info("FedMLServerRunner.stop_run with k8s: " + delete_deployment_cmd) - os.system(delete_deployment_cmd) - - def setup_message_center(self): - if self.message_center is not None: - return - - self.message_center = FedMLMessageCenter(agent_config=self.agent_config) - self.message_center.start_sender() - - if self.mlops_metrics is None: - self.mlops_metrics = MLOpsMetrics() - self.mlops_metrics.set_messenger(self.message_center) - self.mlops_metrics.run_id = self.run_id - self.mlops_metrics.edge_id = self.edge_id - self.mlops_metrics.server_agent_id = self.server_agent_id - - def rebuild_message_center(self, message_center_queue): - self.message_center = FedMLMessageCenter(message_queue=message_center_queue) - - if 
self.mlops_metrics is None: - self.mlops_metrics = MLOpsMetrics() - self.mlops_metrics.set_messenger(self.message_center) - self.mlops_metrics.run_id = self.run_id - self.mlops_metrics.edge_id = self.edge_id - self.mlops_metrics.server_agent_id = self.server_agent_id - - def release_message_center(self): - try: - if self.message_center is not None: - self.message_center.stop() - self.message_center = None - - except Exception as e: - logging.error( - f"Failed to release client mqtt manager with Exception {e}. Traceback: {traceback.format_exc()}") - pass - - def send_training_stop_request_to_edges( - self, edge_id_list, payload=None, run_id=0): - if payload is None: - payload_obj = {"runId": run_id, "edgeids": edge_id_list} - else: - payload_obj = json.loads(payload) - - for edge_id in edge_id_list: - topic_stop_train = "flserver_agent/" + str(edge_id) + "/stop_train" - logging.info("stop_train: send topic " + topic_stop_train) - self.message_center.send_message(topic_stop_train, json.dumps(payload_obj)) - - def send_training_stop_request_to_specific_edge(self, edge_id, payload): - topic_stop_train = "flserver_agent/" + str(edge_id) + "/stop_train" - logging.info("stop_train: send topic " + topic_stop_train) - self.message_center.send_message(topic_stop_train, payload) - - def send_training_stop_request_to_cloud_server(self, edge_id, payload): - topic_stop_train = "mlops/flserver_agent_" + str(edge_id) + "/stop_train" - logging.info("stop_train: send topic " + topic_stop_train) - self.message_center.send_message(topic_stop_train, payload) - - def send_training_stop_request_to_edges_when_exception( - self, edge_id_list, payload=None, run_id=0, server_id=None, status=None): - if payload is None: - payload_obj = {"runId": run_id, "edgeids": edge_id_list} - if server_id is not None: - payload_obj["serverId"] = server_id - else: - payload_obj = json.loads(payload) - payload_obj["run_status"] = ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION if status is None else status - topic_stop_train = "flserver_agent/" + str(self.edge_id) + "/stop_train" - self.callback_stop_train(topic_stop_train, json.dumps(payload_obj), use_payload=payload_obj) - - def callback_stop_train(self, topic, payload, use_payload=None): - # logging.info("callback_stop_train: topic = %s, payload = %s" % (topic, payload)) - logging.info( - f"FedMLDebug - Receive: topic ({topic}), payload ({payload})" - ) - - request_json = json.loads(payload) - is_retain = request_json.get("is_retain", False) - if is_retain: - return - run_id = request_json.get("runId", None) - if run_id is None: - run_id = request_json.get("id", None) - - edge_id_list = request_json["edgeids"] - server_id = request_json.get("serverId", None) - if server_id is None: - server_id = request_json.get("server_id", None) - - if run_id is None or server_id is None: - logging.info("Json format is not correct!") - return - - # logging.info("Stop run with multiprocessing.") - - # Stop server with multiprocessing mode - run_id_str = str(run_id) - stop_request_json = self.running_request_json.get(run_id_str, None) - if stop_request_json is None: - stop_request_json = request_json - if use_payload is not None: - stop_request_json = use_payload - - if self.run_process_event_map.get(run_id_str) is not None: - self.run_process_event_map.get(run_id_str).set() - - if self.run_as_edge_server_and_agent or self.enable_simulation_cloud_agent: - server_runner = FedMLServerRunner( - self.args, run_id=run_id, request_json=stop_request_json, agent_config=self.agent_config, - 
edge_id=self.edge_id - ) - server_runner.run_as_edge_server_and_agent = self.run_as_edge_server_and_agent - self.run_process_event_map_for_stop[run_id_str] = multiprocessing.Event() - if self.run_edge_id_status_queue_map.get(run_id_str, None) is None: - self.run_edge_id_status_queue_map[run_id_str] = Queue() - if self.run_edge_device_info_queue_map_for_stop.get(run_id_str, None) is None: - self.run_edge_device_info_queue_map_for_stop[run_id_str] = Queue() - # if self.run_edge_device_info_global_queue_for_stop is None: - # self.run_edge_device_info_global_queue_for_stop = Array('i', list()) - - self.run_stop_process_map[run_id_str] = Process( - target=server_runner.run_stop, args=( - self.run_process_event_map_for_stop[run_id_str], - self.run_edge_id_status_queue_map[run_id_str], - self.run_edge_device_info_queue_map_for_stop[run_id_str], - self.run_edge_device_info_global_queue_for_stop, - self.message_center.get_message_queue(), - ) - ) - self.run_stop_process_map[run_id_str].start() - elif self.run_as_cloud_agent: - self.send_training_stop_request_to_cloud_server(server_id, payload) - return - elif self.run_as_cloud_server: - # if not self.use_local_process_as_cloud_server: - server_runner = FedMLServerRunner( - self.args, run_id=run_id, request_json=stop_request_json, agent_config=self.agent_config, - edge_id=server_id - ) - server_runner.run_as_cloud_agent = self.run_as_cloud_agent - self.run_process_event_map_for_stop[run_id_str] = multiprocessing.Event() - if self.run_edge_id_status_queue_map.get(run_id_str, None) is None: - self.run_edge_id_status_queue_map[run_id_str] = Queue() - if self.run_edge_device_info_queue_map_for_stop.get(run_id_str, None) is None: - self.run_edge_device_info_queue_map_for_stop[run_id_str] = Queue() - # if self.run_edge_device_info_global_queue_for_stop is None: - # self.run_edge_device_info_global_queue_for_stop = Array('i', list()) - - self.run_stop_process_map[run_id_str] = Process( - target=server_runner.run_stop, args=( - self.run_process_event_map_for_stop[run_id_str], - self.run_edge_id_status_queue_map[run_id_str], - self.run_edge_device_info_queue_map_for_stop[run_id_str], - self.run_edge_device_info_global_queue_for_stop, - self.message_center.get_message_queue(), - ) - ) - self.run_stop_process_map[run_id_str].start() - return - - if self.running_request_json.get(run_id_str, None) is not None: - self.running_request_json.pop(run_id_str) - - if self.run_process_map.get(run_id_str, None) is not None: - self.run_process_map.pop(run_id_str) - - def run_stop(self, process_event, edge_id_status_queue, edge_device_info_queue, - edge_device_info_global_queue, message_center_queue): - if platform.system() != "Windows": - os.setsid() - - os.environ['PYTHONWARNINGS'] = 'ignore:semaphore_tracker:UserWarning' - os.environ.setdefault('PYTHONWARNINGS', 'ignore:semaphore_tracker:UserWarning') - - self.run_process_event = process_event - try: - MLOpsUtils.set_ntp_offset(self.ntp_offset) - - self.rebuild_message_center(message_center_queue) - - self.run_stop_impl(edge_id_status_queue, edge_device_info_queue, edge_device_info_global_queue) - except Exception as e: - logging.error("Stop runner exits with exceptions. 
{}".format(traceback.format_exc())) - finally: - logging.info("Release resources.") - - def run_stop_impl(self, edge_id_status_queue, edge_device_info_queue, edge_device_info_global_queue): - run_id_str = str(self.run_id) - edge_id_list = self.request_json["edgeids"] - - # Detect running status of all edges - status_ok, active_edge_info_dict, inactivate_edges = self.detect_edges_status( - edge_device_info_queue, edge_device_info_global_queue=edge_device_info_global_queue, - status_timeout=120, need_to_trigger_exception=False, - status_check_context=SchedulerConstants.STATUS_CHECK_FRO_RUN_STOP_CONTEXT) - - # Send the training stopping request to running edges. - for edge_id_item, _ in active_edge_info_dict.items(): - self.send_training_stop_request_to_specific_edge(edge_id_item, json.dumps(self.request_json)) - time.sleep(0.2) - time.sleep(3) - - total_sleep_seconds = 0 - allowed_status_check_sleep_seconds = 60 - server_id = self.edge_id - running_edges_list = list() - current_edge_id_status_map = dict() - - while True: - # Fetch edge id and status from the edge id status queue - while True: - try: - queue_item = edge_id_status_queue.get(block=False, timeout=3) - if queue_item is not None: - current_edge_id_status_map.update(queue_item) - except queue.Empty as e: # If queue is empty, then break loop - break - - # Calc the total killed device number - running_edges_list.clear() - number_of_failed_edges = 0 - number_of_finished_edges = 0 - number_of_killed_edges = 0 - for edge_id_item, status_item in current_edge_id_status_map.items(): - if edge_id_item == "server": - continue - - if status_item is None or status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED or \ - status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_EXCEPTION: - number_of_failed_edges += 1 - continue - - if status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED: - number_of_finished_edges += 1 - continue - - if status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED: - number_of_killed_edges += 1 - continue - - if status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE or \ - status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE: - continue - - running_edges_list.append(edge_id_item) - - # If the killed device number is equal total device number, then break - if len(running_edges_list) <= 0 and len(current_edge_id_status_map.keys()) == len(edge_id_list) + 1: - break - - # Calc the timeout value to wait to device killed. 
- time.sleep(3) - total_sleep_seconds += 3 - if total_sleep_seconds < allowed_status_check_sleep_seconds: - continue - - # If timeout, then report killed device status - no_response_edges = list(set(edge_id_list) - set(running_edges_list)) - if len(no_response_edges) <= 0: - break - for edge_id_item in no_response_edges: - self.mlops_metrics.report_client_id_status( - edge_id_item, ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED, - server_id=self.edge_id, run_id=self.run_id) - - if self.run_as_edge_server_and_agent or self.enable_simulation_cloud_agent: - # Stop log processor for current run - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, self.edge_id) - elif self.run_as_cloud_agent: - # Stop log processor for current run - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, server_id) - - self.mlops_metrics.report_server_id_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED, edge_id=self.edge_id, - server_id=self.edge_id, server_agent_id=self.edge_id) - - def set_run_status(self, run_id, status, running_request_json): - server_runner = FedMLServerRunner( - self.args, run_id=run_id, request_json=running_request_json, agent_config=self.agent_config - ) - server_runner.edge_id = self.edge_id - server_runner.run_as_edge_server_and_agent = self.run_as_edge_server_and_agent - server_runner.run_status = status - server_runner.message_center = self.message_center - server_runner.mlops_metrics = self.mlops_metrics - server_runner.cleanup_client_with_status() - - def callback_runner_id_status(self, topic, payload): - # logging.info("callback_runner_id_status: topic = %s, payload = %s" % (topic, payload)) - # logging.info( - # f"FedMLDebug - Receive: topic ({topic}), payload ({payload})" - # ) - - request_json = json.loads(payload) - is_retain = request_json.get("is_retain", False) - if is_retain: - return - run_id = request_json["run_id"] - status = request_json["status"] - edge_id = request_json["edge_id"] - server_id = request_json.get("server_id", None) - run_id_str = str(run_id) - - if ( - status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED - or status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED - or status == ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED - ): - completed_event = self.run_process_completed_event_map.get(run_id_str, None) - if completed_event is not None: - completed_event.set() - - FedMLServerDataInterface.get_instance().save_job_status(run_id, self.edge_id, status, status) - - # Stop server with multiprocessing mode - running_request_json = self.running_request_json.get(run_id_str, None) - if running_request_json is None: - running_request_json = request_json - if self.run_as_edge_server_and_agent or self.enable_simulation_cloud_agent: - self.set_run_status(run_id, status, running_request_json) - - run_process = self.run_process_map.get(run_id_str, None) - if run_process is not None: - if run_process.pid is not None: - RunProcessUtils.kill_process(run_process.pid) - - self.run_process_map.pop(run_id_str) - - # Stop log processor for current run - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id) - elif self.run_as_cloud_agent: - pass - elif self.run_as_cloud_server: - self.set_run_status(run_id, status, running_request_json) - - # Stop log processor for current run - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id) - if self.use_local_process_as_cloud_server: - # RunProcessUtils.kill_process(os.getpid()) - 
cloud_server_process = self.run_process_map.get(run_id_str, None) - if cloud_server_process is not None: - RunProcessUtils.kill_process(cloud_server_process.pid) - else: - self.stop_cloud_server() - - if self.run_process_map.get(run_id_str, None) is not None: - self.run_process_map.pop(run_id_str) - - self.remove_listener_for_run_metrics(self.run_id) - self.remove_listener_for_run_logs(self.run_id) - elif ( - status == ServerConstants.MSG_MLOPS_SERVER_STATUS_EXCEPTION - ): - request_json = self.running_request_json.get(run_id_str, None) - if request_json is not None: - edge_id_list = request_json.get("edgeids", list()) - server_id = request_json.get("serverId", None) - server_id = request_json.get("server_id", None) if server_id is None else server_id - self.send_training_stop_request_to_edges_when_exception( - edge_id_list, run_id=run_id, server_id=server_id, - status=ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED) - - FedMLServerDataInterface.get_instance().save_job_status(run_id, self.edge_id, status, status) - else: - request_json = self.running_request_json.get(run_id_str, None) - if request_json is None: - request_json = self.start_request_json - self.mlops_metrics.report_server_training_status( - run_id, status, edge_id=self.edge_id, running_json=json.dumps(request_json)) - - def cleanup_client_with_status(self): - if self.run_status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED: - # logging.info("received to finished status.") - self.cleanup_run_when_finished(should_send_server_id_status=False) - elif self.run_status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED: - # logging.info("received to failed status.") - self.cleanup_run_when_starting_failed(should_send_server_id_status=False) - elif self.run_status == ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED: - # logging.info("received to failed status.") - self.cleanup_run_when_starting_failed( - status=self.run_status, should_send_server_id_status=False) - - def callback_report_current_status(self, topic, payload): - logging.info( - f"FedMLDebug - Receive: topic ({topic}), payload ({payload})" - ) - - request_json = json.loads(payload) - if self.run_as_edge_server_and_agent: - self.send_agent_active_msg() - elif self.run_as_cloud_agent: - self.send_agent_active_msg() - elif self.run_as_cloud_server: - pass - - @staticmethod - def process_ota_upgrade_msg(): - os.system("pip install -U fedml") - - def callback_server_ota_msg(self, topic, payload): - logging.info( - f"FedMLDebug - Receive: topic ({topic}), payload ({payload})" - ) - - request_json = json.loads(payload) - cmd = request_json["cmd"] - - if cmd == ServerConstants.FEDML_OTA_CMD_UPGRADE: - try: - self.process_ota_upgrade_msg() - # Process(target=FedMLServerRunner.process_ota_upgrade_msg).start() - raise Exception("After upgraded, restart runner...") - except Exception as e: - pass - elif cmd == ServerConstants.FEDML_OTA_CMD_RESTART: - raise Exception("Restart runner...") - - def callback_response_device_info(self, topic, payload): - # Parse payload - payload_json = json.loads(payload) - run_id = payload_json.get("run_id", 0) - context = payload_json.get("context", None) - master_device_id = payload_json.get("master_device_id", 0) - slave_device_id = payload_json.get("slave_device_id", 0) - slave_device_id_list = payload_json.get("slave_device_id_list", 0) - edge_id = payload_json.get("edge_id", 0) - device_info = payload_json.get("edge_info", 0) - device_info["master_device_id"] = master_device_id - device_info["slave_device_id"] = slave_device_id - 
device_info["slave_device_id_list"] = slave_device_id_list - run_id_str = str(run_id) - - # Put device info into a multiprocessing queue so master runner checks if all edges are ready - if context is None: - if self.run_edge_device_info_queue_map.get(run_id_str, None) is None: - self.run_edge_device_info_queue_map[run_id_str] = Queue() - self.run_edge_device_info_queue_map[run_id_str].put(device_info) - - # if self.run_edge_device_info_global_queue is None: - # self.run_edge_device_info_global_queue = Array('i', list()) - # - # self.run_edge_device_info_global_queue[len(self.run_edge_device_info_global_queue)] = \ - # {"timestamp": time.time(), "edge_id": edge_id, "device_info": device_info} - - self.check_model_device_ready_and_deploy(run_id, master_device_id, slave_device_id, - slave_device_id_list=slave_device_id_list) - elif context == SchedulerConstants.STATUS_CHECK_FRO_RUN_STOP_CONTEXT: - if self.run_edge_device_info_queue_map_for_stop.get(run_id_str, None) is None: - self.run_edge_device_info_queue_map_for_stop[run_id_str] = Queue() - self.run_edge_device_info_queue_map_for_stop[run_id_str].put(device_info) - - # if self.run_edge_device_info_global_queue_for_stop is None: - # self.run_edge_device_info_global_queue_for_stop = Array('i', list()) - # - # self.run_edge_device_info_global_queue_for_stop[len(self.run_edge_device_info_global_queue_for_stop)] = \ - # {"timestamp": time.time(), "edge_id": edge_id, "device_info": device_info} - - def check_model_device_ready_and_deploy(self, run_id, master_device_id, slave_device_id, slave_device_id_list=None): - request_json = self.running_request_json.get(str(run_id), None) - if request_json is None: - return - run_config = request_json["run_config"] - run_params = run_config.get("parameters", {}) - job_yaml = run_params.get("job_yaml", {}) - job_type = job_yaml.get("job_type", None) - job_type = job_yaml.get("task_type", Constants.JOB_TASK_TYPE_TRAIN) if job_type is None else job_type - if job_type != Constants.JOB_TASK_TYPE_DEPLOY and job_type != Constants.JOB_TASK_TYPE_SERVE: - return - - # Init model device ids for each run - run_id_str = str(run_id) - if self.run_model_device_ids.get(run_id_str, None) is None: - self.run_model_device_ids[run_id_str] = list() - - # Append master device and slave devices to the model devices map - self.run_model_device_ids[run_id_str].append({"master_device_id": master_device_id, - "slave_device_id": slave_device_id}) - model_device_ids = self.run_model_device_ids.get(run_id_str, None) - if model_device_ids is None: - return - - # Check if all model devices are ready - if len(model_device_ids) != len(self.run_edge_ids.get(run_id_str, list())): - return - - # Generate model master ids and model slave device ids - device_master_ids = list() - device_slave_ids = list() - for device_ids in model_device_ids: - model_master_id = device_ids.get("master_device_id") - model_slave_id = device_ids.get("slave_device_id") - device_master_ids.append(model_master_id) - device_slave_ids.append(model_slave_id) - - if len(device_master_ids) <= 0: - return - - # Generate serving devices for deploying - serving_devices = list() - serving_devices.append(device_master_ids[0]) - serving_devices.extend(device_slave_ids) - - # Start to deploy the model - self.deploy_model(serving_devices, request_json, run_id=run_id) - - def callback_request_device_info_from_mlops(self, topic, payload): - self.response_device_info_to_mlops(topic, payload) - - def response_device_info_to_mlops(self, topic, payload): - response_topic = 
f"deploy/master_agent/mlops/response_device_info" - payload_json = json.loads(payload) - need_gpu_info = payload_json.get("need_gpu_info", False) - if self.mlops_metrics is not None: - if not need_gpu_info: - response_payload = { - "run_id": self.run_id, - "master_agent_device_id": self.edge_id, - "fedml_version": fedml.__version__ - } - else: - total_mem, free_mem, total_disk_size, free_disk_size, cup_utilization, cpu_cores, \ - gpu_cores_total, gpu_cores_available, sent_bytes, recv_bytes, gpu_available_ids = \ - sys_utils.get_sys_realtime_stats() - gpu_available_ids = JobRunnerUtils.get_instance().get_available_gpu_id_list(self.edge_id) - gpu_available_ids = JobRunnerUtils.trim_unavailable_gpu_ids(gpu_available_ids) - gpu_cores_available = len(gpu_available_ids) - response_payload = { - "run_id": self.run_id, - "master_agent_device_id": self.edge_id, - "memoryTotal": round(total_mem * MLOpsUtils.BYTES_TO_GB, 2), - "memoryAvailable": round(free_mem * MLOpsUtils.BYTES_TO_GB, 2), - "diskSpaceTotal": round(total_disk_size * MLOpsUtils.BYTES_TO_GB, 2), - "diskSpaceAvailable": round(free_disk_size * MLOpsUtils.BYTES_TO_GB, 2), - "cpuUtilization": round(cup_utilization, 2), - "cpuCores": cpu_cores, - "gpuCoresTotal": gpu_cores_total, - "gpuCoresAvailable": gpu_cores_available, - "networkTraffic": sent_bytes + recv_bytes, - "timestamp": int(MLOpsUtils.get_ntp_time()), - "fedml_version": fedml.__version__ - } - self.mlops_metrics.report_json_message(response_topic, json.dumps(response_payload)) - - @staticmethod - def get_device_id(): - device_file_path = os.path.join(ServerConstants.get_data_dir(), ServerConstants.LOCAL_RUNNER_INFO_DIR_NAME) - file_for_device_id = os.path.join(device_file_path, "devices.id") - if not os.path.exists(device_file_path): - os.makedirs(device_file_path) - elif os.path.exists(file_for_device_id): - with open(file_for_device_id, 'r', encoding='utf-8') as f: - device_id_from_file = f.readline() - if device_id_from_file is not None and device_id_from_file != "": - return device_id_from_file - - if platform.system() == "Darwin": - cmd_get_serial_num = "system_profiler SPHardwareDataType | grep Serial | awk '{gsub(/ /,\"\")}{print}' " \ - "|awk -F':' '{print $2}' " - device_id = os.popen(cmd_get_serial_num).read() - device_id = device_id.replace('\n', '').replace(' ', '') - if device_id is None or device_id == "": - device_id = hex(uuid.getnode()) - else: - device_id = "0x" + device_id - else: - if "nt" in os.name: - - def get_uuid(): - guid = "" - try: - cmd = "wmic csproduct get uuid" - guid = str(subprocess.check_output(cmd)) - pos1 = guid.find("\\n") + 2 - guid = guid[pos1:-15] - except Exception as ex: - pass - return str(guid) - - device_id = str(get_uuid()) - elif "posix" in os.name: - device_id = sys_utils.get_device_id_in_docker() - if device_id is None: - device_id = hex(uuid.getnode()) - else: - device_id = sys_utils.run_subprocess_open( - "hal-get-property --udi /org/freedesktop/Hal/devices/computer --key system.hardware.uuid".split() - ) - device_id = hex(device_id) - - if device_id is not None and device_id != "": - with open(file_for_device_id, 'w', encoding='utf-8') as f: - f.write(device_id) - else: - device_id = hex(uuid.uuid4()) - with open(file_for_device_id, 'w', encoding='utf-8') as f: - f.write(device_id) - - return device_id - - def bind_account_and_device_id(self, url, account_id, device_id, os_name, api_key="", role=None): - if role is None: - role = "edge_server" - if self.run_as_edge_server_and_agent: - role = "edge_server" - elif 
self.run_as_cloud_agent: - role = "cloud_agent" - elif self.run_as_cloud_server: - role = "cloud_server" - - ip = requests.get('https://checkip.amazonaws.com').text.strip() - fedml_ver, exec_path, os_ver, cpu_info, python_ver, torch_ver, mpi_installed, \ - cpu_usage, available_mem, total_mem, gpu_info, gpu_available_mem, gpu_total_mem, \ - gpu_count, gpu_vendor, cpu_count, gpu_device_name = get_sys_runner_info() - host_name = sys_utils.get_host_name() - json_params = { - "accountid": account_id, - "deviceid": device_id, - "type": os_name, - "state": ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE, - "status": ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE, - "processor": cpu_info, - "core_type": cpu_info, - "network": "", - "role": role, - "os_ver": os_ver, - "memory": total_mem, - "ip": ip, - "api_key": api_key, - "extra_infos": {"fedml_ver": fedml_ver, "exec_path": exec_path, "os_ver": os_ver, - "cpu_info": cpu_info, "python_ver": python_ver, "torch_ver": torch_ver, - "mpi_installed": mpi_installed, "cpu_usage": cpu_usage, - "available_mem": available_mem, "total_mem": total_mem, - "cpu_count": cpu_count, "gpu_count": 0, "host_name": host_name} - } - if gpu_count > 0: - if gpu_total_mem is not None: - json_params["gpu"] = gpu_info if gpu_info is not None else "" + ", Total GPU Memory: " + gpu_total_mem - else: - json_params["gpu"] = gpu_info if gpu_info is not None else "" - json_params["extra_infos"]["gpu_info"] = gpu_info if gpu_info is not None else "" - if gpu_available_mem is not None: - json_params["extra_infos"]["gpu_available_mem"] = gpu_available_mem - if gpu_total_mem is not None: - json_params["extra_infos"]["gpu_total_mem"] = gpu_total_mem - - json_params["extra_infos"]["gpu_count"] = gpu_count - json_params["extra_infos"]["gpu_vendor"] = gpu_vendor - json_params["extra_infos"]["gpu_device_name"] = gpu_device_name - - gpu_available_id_list = sys_utils.get_available_gpu_id_list(limit=gpu_count) - gpu_available_count = len(gpu_available_id_list) if gpu_available_id_list is not None else 0 - gpu_list = sys_utils.get_gpu_list() - json_params["extra_infos"]["gpu_available_count"] = gpu_available_count - json_params["extra_infos"]["gpu_available_id_list"] = gpu_available_id_list - json_params["extra_infos"]["gpu_list"] = gpu_list - else: - json_params["gpu"] = "None" - json_params["extra_infos"]["gpu_available_count"] = 0 - json_params["extra_infos"]["gpu_available_id_list"] = [] - json_params["extra_infos"]["gpu_list"] = [] - - _, cert_path = MLOpsConfigs.get_request_params() - if cert_path is not None: - try: - requests.session().verify = cert_path - response = requests.post( - url, json=json_params, verify=True, - headers={"content-type": "application/json", "Connection": "close"} - ) - except requests.exceptions.SSLError as err: - MLOpsConfigs.install_root_ca_file() - response = requests.post( - url, json=json_params, verify=True, - headers={"content-type": "application/json", "Connection": "close"} - ) - else: - response = requests.post(url, json=json_params, headers={"Connection": "close"}) - edge_id = -1 - user_name = None - extra_url = None - if response.status_code != 200: - print(f"Binding to MLOps with response.status_code = {response.status_code}, " - f"response.content: {response.content}") - pass - else: - # print("url = {}, response = {}".format(url, response)) - status_code = response.json().get("code") - if status_code == "SUCCESS": - edge_id = response.json().get("data").get("id") - user_name = response.json().get("data").get("userName", None) - extra_url = 
response.json().get("data").get("url", None) - if edge_id is None or edge_id <= 0: - print(f"Binding to MLOps with response.status_code = {response.status_code}, " - f"response.content: {response.content}") - else: - if status_code == SchedulerConstants.BINDING_ACCOUNT_NOT_EXIST_ERROR: - raise SystemExit(SchedulerConstants.BINDING_ACCOUNT_NOT_EXIST_ERROR) - print(f"Binding to MLOps with response.status_code = {response.status_code}, " - f"response.content: {response.content}") - return -1, None, None - return edge_id, user_name, extra_url - - def fetch_configs(self): - return MLOpsConfigs.fetch_all_configs() - - def send_agent_active_msg(self): - active_topic = "flserver_agent/active" - status = MLOpsStatus.get_instance().get_server_agent_status(self.edge_id) - if ( - status is not None - and status != ServerConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE - and status != ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE - ): - return - - if self.run_as_cloud_agent: - status = ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE - else: - try: - current_job = FedMLServerDataInterface.get_instance().get_job_by_id(self.run_id) - except Exception as e: - current_job = None - if current_job is None: - if status is not None and status == ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE: - status = ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE - else: - return - else: - status = ServerConstants.get_device_state_from_run_edge_state(current_job.status) - active_msg = {"ID": self.edge_id, "status": status} - MLOpsStatus.get_instance().set_server_agent_status(self.edge_id, status) - if self.mqtt_mgr is not None: - self.mqtt_mgr.send_message_json(active_topic, json.dumps(active_msg)) - else: - self.send_message_json(active_topic, json.dumps(active_msg)) - - def recover_start_train_msg_after_upgrading(self): - try: - current_job = FedMLServerDataInterface.get_instance().get_current_job() - if current_job is not None and \ - current_job.status == ServerConstants.MSG_MLOPS_SERVER_STATUS_UPGRADING: - logging.info("start training after upgrading.") - server_agent_id = self.edge_id - topic_start_train = "mlops/flserver_agent_" + str(server_agent_id) + "/start_train" - self.callback_start_train(topic_start_train, current_job.running_json) - except Exception as e: - logging.info("recover starting train message after upgrading: {}".format(traceback.format_exc())) - - def on_agent_mqtt_connected(self, mqtt_client_object): - # The MQTT message topic format is as follows: // - - # Setup MQTT message listener for starting training - server_agent_id = self.edge_id - topic_start_train = "mlops/flserver_agent_" + str(server_agent_id) + "/start_train" - self.add_message_listener(topic_start_train, self.callback_start_train) - self.mqtt_mgr.add_message_listener(topic_start_train, self.listener_message_dispatch_center) - - # Setup MQTT message listener for stopping training - topic_stop_train = "mlops/flserver_agent_" + str(server_agent_id) + "/stop_train" - self.add_message_listener(topic_stop_train, self.callback_stop_train) - self.mqtt_mgr.add_message_listener(topic_stop_train, self.listener_message_dispatch_center) - - # Setup MQTT message listener for server status switching - topic_server_status = "fl_server/flserver_agent_" + str(server_agent_id) + "/status" - self.add_message_listener(topic_server_status, self.callback_runner_id_status) - self.mqtt_mgr.add_message_listener(topic_server_status, self.listener_message_dispatch_center) - - # Setup MQTT message listener to report current device status. 
- topic_report_status = "mlops/report_device_status" - self.add_message_listener(topic_report_status, self.callback_report_current_status) - self.mqtt_mgr.add_message_listener(topic_report_status, self.listener_message_dispatch_center) - - # Setup MQTT message listener to OTA messages from the MLOps. - topic_ota_msg = "mlops/flserver_agent_" + str(server_agent_id) + "/ota" - self.add_message_listener(topic_ota_msg, self.callback_server_ota_msg) - self.mqtt_mgr.add_message_listener(topic_ota_msg, self.listener_message_dispatch_center) - - # Setup MQTT message listener to request device info from the client. - topic_response_device_info = "client/server/response_device_info/" + str(self.edge_id) - self.add_message_listener(topic_response_device_info, self.callback_response_device_info) - self.mqtt_mgr.add_message_listener(topic_response_device_info, self.listener_message_dispatch_center) - - # Setup MQTT message listener to request device info from MLOps. - topic_request_device_info_from_mlops = f"deploy/mlops/master_agent/request_device_info/{self.edge_id}" - self.add_message_listener(topic_request_device_info_from_mlops, self.callback_request_device_info_from_mlops) - self.mqtt_mgr.add_message_listener( - topic_request_device_info_from_mlops, self.listener_message_dispatch_center) - - # Subscribe topics for starting train, stopping train and fetching client status. - mqtt_client_object.subscribe(topic_start_train, qos=2) - mqtt_client_object.subscribe(topic_stop_train, qos=2) - mqtt_client_object.subscribe(topic_server_status, qos=2) - mqtt_client_object.subscribe(topic_report_status, qos=2) - mqtt_client_object.subscribe(topic_ota_msg, qos=2) - mqtt_client_object.subscribe(topic_response_device_info, qos=2) - mqtt_client_object.subscribe(topic_request_device_info_from_mlops, qos=2) - - self.subscribed_topics.clear() - self.subscribed_topics.append(topic_start_train) - self.subscribed_topics.append(topic_stop_train) - self.subscribed_topics.append(topic_server_status) - self.subscribed_topics.append(topic_report_status) - self.subscribed_topics.append(topic_ota_msg) - self.subscribed_topics.append(topic_response_device_info) - self.subscribed_topics.append(topic_request_device_info_from_mlops) - - # Broadcast the first active message. 
- self.send_agent_active_msg() - - # Start the message center for listener - self.start_listener(sender_message_queue=self.message_center.get_message_queue(), - agent_config=self.agent_config) - - if self.run_as_cloud_server: - # Start the FedML server - message_bytes = self.args.runner_cmd.encode("ascii") - base64_bytes = base64.b64decode(message_bytes) - payload = base64_bytes.decode("ascii") - self.receive_message_json(topic_start_train, payload) - - # Echo results - MLOpsRuntimeLog.get_instance(self.args).enable_show_log_to_stdout() - print("\nCongratulations, your device is connected to the FedML MLOps platform successfully!") - print( - "Your FedML Edge ID is " + str(self.edge_id) + ", unique device ID is " - + str(self.unique_device_id) - ) - MLOpsRuntimeLog.get_instance(self.args).enable_show_log_to_stdout(enable=True) - - def on_agent_mqtt_disconnected(self, mqtt_client_object): - MLOpsStatus.get_instance().set_server_agent_status( - self.edge_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE - ) - - def setup_agent_mqtt_connection(self, service_config): - # Setup MQTT connection - self.mqtt_mgr = MqttManager( - service_config["mqtt_config"]["BROKER_HOST"], - service_config["mqtt_config"]["BROKER_PORT"], - service_config["mqtt_config"]["MQTT_USER"], - service_config["mqtt_config"]["MQTT_PWD"], - service_config["mqtt_config"]["MQTT_KEEPALIVE"], - f"FedML_ServerAgent_Daemon_@{self.user_name}@_@{self.args.current_device_id}@_@{str(uuid.uuid4())}@", - "flserver_agent/last_will_msg", - json.dumps({"ID": self.edge_id, "status": ServerConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE}) - ) - - # Init local database - FedMLServerDataInterface.get_instance().create_job_table() - - # Start the message center to process edge related messages. - self.setup_message_center() - - server_api_cmd = "fedml.computing.scheduler.master.server_api:api" - server_api_pids = RunProcessUtils.get_pid_from_cmd_line(server_api_cmd) - if server_api_pids is None or len(server_api_pids) <= 0: - # Start local API services - cur_dir = os.path.dirname(__file__) - fedml_base_dir = os.path.dirname(os.path.dirname(os.path.dirname(cur_dir))) - python_program = get_python_program() - self.local_api_process = ServerConstants.exec_console_with_script( - "{} -m uvicorn {} --host 0.0.0.0 --port {} --reload --reload-delay 3 --reload-dir {} " - "--log-level critical".format( - python_program, server_api_cmd, ServerConstants.LOCAL_SERVER_API_PORT, - fedml_base_dir - ), - should_capture_stdout=False, - should_capture_stderr=False - ) - # if self.local_api_process is not None and self.local_api_process.pid is not None: - # print(f"Server local API process id {self.local_api_process.pid}") - - # Setup MQTT connected listener - self.mqtt_mgr.add_connected_listener(self.on_agent_mqtt_connected) - self.mqtt_mgr.add_disconnected_listener(self.on_agent_mqtt_disconnected) - self.mqtt_mgr.connect() - - # Report the IDLE status to MLOps - self.mlops_metrics.report_server_training_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE, edge_id=self.edge_id) - MLOpsStatus.get_instance().set_server_agent_status( - self.edge_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE - ) - - # MLOpsRuntimeLogDaemon.get_instance(self.args).stop_all_log_processor() - - self.mlops_metrics.stop_device_realtime_perf() - self.mlops_metrics.report_device_realtime_perf(self.args, service_config["mqtt_config"], is_client=False) - - if not self.run_as_cloud_server: - self.recover_start_train_msg_after_upgrading() - - 
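The `setup_agent_mqtt_connection` method removed above wires up a classic MQTT presence pattern: register a last-will message carrying an OFFLINE status, then publish an active/IDLE status once connected (see `send_agent_active_msg`). The sketch below shows the same pattern with plain paho-mqtt, assuming the 1.x callback API; the broker address, credentials, and edge id are placeholders, and the topic and payload shapes mirror the ones in the code above.

```python
# Sketch of the last-will + presence pattern from setup_agent_mqtt_connection,
# written against plain paho-mqtt (1.x callback API) instead of FedML's
# MqttManager. Broker address, credentials and the edge id are placeholders.
import json
import uuid

import paho.mqtt.client as mqtt

EDGE_ID = 12345                      # placeholder device id
LAST_WILL_TOPIC = "flserver_agent/last_will_msg"
ACTIVE_TOPIC = "flserver_agent/active"


def on_connect(client, userdata, flags, rc):
    # Once connected, broadcast an IDLE presence message, mirroring
    # send_agent_active_msg() in the runner above.
    client.publish(ACTIVE_TOPIC, json.dumps({"ID": EDGE_ID, "status": "IDLE"}), qos=2)


client = mqtt.Client(client_id=f"FedML_ServerAgent_Daemon_{uuid.uuid4()}")
client.username_pw_set("mqtt_user", "mqtt_password")
# If the agent dies unexpectedly, the broker publishes this OFFLINE status for us.
client.will_set(LAST_WILL_TOPIC,
                payload=json.dumps({"ID": EDGE_ID, "status": "OFFLINE"}),
                qos=2, retain=False)
client.on_connect = on_connect
client.connect("broker.example.com", 1883, keepalive=180)
client.loop_forever()
```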
JobCleanup.get_instance().sync_data_on_startup(self.edge_id, is_client=False) - - self.master_api_daemon = MasterApiDaemon() - self.master_api_process = Process(target=self.master_api_daemon.run) - self.master_api_process.start() - - # if self.model_device_server is None: - # self.model_device_server = FedMLModelDeviceServerRunner(self.args, self.args.current_device_id, - # self.args.os_name, self.args.is_from_docker, - # self.agent_config) - # self.model_device_server.start() - - def start_agent_mqtt_loop(self): - # Start MQTT message loop - try: - self.mqtt_mgr.loop_forever() - except Exception as e: - if str(e) == "Restarting after upgraded...": - logging.info("Restarting after upgraded...") - else: - logging.info("Server tracing: {}".format(traceback.format_exc())) - - finally: - login_exit_file = os.path.join(ServerConstants.get_log_file_dir(), "exited.log") - with open(login_exit_file, "w") as f: - f.writelines(f"{os.getpid()}.") - - self.stop_agent() - - time.sleep(5) - sys_utils.cleanup_all_fedml_server_login_processes( - ServerConstants.SERVER_LOGIN_PROGRAM, clean_process_group=False) - sys.exit(1) - - def stop_agent(self): - if self.run_process_event is not None: - self.run_process_event.set() - - if self.mqtt_mgr is not None: - try: - for topic in self.subscribed_topics: - self.mqtt_mgr.unsubscribe_msg(topic) - except Exception as e: - pass - - self.mqtt_mgr.loop_stop() - self.mqtt_mgr.disconnect() - self.release_message_center() - - def get_runner(self): - runner = FedMLServerRunner( - self.args, run_id=self.run_id, request_json=self.request_json, - agent_config=self.agent_config - ) - runner.run_as_edge_server_and_agent = self.run_as_edge_server_and_agent - runner.edge_id = self.edge_id - runner.server_agent_id = self.server_agent_id - runner.start_request_json = self.start_request_json - runner.unique_device_id = self.unique_device_id - runner.user_name = self.user_name - runner.run_as_edge_server_and_agent = self.run_as_edge_server_and_agent - runner.run_as_cloud_agent = self.run_as_cloud_agent - runner.run_as_cloud_server = self.run_as_cloud_server - return runner diff --git a/python/fedml/computing/scheduler/model_scheduler/autoscaler/autoscaler.py b/python/fedml/computing/scheduler/model_scheduler/autoscaler/autoscaler.py index dd6ca67706..4cab1e133c 100644 --- a/python/fedml/computing/scheduler/model_scheduler/autoscaler/autoscaler.py +++ b/python/fedml/computing/scheduler/model_scheduler/autoscaler/autoscaler.py @@ -8,7 +8,7 @@ from enum import Enum from fedml.computing.scheduler.model_scheduler.device_model_cache import FedMLModelCache from fedml.computing.scheduler.model_scheduler.autoscaler.policies import * -from utils.singleton import Singleton +from fedml.computing.scheduler.comm_utils.singleton import Singleton class ScaleOp(Enum): @@ -38,6 +38,26 @@ def get_current_timestamp_micro_seconds(cls): # in REDIS we record/operate in micro-seconds, hence the division by 1e3! return int(format(time.time_ns() / 1000.0, '.0f')) + @classmethod + def filter_by_timestamp(cls, + metrics, + before_now_minutes=None, + before_now_seconds=None) -> pd.DataFrame: + + # We subtract the number of seconds/minutes from the current timestamp, and then we query + # the data frame to fetch all the records whose timestamp is within the given range. + # By default, we return all records. 
+ filtered = metrics + if before_now_minutes: + less_than_ts = \ + str(pd.Timestamp.utcnow().replace(tzinfo=None) - pd.Timedelta(minutes=before_now_minutes)) + filtered = metrics.query("'{}' <= {}".format(less_than_ts, "timestamp")) + if before_now_seconds: + less_than_ts = \ + str(pd.Timestamp.utcnow().replace(tzinfo=None) - pd.Timedelta(seconds=before_now_seconds)) + filtered = metrics.query("'{}' <= {}".format(less_than_ts, "timestamp")) + return filtered + @classmethod def scale_operation_predictive(cls, predictive_policy: PredictivePolicy, @@ -51,17 +71,23 @@ def scale_operation_ewm(cls, ewm_policy: EWMPolicy, metrics: pd.DataFrame) -> ScaleOp: + logging.info("Executing the ExponentialWeightMoving average autoscaling policy.") # Adding the context below to avoid having a series of warning messages. with warnings.catch_warnings(): warnings.simplefilter(action='ignore', category=FutureWarning) - period_data = metrics.last("{}min".format(ewm_policy.ewm_mins)) - # If the data frame window is empty then do nothing more, just return. - if period_data.empty: - return ScaleOp.NO_OP - metric_name = "current_latency" \ - if "ewm_latency" == ewm_policy.metric else "current_qps" - ewm_period = period_data[metric_name] \ - .ewm(alpha=ewm_policy.ewm_alpha).mean() + period_data = cls.filter_by_timestamp(metrics, + before_now_minutes=ewm_policy.ewm_mins) + + # If the data frame window is empty then it means we + # did not have any incoming request, so we need to scale down. + if period_data.empty: + return ScaleOp.DOWN_IN_OP + + # Otherwise, we proceed as normal. + metric_name = "current_latency" \ + if "ewm_latency" == ewm_policy.metric else "current_qps" + ewm_period = period_data[metric_name] \ + .ewm(alpha=ewm_policy.ewm_alpha).mean() scale_op = ScaleOp.NO_OP # If there is no exponential moving average within this @@ -110,15 +136,22 @@ def scale_operation_query_concurrency(cls, concurrent_query_policy: ConcurrentQueryPolicy, metrics: pd.DataFrame) -> ScaleOp: + logging.info("Executing the QueryConcurrency autoscaling policy.") # Adding the context below to avoid having a series of warning messages. with warnings.catch_warnings(): warnings.simplefilter(action='ignore', category=FutureWarning) - # Here, the number of queries is the number of rows in the short period data frame. - period_data = metrics.last("{}s".format(concurrent_query_policy.window_size_secs)) - # If the data frame window is empty then do nothing more, just return. - if period_data.empty: - return ScaleOp.NO_OP - queries_num = period_data.shape[0] + period_data = cls.filter_by_timestamp( + metrics, + before_now_seconds=concurrent_query_policy.window_size_secs) + + # If the data frame window is empty then it means we + # did not have any incoming request, so we need to scale down. + if period_data.empty: + return ScaleOp.DOWN_IN_OP + + # Otherwise, we proceed as normal. + queries_num = period_data.shape[0] + logging.info(f"Detect {queries_num} of requests in {concurrent_query_policy.window_size_secs} seconds") try: # QSR: Queries per Second per Replica: (Number of Queries / Number of Current Replicas) / Window Size @@ -154,15 +187,20 @@ def scale_operation_meet_traffic_demand(cls, meet_traffic_demand_policy: MeetTrafficDemandPolicy, metrics: pd.DataFrame) -> ScaleOp: + logging.info("Executing the MeetTrafficDemand autoscaling policy.") # Adding the context below to avoid having a series of warning messages. 
with warnings.catch_warnings(): warnings.simplefilter(action='ignore', category=FutureWarning) - # Here, the number of queries is the number of rows in the short period data frame. - period_data = metrics.last("{}s".format(meet_traffic_demand_policy.window_size_secs)) - # If the data frame window is empty then do nothing more, just return. - if period_data.empty: - return ScaleOp.NO_OP + period_data = cls.filter_by_timestamp( + metrics, + before_now_seconds=meet_traffic_demand_policy.window_size_secs) + + # If the data frame window is empty then it means we + # did not have any incoming request, so we need to scale down. + if period_data.empty: + return ScaleOp.DOWN_IN_OP + # Otherwise, we proceed as normal. period_requests_num = period_data.shape[0] all_latencies = metrics["current_latency"] # Original value is milliseconds, convert to seconds. @@ -216,6 +254,7 @@ def run_autoscaling_policy(self, def validate_scaling_bounds(cls, scale_op: ScaleOp, autoscaling_policy: AutoscalingPolicy) -> ScaleOp: + logging.info("Validating scaling bounds.") # We cannot be lower than the minimum number of replicas, # nor exceed the maximum number of requested replicas. new_running_replicas = autoscaling_policy.current_replicas + scale_op.value @@ -242,6 +281,7 @@ def enforce_scaling_down_delay_interval(self, # If the policy has no scaledown delay then return immediately. if autoscaling_policy.scaledown_delay_secs == 0: + logging.info("No scale down delay, so scale down immediately.") return ScaleOp.DOWN_IN_OP # By default, we return a no operation. @@ -256,9 +296,12 @@ def enforce_scaling_down_delay_interval(self, self.fedml_model_cache.get_endpoint_scaling_down_decision_time(endpoint_id) diff_secs = (current_timestamp - previous_timestamp) / 1e6 if diff_secs > autoscaling_policy.scaledown_delay_secs: + logging.info("Scaling down since the time difference: {}secs, " + "is above the delay period: {} secs.".format( + diff_secs, autoscaling_policy.scaledown_delay_secs)) # At this point, we will perform the scaling down operation, hence # we need to delete the previously stored scaling down timestamp (if any). - self.fedml_model_cache.delete_endpoint_scaling_down_decision_time(endpoint_id) + self.clean_up_scaling_down_operation_state(endpoint_id) scale_op = ScaleOp.DOWN_IN_OP else: # Record the timestamp of the scaling down operation. @@ -268,7 +311,8 @@ def enforce_scaling_down_delay_interval(self, return scale_op def clean_up_scaling_down_operation_state(self, endpoint_id) -> bool: - # We return True if the clean up operation succeeded, else False. + # We return True if the cleaning up operation succeeded, else False. + logging.info("Cleaning up scale down state from Redis.") to_clean_up = \ self.fedml_model_cache.exists_endpoint_scaling_down_decision_time(endpoint_id) if to_clean_up: @@ -293,35 +337,40 @@ def scale_operation_endpoint(self, 0: do nothing """ - # Fetch most recent metric record from the database. + # Fetch all metrics record from the database. metrics = self.fedml_model_cache.get_endpoint_metrics( - endpoint_id=endpoint_id) + end_point_id=endpoint_id) # Default to nothing. scale_op = ScaleOp.NO_OP if not metrics: # If no metric exists then no scaling operation. + logging.info("No existing metric, so no scaling operation.") return scale_op - # If we continue here, then it means that there was at least one request. - # The `most_recent_metric` is of type list, hence we need to access index 0. 
- most_recent_metric = metrics[-1] - latest_request_timestamp_micro_secs = most_recent_metric["timestamp"] - # The time module does not have a micro-second function built-in, so we need to - # divide nanoseconds by 1e3 and convert to micro-seconds. - current_time_micro_seconds = time.time_ns() / 1e3 - # compute elapsed time and convert to seconds - elapsed_time_secs = \ - (current_time_micro_seconds - latest_request_timestamp_micro_secs) / 1e6 - if elapsed_time_secs > autoscaling_policy.release_replica_after_idle_secs: + if autoscaling_policy.release_replica_after_idle_secs: + # At this point it means that there was at least one request. The + # `most_recent_metric` is of type list, hence we need to access index 0. + most_recent_metric = metrics[-1] + latest_request_timestamp_micro_secs = most_recent_metric["timestamp"] + # The time module does not have a micro-second function built-in, + # so we need to divide nanoseconds by 1e3 and convert to micro-seconds. + current_time_micro_seconds = time.time_ns() / 1e3 + # Compute the elapsed time and convert to seconds. + elapsed_time_secs = \ + (current_time_micro_seconds - latest_request_timestamp_micro_secs) / 1e6 # If the elapsed time is greater than the requested idle time, # in other words there was no incoming request then scale down. - scale_op = ScaleOp.DOWN_IN_OP + if elapsed_time_secs > autoscaling_policy.release_replica_after_idle_secs: + logging.info("Endpoint remained idle for {} seconds, need to scale down.".format( + elapsed_time_secs)) + scale_op = ScaleOp.DOWN_IN_OP else: - # Otherwise, it means there was a request within the elapsed time, then: + # Otherwise, it means there was a request within the elapsed time, then, + # Check if the current number of running replicas is 0 it means + # we need more resources, hence we need to scale up: ScaleOp.UP_OUT_OP. if autoscaling_policy.current_replicas == 0: - # Check if the current number of running replicas is 0, - # then we need more resources, hence ScaleOp.UP_OUT_OP. + logging.info("Incoming requests but with 0 replicas, scaling up.") scale_op = ScaleOp.UP_OUT_OP else: # Else, trigger the autoscaling policy with all existing values. 
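
For readers skimming the autoscaler hunks above, here is a minimal, self-contained sketch of the look-back windowing idea that replaces `DataFrame.last()`. It is illustrative only, not the repository's code: it assumes the metrics frame carries a datetime64 `timestamp` column and uses plain boolean indexing instead of `DataFrame.query`, but it reproduces the convention the updated policies rely on, namely that an empty window (no recent requests) now maps to a scale-down decision rather than a no-op.

```python
# Illustrative sketch only (assumed column name "timestamp", dtype datetime64);
# mirrors the look-back filtering introduced by Autoscaler.filter_by_timestamp.
import pandas as pd


def filter_by_timestamp(metrics: pd.DataFrame,
                        before_now_minutes: int = None,
                        before_now_seconds: int = None) -> pd.DataFrame:
    # Default: return every record. Otherwise keep only rows whose timestamp
    # falls inside the requested look-back window ending "now" (UTC, tz-naive).
    filtered = metrics
    if before_now_minutes:
        cutoff = pd.Timestamp.utcnow().replace(tzinfo=None) - pd.Timedelta(minutes=before_now_minutes)
        filtered = metrics[metrics["timestamp"] >= cutoff]
    if before_now_seconds:
        cutoff = pd.Timestamp.utcnow().replace(tzinfo=None) - pd.Timedelta(seconds=before_now_seconds)
        filtered = metrics[metrics["timestamp"] >= cutoff]
    return filtered


if __name__ == "__main__":
    now = pd.Timestamp.utcnow().replace(tzinfo=None)
    metrics = pd.DataFrame({
        "timestamp": [now - pd.Timedelta(minutes=30), now - pd.Timedelta(minutes=5)],
        "current_latency": [120.0, 80.0],  # milliseconds, illustrative values
    })

    # 15-minute window: only the 5-minute-old record survives.
    print(filter_by_timestamp(metrics, before_now_minutes=15))

    # 60-second window: empty frame, i.e. no recent traffic -> the policies
    # above would return ScaleOp.DOWN_IN_OP instead of the old ScaleOp.NO_OP.
    print(filter_by_timestamp(metrics, before_now_seconds=60).empty)
```

The same reading applies to the idle-release change in `scale_operation_endpoint` above: with `release_replica_after_idle_secs` now defaulting to `None` (see the policies diff that follows), the idle-timeout scale-down branch only runs when a policy explicitly sets that field.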
diff --git a/python/fedml/computing/scheduler/model_scheduler/autoscaler/policies.py b/python/fedml/computing/scheduler/model_scheduler/autoscaler/policies.py index 546817ec82..0ad2cc0d13 100644 --- a/python/fedml/computing/scheduler/model_scheduler/autoscaler/policies.py +++ b/python/fedml/computing/scheduler/model_scheduler/autoscaler/policies.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel, field_validator, NonNegativeInt, NonNegativeFloat +from pydantic import BaseModel, NonNegativeInt, NonNegativeFloat, validator class AutoscalingPolicy(BaseModel): @@ -22,7 +22,7 @@ class AutoscalingPolicy(BaseModel): min_replicas: NonNegativeInt max_replicas: NonNegativeInt previous_triggering_value: float = None - release_replica_after_idle_secs: NonNegativeInt = 300 # default is after 5 minutes + release_replica_after_idle_secs: NonNegativeInt = None scaledown_delay_secs: NonNegativeInt = 60 # default is 1 minute scaleup_cost_secs: NonNegativeInt = 300 # default is 5 minutes @@ -70,9 +70,10 @@ class EWMPolicy(AutoscalingPolicy): ub_threshold: NonNegativeFloat # recommended value: 0.5 lb_threshold: NonNegativeFloat # recommended value: 0.5 - @field_validator("metric") - def validate_option(cls, v): - assert v in ["ewm_latency", "ewm_qps"] + @validator("metric") + def metric_match(cls, v) -> str: + if v not in ["ewm_latency", "ewm_qps"]: + raise ValueError("Wrong metric name.") return v diff --git a/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/autoscaler_test.py b/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/autoscaler_test.py index 7af1022c7d..eadc2dc9a9 100644 --- a/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/autoscaler_test.py +++ b/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/autoscaler_test.py @@ -111,21 +111,21 @@ def test_validate_scaling_bounds(self): # Validate scale up. scale_up = autoscaler.validate_scaling_bounds(ScaleOp.UP_OUT_OP, autoscaling_policy) - self.assertEquals(scale_up, ScaleOp.UP_OUT_OP) + self.assertEqual(scale_up, ScaleOp.UP_OUT_OP) # Validate scale down. scale_down = autoscaler.validate_scaling_bounds(ScaleOp.DOWN_IN_OP, autoscaling_policy) - self.assertEquals(scale_down, ScaleOp.DOWN_IN_OP) + self.assertEqual(scale_down, ScaleOp.DOWN_IN_OP) # Validate max out-of-bounds. autoscaling_policy.current_replicas = 3 scale_oob_max = autoscaler.validate_scaling_bounds(ScaleOp.UP_OUT_OP, autoscaling_policy) - self.assertEquals(scale_oob_max, ScaleOp.NO_OP) + self.assertEqual(scale_oob_max, ScaleOp.NO_OP) # Validate min out-of-bounds. 
autoscaling_policy.current_replicas = 1 scale_oob_min = autoscaler.validate_scaling_bounds(ScaleOp.DOWN_IN_OP, autoscaling_policy) - self.assertEquals(scale_oob_min, ScaleOp.NO_OP) + self.assertEqual(scale_oob_min, ScaleOp.NO_OP) def test_enforce_scaling_down_delay_interval(self): self.populate_redis_with_dummy_metrics() @@ -140,15 +140,15 @@ def test_enforce_scaling_down_delay_interval(self): autoscaling_policy.scaledown_delay_secs = 0.0 scale_down = autoscaler.enforce_scaling_down_delay_interval(ENV_ENDPOINT_ID_1, autoscaling_policy) - self.assertEquals(scale_down, ScaleOp.DOWN_IN_OP) + self.assertEqual(scale_down, ScaleOp.DOWN_IN_OP) autoscaling_policy.scaledown_delay_secs = 1 scale_noop = autoscaler.enforce_scaling_down_delay_interval(ENV_ENDPOINT_ID_1, autoscaling_policy) - self.assertEquals(scale_noop, ScaleOp.NO_OP) + self.assertEqual(scale_noop, ScaleOp.NO_OP) time.sleep(2) scale_down = autoscaler.enforce_scaling_down_delay_interval(ENV_ENDPOINT_ID_1, autoscaling_policy) - self.assertEquals(scale_down, ScaleOp.DOWN_IN_OP) + self.assertEqual(scale_down, ScaleOp.DOWN_IN_OP) self.clear_redis() diff --git a/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/scaling_algorithm_real_test.py b/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/scaling_algorithm_real_test.py index 34721d9002..78a1231abf 100644 --- a/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/scaling_algorithm_real_test.py +++ b/python/fedml/computing/scheduler/model_scheduler/autoscaler/test/scaling_algorithm_real_test.py @@ -2,9 +2,10 @@ import logging from collections import namedtuple -from fedml.computing.scheduler.model_scheduler.autoscaler.autoscaler import Autoscaler, ReactivePolicy +from fedml.computing.scheduler.model_scheduler.autoscaler.autoscaler import Autoscaler from fedml.core.mlops.mlops_runtime_log import MLOpsRuntimeLog from fedml.computing.scheduler.model_scheduler.device_model_cache import FedMLModelCache +from fedml.computing.scheduler.model_scheduler.autoscaler.policies import ConcurrentQueryPolicy if __name__ == "__main__": @@ -18,9 +19,6 @@ parser.add_argument('--redis_addr', default="local") parser.add_argument('--redis_port', default=6379) parser.add_argument('--redis_password', default="fedml_default") - parser.add_argument('--metric', - default="latency", - help="Either latency or qps") args = parser.parse_args() fedml_model_cache = FedMLModelCache.get_instance() @@ -32,50 +30,19 @@ # Init the autoscaler autoscaler = Autoscaler(args.redis_addr, args.redis_port, args.redis_password) - latency_reactive_policy_default = { - "metric": "latency", - "ewm_mins": 15, - "ewm_alpha": 0.5, - "ub_threshold": 0.5, - "lb_threshold": 0.99, - "triggering_value": 1.6561916828471053 + autoscaling_policy_config = { + "current_replicas": 1, + "min_replicas": 1, + "max_replicas": 3, + "queries_per_replica": 2, + "window_size_secs": 60, + "scaledown_delay_secs": 120, } - qps_reactive_policy_default = { - "metric": "qps", - "ewm_mins": 15, - "ewm_alpha": 0.5, - "ub_threshold": 2, - "lb_threshold": 0.5 - } - policy_config = latency_reactive_policy_default \ - if args.metric == "latency" else qps_reactive_policy_default - autoscaling_policy = ReactivePolicy(**policy_config) - - for endpoint_settings in endpoints_settings_list: - endpoint_state = endpoint_settings["state"] - if endpoint_state == "DEPLOYED" and endpoint_settings["enable_auto_scaling"]: - - e_id, e_name, model_name = \ - endpoint_settings["endpoint_id"], \ - endpoint_settings["endpoint_name"], \ - 
endpoint_settings["model_name"] - logging.info(f"Querying the autoscaler for endpoint {e_id} with user settings {endpoint_settings}.") - - # For every endpoint we just update the policy configuration. - autoscaling_policy.min_replicas = endpoint_settings["scale_min"] - autoscaling_policy.max_replicas = endpoint_settings["scale_max"] - # We retrieve a list of replicas for every endpoint. The number - # of running replicas is the length of that list. - current_replicas = len(fedml_model_cache.get_endpoint_replicas_results(e_id)) - autoscaling_policy.current_replicas = current_replicas - logging.info(f"Endpoint {e_id} autoscaling policy: {autoscaling_policy}.") - - scale_op = autoscaler.scale_operation_endpoint( - autoscaling_policy, - str(e_id)) - - new_replicas = current_replicas + scale_op.value - - logging.info(f"Scaling operation {scale_op.value} for endpoint {e_id} .") - logging.info(f"New Replicas {new_replicas} for endpoint {e_id} .") - logging.info(f"Current Replicas {current_replicas} for endpoint {e_id} .") + autoscaling_policy = ConcurrentQueryPolicy(**autoscaling_policy_config) + + # Please replace the `e_id` below with a proper e_id value. + e_id = 1111 + scale_op = autoscaler.scale_operation_endpoint( + autoscaling_policy, + str(e_id)) + logging.info(f"Scaling operation {scale_op.value} for endpoint {e_id} .") diff --git a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py index 915690e9a4..fbe7a95ab9 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_client_constants.py @@ -74,6 +74,7 @@ class ClientConstants(object): K8S_DEPLOYMENT_SLAVE_MOUNT_HOME_DIR = "/home/fedml/fedml-client" LOCAL_CLIENT_API_PORT = 22030 + ENV_CLIENT_PROXY_PORT_KEY = "FEDML_WORKER_INFERENCE_PROXY_PORT" INFERENCE_HTTP_PORT = 8000 INFERENCE_GRPC_PORT = 8001 @@ -95,6 +96,13 @@ class ClientConstants(object): INFERENCE_ENGINE_TYPE_INT_DEFAULT = 2 INFERENCE_MODEL_VERSION = "1" INFERENCE_INFERENCE_SERVER_VERSION = "v2" + INFERENCE_REQUEST_TIMEOUT = 30 + + ENV_CONNECTION_TYPE_KEY = "FEDML_CONNECTION_TYPE" + WORKER_CONNECTIVITY_TYPE_HTTP = "http" + WORKER_CONNECTIVITY_TYPE_HTTP_PROXY = "http_proxy" + WORKER_CONNECTIVITY_TYPE_MQTT = "mqtt" + WORKER_CONNECTIVITY_TYPE_DEFAULT = WORKER_CONNECTIVITY_TYPE_HTTP MSG_MODELOPS_DEPLOYMENT_STATUS_INITIALIZING = "INITIALIZING" MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYING = "DEPLOYING" @@ -135,6 +143,9 @@ class ClientConstants(object): DEVICE_DIFF_DELETE_OPERATION = "op: delete" DEVICE_DIFF_REPLACE_OPERATION = "op: replace" + READINESS_PROBE_DEFAULT = "DEFAULT" + LIVENESS_PROBE_DEFAULT = "DEFAULT" + LOGIN_MODE_ON_PREMISE_INDEX = 0 LOGIN_MODE_FEDML_CLOUD_INDEX = 1 LOGIN_MODE_PUBLIC_CLOUD_INDEX = 2 @@ -143,20 +154,20 @@ class ClientConstants(object): MODEL_DATA_TYPE_INT = "int" MODEL_DATA_TYPE_FLOAT = "float" MODEL_DATA_TYPE_STR = "str" - MODEL_DATA_TYPE_MAPPING = {"TYPE_BOOL": MODEL_DATA_TYPE_INT, "TYPE_UINT8": MODEL_DATA_TYPE_INT, - "TYPE_UINT16": MODEL_DATA_TYPE_INT, "TYPE_UINT32": MODEL_DATA_TYPE_INT, - "TYPE_UINT64": MODEL_DATA_TYPE_INT, "TYPE_INT8": MODEL_DATA_TYPE_INT, - "TYPE_INT16": MODEL_DATA_TYPE_INT, "TYPE_INT32": MODEL_DATA_TYPE_INT, - "TYPE_INT64": MODEL_DATA_TYPE_INT, "TYPE_FP16": MODEL_DATA_TYPE_FLOAT, - "TYPE_FP32": MODEL_DATA_TYPE_FLOAT, "TYPE_FP64": MODEL_DATA_TYPE_FLOAT, - "TYPE_STRING": MODEL_DATA_TYPE_STR, "TYPE_BF16": MODEL_DATA_TYPE_INT, - "BOOL": 
MODEL_DATA_TYPE_INT, "UINT8": MODEL_DATA_TYPE_INT, - "UINT16": MODEL_DATA_TYPE_INT, "UINT32": MODEL_DATA_TYPE_INT, - "UINT64": MODEL_DATA_TYPE_INT, "INT8": MODEL_DATA_TYPE_INT, - "INT16": MODEL_DATA_TYPE_INT, "INT32": MODEL_DATA_TYPE_INT, - "INT64": MODEL_DATA_TYPE_INT, "FP16": MODEL_DATA_TYPE_FLOAT, - "FP32": MODEL_DATA_TYPE_FLOAT, "FP64": MODEL_DATA_TYPE_FLOAT, - "STRING": MODEL_DATA_TYPE_STR, "BF16": MODEL_DATA_TYPE_INT} + + # Model config yaml related + DEPLOY_TIMEOUT_SEC_KEY = "deploy_timeout_sec" + DEPLOY_TIMEOUT_SEC_DEFAULT = 600 + + EXPOSE_SUBDOMAINS_KEY = "expose_subdomains" + + CUSTOMIZED_VOLUMES_MOUNT_KEY = "volumes" + + CUSTOMIZED_WORKSPACE_MOUNT_PATH_KEY = "workspace_mount_path" + + CUSTOMIZED_SERVICE_KEY = "service" + + ENV_USER_ENCRYPTED_API_KEY = "FEDML_USER_ENCRYPTED_API_KEY" @staticmethod def get_fedml_home_dir(): @@ -274,6 +285,13 @@ def get_model_serving_dir(): os.makedirs(model_file_dir, exist_ok=True) return model_file_dir + @staticmethod + def get_deploy_failed_log_dir(): + model_file_dir = os.path.join(ClientConstants.get_fedml_home_dir(), "fedml", "logs", "failed_logs") + if not os.path.exists(model_file_dir): + os.makedirs(model_file_dir, exist_ok=True) + return model_file_dir + @staticmethod def get_model_infer_data_dir(): model_infer_data_dir = os.path.join(ClientConstants.get_fedml_home_dir(), "fedml", "models_infer_data") @@ -443,6 +461,14 @@ def get_public_ip(): logging.info("Failed to get public ip: {}".format(e)) return ip + @staticmethod + def get_inference_worker_proxy_port() -> int: + # Use dotenv to load the environment variables + fedml.load_env() + worker_proxy_port = int(os.getenv(ClientConstants.ENV_CLIENT_PROXY_PORT_KEY, + default=ClientConstants.LOCAL_CLIENT_API_PORT)) + return worker_proxy_port + @staticmethod def check_process_is_running(process_id): for proc in psutil.process_iter(): diff --git a/python/fedml/computing/scheduler/model_scheduler/device_client_runner.py b/python/fedml/computing/scheduler/model_scheduler/device_client_runner.py deleted file mode 100755 index 8bb03eebbd..0000000000 --- a/python/fedml/computing/scheduler/model_scheduler/device_client_runner.py +++ /dev/null @@ -1,1483 +0,0 @@ -import json -import logging -import multiprocessing -import sys - -from multiprocessing import Process -import os -import platform -import shutil -import subprocess -import threading - -import time -import traceback -import urllib -import uuid -import zipfile -from urllib.parse import urlparse, urljoin - -import requests - -import yaml - -import fedml -from fedml import mlops -from fedml.computing.scheduler.model_scheduler.device_model_msg_object import FedMLModelMsgObject -from fedml.computing.scheduler.scheduler_core.compute_cache_manager import ComputeCacheManager - -from fedml.computing.scheduler.scheduler_core.compute_utils import ComputeUtils -from fedml.core.distributed.communication.s3.remote_storage import S3Storage -from .device_model_cache import FedMLModelCache -from ..comm_utils import sys_utils, security_utils - -from ..comm_utils.container_utils import ContainerUtils - -from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog - -from ....core.distributed.communication.mqtt.mqtt_manager import MqttManager -from ..comm_utils.yaml_utils import load_yaml_config -from .device_client_constants import ClientConstants - -from ....core.mlops.mlops_metrics import MLOpsMetrics - -from ....core.mlops.mlops_configs import MLOpsConfigs -from ....core.mlops.mlops_runtime_log_daemon import MLOpsRuntimeLogDaemon -from 
....core.mlops.mlops_status import MLOpsStatus -from ..comm_utils.sys_utils import get_sys_runner_info, get_python_program -from .device_model_deployment import start_deployment, run_http_inference_with_curl_request -from .device_client_data_interface import FedMLClientDataInterface -from ....core.mlops.mlops_utils import MLOpsUtils -from ..comm_utils.job_utils import JobRunnerUtils -from fedml.computing.scheduler.comm_utils.run_process_utils import RunProcessUtils -from .device_mqtt_inference_protocol import FedMLMqttInference -from .device_model_db import FedMLModelDatabase -from ..comm_utils.constants import SchedulerConstants -from fedml.computing.scheduler.comm_utils.job_monitor import JobMonitor - -from .device_replica_handler import FedMLDeviceReplicaHandler - -from fedml.computing.scheduler.scheduler_core.endpoint_sync_protocol import FedMLEndpointSyncProtocol -import ssl - - -class RunnerError(Exception): - """ Runner failed. """ - pass - - -class RunnerCompletedError(Exception): - """ Runner completed. """ - pass - - -class FedMLClientRunner: - FEDML_BOOTSTRAP_RUN_OK = "[FedML]Bootstrap Finished" - - def __init__(self, args, edge_id=0, request_json=None, agent_config=None, run_id=0): - self.local_api_process = None - self.run_process_event = None - self.run_process_event_map = dict() - self.run_process_completed_event = None - self.run_process_completed_event_map = dict() - self.run_inference_event_map = dict() - self.run_inference_response_map = dict() - self.run_process_map = dict() - self.device_status = None - self.current_training_status = None - self.mqtt_mgr = None - self.client_mqtt_mgr = None - self.client_mqtt_is_connected = False - self.client_mqtt_lock = None - self.edge_id = edge_id - self.run_id = run_id - self.unique_device_id = None - self.args = args - self.request_json = request_json - self.version = args.version - self.device_id = args.device_id - self.cur_dir = os.path.split(os.path.realpath(__file__))[0] - if args.current_running_dir is not None: - self.cur_dir = args.current_running_dir - self.sudo_cmd = "" - self.is_mac = False - if platform.system() == "Darwin": - self.is_mac = True - - self.agent_config = agent_config - self.fedml_data_base_package_dir = os.path.join("/", "fedml", "data") - self.fedml_data_local_package_dir = os.path.join("/", "fedml", "fedml-package", "fedml", "data") - self.fedml_data_dir = self.fedml_data_base_package_dir - self.fedml_config_dir = os.path.join("/", "fedml", "conf") - - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES = {} - - self.mlops_metrics = None - self.client_active_list = dict() - self.infer_host = "127.0.0.1" - self.redis_addr = "local" - self.redis_port = "6379" - self.redis_password = "fedml_default" - - self.model_runner_mapping = dict() - self.ntp_offset = MLOpsUtils.get_ntp_offset() - self.running_request_json = dict() - self.endpoint_inference_runners = dict() - self.mqtt_inference_obj = None - - self.subscribed_topics = list() - self.user_name = None - - self.replica_handler = None - - def unzip_file(self, zip_file, unzip_file_path) -> str: - unziped_file_name = "" - if zipfile.is_zipfile(zip_file): - with zipfile.ZipFile(zip_file, "r") as zipf: - zipf.extractall(unzip_file_path) - unziped_file_name = zipf.namelist()[0] - else: - raise Exception("Invalid zip file {}".format(zip_file)) - - return unziped_file_name - - def retrieve_and_unzip_package(self, package_name, package_url): - """ - Download the package from the url and unzip it to the local package directory - 
~/.fedml/fedml-model-client/fedml/model_packages/${end_point_id}_${end_point_name}_${model_name}_${model_version} - Under this folder, there should be the zipped file and the unzipped folder. - the zipped file starts with fedml_run_${end_point_id}_${end_point_name}_${model_name}_${model_version} - """ - # Models root directory - local_package_path = ClientConstants.get_model_package_dir() - os.makedirs(local_package_path, exist_ok=True) - - # Specify this model directory using ${end_point_id}_${end_point_name}_${model_name}_${model_version} - run_id = self.request_json["end_point_id"] - end_point_name = self.request_json["end_point_name"] - model_config = self.request_json["model_config"] - model_name = model_config["model_name"] - model_version = model_config["model_version"] - - model_version = model_version.replace(" ", "-") # Avoid using space for folder name - model_version = model_version.replace(":", "-") # Since docker mount will conflict with ":" - - this_run_model_dir = f"{run_id}_{end_point_name}_{model_name}_{model_version}" - this_run_model_full_path = os.path.join(local_package_path, this_run_model_dir) - os.makedirs(this_run_model_full_path, exist_ok=True) - - # Download the zipped package, overwrite it even if it exists - filename, filename_without_extension, file_extension = ClientConstants.get_filename_and_extension(package_url) - local_package_file = os.path.join(this_run_model_full_path, - f"fedml_run_{self.run_id}_{self.edge_id}_{filename_without_extension}") - if os.path.exists(local_package_file): - os.remove(local_package_file) - logging.info("Download from package_url {}".format(package_url)) - ssl._create_default_https_context = ssl._create_unverified_context - urllib.request.urlretrieve(package_url, local_package_file, - reporthook=self.package_download_progress) - - # Unzip the package in the same folder, overwrite the unzipped folder even if it exists - unzip_package_path = os.path.join(this_run_model_full_path, - f"unzip_fedml_run_{self.run_id}_{self.edge_id}_{filename_without_extension}") - try: - shutil.rmtree(unzip_package_path, ignore_errors=True) - except Exception as e: - pass - package_dir_name = self.unzip_file(local_package_file, unzip_package_path) - unzip_package_full_path = os.path.join(unzip_package_path, package_dir_name) - model_bin_file = os.path.join(unzip_package_path, "fedml_model.bin") # Will deprecated - logging.info("local_package_file {}, unzip_package_path {}, unzip file full path {}".format( - local_package_file, unzip_package_path, unzip_package_full_path)) - - return unzip_package_full_path, model_bin_file - - def retrieve_binary_model_file(self, package_name, package_url): - local_package_path = ClientConstants.get_model_package_dir() - if not os.path.exists(local_package_path): - os.makedirs(local_package_path, exist_ok=True) - unzip_package_path = ClientConstants.get_model_dir() - local_package_file = "{}".format(os.path.join(local_package_path, package_name)) - if os.path.exists(local_package_file): - os.remove(local_package_file) - urllib.request.urlretrieve(package_url, local_package_file, - reporthook=self.package_download_progress) - - unzip_package_path = os.path.join(unzip_package_path, package_name) - if not os.path.exists(unzip_package_path): - os.makedirs(unzip_package_path, exist_ok=True) - dst_model_file = os.path.join(unzip_package_path, package_name) - if os.path.exists(local_package_file): - shutil.copy(local_package_file, dst_model_file) - - return unzip_package_path, dst_model_file - - def 
package_download_progress(self, count, blksize, filesize): - self.check_runner_stop_event() - - downloaded = count * blksize - downloaded = filesize if downloaded > filesize else downloaded - progress = (downloaded / filesize * 100) if filesize != 0 else 0 - progress_int = int(progress) - downloaded_kb = format(downloaded / 1024, '.2f') - - # since this hook function is stateless, we need a state to avoid printing progress repeatedly - if count == 0: - self.prev_download_progress = 0 - if progress_int != self.prev_download_progress and progress_int % 5 == 0: - self.prev_download_progress = progress_int - logging.info("package downloaded size {} KB, progress {}%".format(downloaded_kb, progress_int)) - - def build_dynamic_constrain_variables(self, run_id, run_config): - pass - - def update_local_fedml_config(self, run_id, model_config, model_config_parameters): - model_name = model_config["model_name"] - model_storage_url = model_config["model_storage_url"] - - # Retrieve model package or model binary file. - unzip_package_path, model_bin_file = self.retrieve_and_unzip_package(model_name, model_storage_url) - - # Load the config to memory - fedml_local_config_file = os.path.join(unzip_package_path, "fedml_model_config.yaml") - - # Inject the config from UI to pkg yaml - package_conf_object = model_config_parameters - - # Save the config to local - with open(fedml_local_config_file, "w") as f: - yaml.dump(package_conf_object, f) - - logging.info("The package_conf_object is {}".format(package_conf_object)) - - return unzip_package_path, model_bin_file, package_conf_object - - def build_dynamic_args(self, run_config, package_conf_object, base_dir): - pass - - def download_model_package(self, package_name, package_url): - # Copy config file from the client - unzip_package_path = self.retrieve_and_unzip_package( - package_name, package_url - ) - - return unzip_package_path - - def run(self, process_event, completed_event): - # print(f"Model worker runner process id {os.getpid()}, run id {self.run_id}") - - if platform.system() != "Windows": - os.setsid() - - os.environ['PYTHONWARNINGS'] = 'ignore:semaphore_tracker:UserWarning' - os.environ.setdefault('PYTHONWARNINGS', 'ignore:semaphore_tracker:UserWarning') - - self.run_process_event = process_event - self.run_process_completed_event = completed_event - run_id = self.request_json.get("end_point_id") - - try: - FedMLModelDatabase.get_instance().set_database_base_dir(ClientConstants.get_database_dir()) - FedMLModelDatabase.get_instance().create_table() - - MLOpsUtils.set_ntp_offset(self.ntp_offset) - self.setup_client_mqtt_mgr() - - if not self.run_impl(): - logging.info( - f"[endpoint/device][{run_id}/{self.edge_id}] " - f"Failed to run the model deployment. 
run_impl return False.") - - # This if condition only happens when run_impl return False in a controllable way - # Under this condition, the run_impl itself should have handled the cleanup - # So no need to self.release_gpu_ids(run_id) - except RunnerError: - logging.error( - f"[endpoint/device][{run_id}/{self.edge_id}] " - f"Failed due to RunnerError {traceback.format_exc()}") - self.release_gpu_ids(run_id) - - self.reset_devices_status(self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED) - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id) - except RunnerCompletedError: - logging.error( - f"[endpoint/device][{run_id}/{self.edge_id}] " - f"Failed due to RunnerCompletedError {traceback.format_exc()}") - self.release_gpu_ids(run_id) - - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id) - except Exception as e: - logging.error( - f"[endpoint/device][{run_id}/{self.edge_id}] " - f"Failed due to exception {traceback.format_exc()}") - - self.cleanup_run_when_starting_failed() - self.mlops_metrics.client_send_exit_train_msg( - run_id, self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED) - - self.release_gpu_ids(run_id) - - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id) - time.sleep(2) - sys.exit(1) - finally: - logging.info("[Worker] Release resources after deployment.") - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id) - if self.mlops_metrics is not None: - self.mlops_metrics.stop_sys_perf() - time.sleep(3) - self.release_client_mqtt_mgr() - - def release_gpu_ids(self, run_id): - JobRunnerUtils.get_instance().release_gpu_ids(run_id, self.edge_id) - - def check_runner_stop_event(self): - if self.run_process_event.is_set(): - logging.info("Received stopping event.") - raise RunnerError("Runner stopped") - - if self.run_process_completed_event is not None and self.run_process_completed_event.is_set(): - logging.info("Received completed event.") - raise RunnerCompletedError("Runner completed") - - def run_impl(self): - # Get deployment params - run_id = self.request_json["end_point_id"] - end_point_name = self.request_json["end_point_name"] - device_ids = self.request_json["device_ids"] - master_ip = self.request_json["master_node_ip"] - model_config = self.request_json["model_config"] - model_name = model_config["model_name"] - model_id = model_config["model_id"] - model_version = model_config["model_version"] - model_config_parameters = self.request_json["parameters"] - inference_port = model_config_parameters.get("worker_internal_port", - ClientConstants.MODEL_INFERENCE_DEFAULT_PORT) - inference_port_external = model_config_parameters.get("worker_external_port", inference_port) - inference_engine = model_config_parameters.get("inference_engine", - ClientConstants.INFERENCE_ENGINE_TYPE_INT_DEFAULT) - inference_end_point_id = run_id - - MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) - - logging.info(f"[Worker] Received model deployment request from master for endpoint {run_id}.") - if self.replica_handler is not None: - logging.info(f"=================Worker replica Handler ======================" - f"Reconcile with num diff {self.replica_handler.replica_num_diff} " - f"and version diff {self.replica_handler.replica_version_diff}." 
- f"=============================================================") - else: - logging.error(f"[Worker] Replica handler is None.") - return False - - self.check_runner_stop_event() - - # Report the deployment status to mlops - self.mlops_metrics.report_client_training_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_INITIALIZING, - is_from_model=True, running_json=json.dumps(self.request_json), run_id=run_id) - self.mlops_metrics.report_client_training_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_RUNNING, - is_from_model=True, run_id=run_id) - - self.check_runner_stop_event() - - # Reconcile the replica number (op: add, remove) - prev_rank, op, op_num = self.replica_handler.reconcile_num_replica() - - # Reconcile the replica version (op: update) - replica_rank_to_update = [] - if not op: - replica_rank_to_update, op = self.replica_handler.reconcile_replica_version() - - if not op: - logging.info("[Worker] No need to reconcile.") - return True - - logging.info( - f"================Worker Reconcile Operations ======================\n" - f" op: {op}; op num: {op_num}.\n" - f"==================================================================\n") - - # If not rollback, download package from MLOps; otherwise, use the backup package - if op != "rollback": - logging.info("Download and unzip model to local...") - unzip_package_path, _, _ = \ - self.update_local_fedml_config(run_id, model_config, model_config_parameters) - if unzip_package_path is None: - logging.info("Failed to update local fedml config.") - self.check_runner_stop_event() - self.cleanup_run_when_starting_failed() - self.mlops_metrics.client_send_exit_train_msg(run_id, self.edge_id, - ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED) - return False - - if not os.path.exists(unzip_package_path): - logging.info("Failed to unzip file.") - self.check_runner_stop_event() - self.cleanup_run_when_starting_failed() - self.mlops_metrics.client_send_exit_train_msg(run_id, self.edge_id, - ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED) - return False - else: - logging.info("Try to use backup package to rollback...") - # Find folder under "~/.fedml/fedml-model-client/fedml/model_packages \ - # /${end_point_id}_${end_point_name}_${model_name}_${model_version}" - backup_folder_full_path = None - models_root_dir = ClientConstants.get_model_package_dir() - - # Find the version (notified by master) to rollback - version_diff_dict = self.request_json["replica_version_diff"][str(self.edge_id)] - version_rollback_to = None - for replica_no, rollback_ops in version_diff_dict.items(): - version_rollback_to = rollback_ops["new_version"] # Note that new_version is the version to rollback - break - if version_rollback_to is None: - logging.error(f"No old version found for run_id: {self.run_id} " - f"edge_id: {self.edge_id}, rollback failed. 
No old version found in request_json.") - return False - model_version = version_rollback_to - - # Format the version to match the folder name - model_version_formatted = version_rollback_to.replace(" ", "-") - model_version_formatted = model_version_formatted.replace(":", "-") - - last_run_folder_sub_fd = f"{run_id}_{end_point_name}_{model_name}_{model_version_formatted}" - for folder in os.listdir(models_root_dir): - if last_run_folder_sub_fd in folder: - backup_folder_full_path = os.path.join(models_root_dir, folder) - break - if backup_folder_full_path is None: - logging.error(f"No backup folder found for run_id: {self.run_id} edge_id: {self.edge_id} " - f"under {models_root_dir} with sub folder {last_run_folder_sub_fd}, rollback failed.") - return False - - # Inside backup folder, find unzipped package with prefix unzip_fedml_run - unzip_package_path_parent = None - for folder in os.listdir(backup_folder_full_path): - if folder.startswith("unzip_fedml_run"): - unzip_package_path_parent = os.path.join(backup_folder_full_path, folder) - break - - # Inside unzip folder, find the unzipped package, should be the only one - unzip_package_path = None - for folder in os.listdir(unzip_package_path_parent): - if os.path.isdir(os.path.join(unzip_package_path_parent, folder)): - unzip_package_path = os.path.join(unzip_package_path_parent, folder) - break - - if unzip_package_path is None: - logging.error(f"No unzipped package found for run_id: {self.run_id} edge_id: {self.edge_id} " - f"under {backup_folder_full_path}, rollback failed.") - return False - - self.check_runner_stop_event() - - running_model_name, inference_output_url, inference_model_version, model_metadata, model_config = \ - "", "", model_version, {}, {} - - if op == "add": - worker_ip = self.get_ip_address(self.request_json) - for rank in range(prev_rank + 1, prev_rank + 1 + op_num): - try: - running_model_name, inference_output_url, inference_model_version, model_metadata, model_config = \ - start_deployment( - end_point_id=inference_end_point_id, end_point_name=end_point_name, model_id=model_id, - model_version=model_version, model_storage_local_path=unzip_package_path, - inference_model_name=model_name, inference_engine=inference_engine, - infer_host=worker_ip, master_ip=master_ip, edge_id=self.edge_id, - master_device_id=device_ids[0], replica_rank=rank, - gpu_per_replica=int(self.replica_handler.gpu_per_replica) - ) - except Exception as e: - inference_output_url = "" - logging.error(f"[Worker] Exception at deployment: {traceback.format_exc()}") - - if inference_output_url == "": - logging.error("[Worker] Failed to deploy the model.") - - # Release the gpu occupancy - FedMLModelCache.get_instance().set_redis_params() - replica_occupied_gpu_ids_str = FedMLModelCache.get_instance().get_replica_gpu_ids( - run_id, end_point_name, model_name, self.edge_id, rank + 1) - logging.info(f"Release gpu ids {replica_occupied_gpu_ids_str} for " - f"failed deployment of replica no {rank + 1}.") - - if replica_occupied_gpu_ids_str is not None: - replica_occupied_gpu_ids = json.loads(replica_occupied_gpu_ids_str) - JobRunnerUtils.get_instance().release_partial_job_gpu(run_id, - self.edge_id, replica_occupied_gpu_ids) - - # Send failed result back to master - result_payload = self.send_deployment_results( - end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED, - model_id, model_name, inference_output_url, inference_model_version, inference_port, - inference_engine, model_metadata, model_config) - - 
self.mlops_metrics.run_id = self.run_id - self.mlops_metrics.broadcast_client_training_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED, - is_from_model=True, run_id=self.run_id) - - self.mlops_metrics.client_send_exit_train_msg( - run_id, self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED) - - return False - else: - # Send failed successful result back to master - logging.info("Finished deployment, continue to send results to master...") - result_payload = self.send_deployment_results( - end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED, - model_id, model_name, inference_output_url, model_version, inference_port_external, - inference_engine, model_metadata, model_config, replica_no=rank + 1) - - if inference_port_external != inference_port: - # Save internal port to local db - logging.info("inference_port_external {} != inference_port {}".format( - inference_port_external, inference_port)) - result_payload = self.construct_deployment_results( - end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED, - model_id, model_name, inference_output_url, model_version, inference_port, - inference_engine, model_metadata, model_config, replica_no=rank + 1) - - FedMLModelDatabase.get_instance().set_deployment_result( - run_id, end_point_name, model_name, model_version, self.edge_id, - json.dumps(result_payload), replica_no=rank + 1) - - logging.info(f"Deploy replica {rank + 1} / {prev_rank + 1 + op_num} successfully.") - time.sleep(5) - - time.sleep(1) - self.mlops_metrics.run_id = self.run_id - self.mlops_metrics.broadcast_client_training_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED, - is_from_model=True, run_id=self.run_id) - return True - elif op == "remove": - for rank_to_delete in range(prev_rank, prev_rank - op_num, -1): - self.replica_handler.remove_replica(rank_to_delete) - - FedMLModelCache.get_instance().set_redis_params() - replica_occupied_gpu_ids_str = FedMLModelCache.get_instance().get_replica_gpu_ids( - run_id, end_point_name, model_name, self.edge_id, rank_to_delete + 1) - - replica_occupied_gpu_ids = json.loads(replica_occupied_gpu_ids_str) - - JobRunnerUtils.get_instance().release_partial_job_gpu(run_id, self.edge_id, replica_occupied_gpu_ids) - - FedMLModelDatabase.get_instance().delete_deployment_result_with_device_id_and_rank( - run_id, end_point_name, model_name, self.edge_id, rank_to_delete) - - # Report the deletion msg to master - result_payload = self.send_deployment_results( - end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DELETED, - model_id, model_name, inference_output_url, model_version, inference_port_external, - inference_engine, model_metadata, model_config, replica_no=rank_to_delete + 1) - - time.sleep(1) - self.mlops_metrics.run_id = self.run_id - self.mlops_metrics.broadcast_client_training_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED, - is_from_model=True, run_id=self.run_id) - - # TODO: If delete all replica, then delete the job and related resources - if rank_to_delete == 0: - pass - return True - elif op == "update" or op == "rollback": - # Update is combine of delete and add - worker_ip = self.get_ip_address(self.request_json) - for rank in replica_rank_to_update: - # Delete a replica (container) if exists - self.replica_handler.remove_replica(rank) - - FedMLModelCache.get_instance().set_redis_params() - replica_occupied_gpu_ids_str = 
FedMLModelCache.get_instance().get_replica_gpu_ids( - run_id, end_point_name, model_name, self.edge_id, rank + 1) - - replica_occupied_gpu_ids = json.loads(replica_occupied_gpu_ids_str) - logging.info(f"Release gpu ids {replica_occupied_gpu_ids} for update / rollback.") - - # TODO (Raphael) check if this will allow another job to seize the gpu during high concurrency: - try: - JobRunnerUtils.get_instance().release_partial_job_gpu( - run_id, self.edge_id, replica_occupied_gpu_ids) - except Exception as e: - if op == "rollback": - pass - else: - logging.error(f"Failed to release gpu ids {replica_occupied_gpu_ids} for update.") - return False - - # Delete the deployment result from local db - FedMLModelDatabase.get_instance().delete_deployment_result_with_device_id_and_rank( - run_id, end_point_name, model_name, self.edge_id, rank) - - logging.info(f"Delete replica with no {rank + 1} successfully.") - time.sleep(1) - - # Add a replica (container) - # TODO: Reduce the duplicated code - logging.info(f"Start to deploy the model with replica no {rank + 1} ...") - try: - running_model_name, inference_output_url, inference_model_version, model_metadata, model_config = \ - start_deployment( - end_point_id=inference_end_point_id, end_point_name=end_point_name, model_id=model_id, - model_version=model_version, model_storage_local_path=unzip_package_path, - inference_model_name=model_name, inference_engine=inference_engine, - infer_host=worker_ip, master_ip=master_ip, edge_id=self.edge_id, - master_device_id=device_ids[0], replica_rank=rank, - gpu_per_replica=int(self.replica_handler.gpu_per_replica) - ) - except Exception as e: - inference_output_url = "" - logging.error(f"Exception at deployment: {traceback.format_exc()}") - - if inference_output_url == "": - logging.error("Failed to deploy the model...") - - # If update failed, should release this replica's gpu - FedMLModelCache.get_instance().set_redis_params() - replica_occupied_gpu_ids_str = FedMLModelCache.get_instance().get_replica_gpu_ids( - run_id, end_point_name, model_name, self.edge_id, rank + 1) - - replica_occupied_gpu_ids = json.loads(replica_occupied_gpu_ids_str) - - JobRunnerUtils.get_instance().release_partial_job_gpu( - run_id, self.edge_id, replica_occupied_gpu_ids) - - result_payload = self.send_deployment_results( - end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED, - model_id, model_name, inference_output_url, inference_model_version, inference_port, - inference_engine, model_metadata, model_config) - - self.mlops_metrics.run_id = self.run_id - self.mlops_metrics.broadcast_client_training_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED, - is_from_model=True, run_id=self.run_id) - - self.mlops_metrics.client_send_exit_train_msg( - run_id, self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED) - - return False - else: - logging.info("Finished deployment, continue to send results to master...") - result_payload = self.send_deployment_results( - end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED, - model_id, model_name, inference_output_url, model_version, inference_port_external, - inference_engine, model_metadata, model_config, replica_no=rank + 1) - - if inference_port_external != inference_port: # Save internal port to local db - logging.info("inference_port_external {} != inference_port {}".format( - inference_port_external, inference_port)) - result_payload = self.construct_deployment_results( - end_point_name, self.edge_id, 
ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED, - model_id, model_name, inference_output_url, model_version, inference_port, - inference_engine, model_metadata, model_config, replica_no=rank + 1) - - FedMLModelDatabase.get_instance().set_deployment_result( - run_id, end_point_name, model_name, model_version, self.edge_id, - json.dumps(result_payload), replica_no=rank + 1) - - logging.info(f"Update replica with no {rank + 1} successfully. Op num {op_num}") - time.sleep(5) - time.sleep(1) - self.mlops_metrics.run_id = self.run_id - self.mlops_metrics.broadcast_client_training_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED, - is_from_model=True, run_id=self.run_id) - return True - - else: - # The delete op will be handled by callback_delete_deployment - logging.error(f"Unsupported op {op} with op num {op_num}") - return False - - def construct_deployment_results(self, end_point_name, device_id, model_status, - model_id, model_name, model_inference_url, - model_version, inference_port, inference_engine, - model_metadata, model_config, replica_no=1): - deployment_results_payload = {"end_point_id": self.run_id, "end_point_name": end_point_name, - "model_id": model_id, "model_name": model_name, - "model_url": model_inference_url, "model_version": model_version, - "port": inference_port, - "inference_engine": inference_engine, - "model_metadata": model_metadata, - "model_config": model_config, - "model_status": model_status, - "inference_port": inference_port, - "replica_no": replica_no, - } - return deployment_results_payload - - def construct_deployment_status(self, end_point_name, device_id, - model_id, model_name, model_version, - model_inference_url, model_status, - inference_port=ClientConstants.MODEL_INFERENCE_DEFAULT_PORT, - replica_no=1, # start from 1 - ): - deployment_status_payload = {"end_point_id": self.run_id, "end_point_name": end_point_name, - "device_id": device_id, - "model_id": model_id, "model_name": model_name, - "model_version": model_version, - "model_url": model_inference_url, "model_status": model_status, - "inference_port": inference_port, - "replica_no": replica_no, - } - return deployment_status_payload - - def send_deployment_results(self, end_point_name, device_id, model_status, - model_id, model_name, model_inference_url, - model_version, inference_port, inference_engine, - model_metadata, model_config, replica_no=1): - deployment_results_topic = "model_device/model_device/return_deployment_result/{}/{}".format( - self.run_id, device_id) - - deployment_results_payload = self.construct_deployment_results( - end_point_name, device_id, model_status, - model_id, model_name, model_inference_url, - model_version, inference_port, inference_engine, - model_metadata, model_config, replica_no=replica_no) - - logging.info("[client] send_deployment_results: topic {}, payload {}.".format(deployment_results_topic, - deployment_results_payload)) - self.client_mqtt_mgr.send_message_json(deployment_results_topic, json.dumps(deployment_results_payload)) - return deployment_results_payload - - def send_deployment_status(self, end_point_name, device_id, - model_id, model_name, model_version, - model_inference_url, model_status, - inference_port=ClientConstants.MODEL_INFERENCE_DEFAULT_PORT, - replica_no=1, # start from 1 - ): - # Deprecated - pass - - def reset_devices_status(self, edge_id, status): - self.mlops_metrics.run_id = self.run_id - self.mlops_metrics.edge_id = edge_id - self.mlops_metrics.broadcast_client_training_status( - edge_id, status, 
is_from_model=True, run_id=self.run_id) - - def cleanup_run_when_starting_failed(self): - logging.info("Cleanup run successfully when starting failed.") - - self.reset_devices_status(self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED) - - time.sleep(2) - - try: - self.mlops_metrics.stop_sys_perf() - except Exception as ex: - pass - - time.sleep(1) - - def cleanup_run_when_finished(self): - logging.info("Cleanup run successfully when finished.") - - self.reset_devices_status(self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED) - - time.sleep(2) - - try: - self.mlops_metrics.stop_sys_perf() - except Exception as ex: - pass - - time.sleep(1) - - def on_client_mqtt_disconnected(self, mqtt_client_object): - if self.client_mqtt_lock is None: - self.client_mqtt_lock = threading.Lock() - - self.client_mqtt_lock.acquire() - self.client_mqtt_is_connected = False - self.client_mqtt_lock.release() - - def on_client_mqtt_connected(self, mqtt_client_object): - if self.mlops_metrics is None: - self.mlops_metrics = MLOpsMetrics() - - self.mlops_metrics.set_messenger(self.client_mqtt_mgr) - self.mlops_metrics.run_id = self.run_id - - if self.client_mqtt_lock is None: - self.client_mqtt_lock = threading.Lock() - - self.client_mqtt_lock.acquire() - self.client_mqtt_is_connected = True - self.client_mqtt_lock.release() - - def setup_client_mqtt_mgr(self): - if self.client_mqtt_mgr is not None: - return - - if self.client_mqtt_lock is None: - self.client_mqtt_lock = threading.Lock() - - self.client_mqtt_mgr = MqttManager( - self.agent_config["mqtt_config"]["BROKER_HOST"], - self.agent_config["mqtt_config"]["BROKER_PORT"], - self.agent_config["mqtt_config"]["MQTT_USER"], - self.agent_config["mqtt_config"]["MQTT_PWD"], - self.agent_config["mqtt_config"]["MQTT_KEEPALIVE"], - "FedML_ModelClientAgent_Metrics_@{}@_{}_{}_{}".format(self.user_name, self.args.current_device_id, - str(os.getpid()), - str(uuid.uuid4())) - ) - - self.client_mqtt_mgr.add_connected_listener(self.on_client_mqtt_connected) - self.client_mqtt_mgr.add_disconnected_listener(self.on_client_mqtt_disconnected) - self.client_mqtt_mgr.connect() - self.client_mqtt_mgr.loop_start() - - if self.mlops_metrics is None: - self.mlops_metrics = MLOpsMetrics() - self.mlops_metrics.set_messenger(self.client_mqtt_mgr) - self.mlops_metrics.run_id = self.run_id - - def release_client_mqtt_mgr(self): - try: - if self.client_mqtt_mgr is not None: - self.client_mqtt_mgr.loop_stop() - self.client_mqtt_mgr.disconnect() - - self.client_mqtt_lock.acquire() - if self.client_mqtt_mgr is not None: - self.client_mqtt_is_connected = False - self.client_mqtt_mgr = None - self.client_mqtt_lock.release() - except Exception: - pass - - def ota_upgrade(self, payload, request_json): - run_id = request_json["end_point_id"] - force_ota = False - ota_version = None - - try: - parameters = request_json.get("parameters", None) - common_args = parameters.get("common_args", None) - force_ota = common_args.get("force_ota", False) - ota_version = common_args.get("ota_version", None) - except Exception as e: - pass - - if force_ota and ota_version is not None: - should_upgrade = True if ota_version != fedml.__version__ else False - upgrade_version = ota_version - else: - try: - fedml_is_latest_version, local_ver, remote_ver = sys_utils.check_fedml_is_latest_version(self.version) - except Exception as e: - return - - should_upgrade = False if fedml_is_latest_version else True - upgrade_version = remote_ver - - if should_upgrade: - FedMLClientDataInterface.get_instance(). 
\ - save_started_job(run_id, self.edge_id, time.time(), - ClientConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING, - ClientConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING, - payload) - - logging.info(f"Upgrade to version {upgrade_version} ...") - - sys_utils.do_upgrade(self.version, upgrade_version) - - raise Exception("Restarting after upgraded...") - - def callback_start_deployment(self, topic, payload): - # Get deployment params - request_json = json.loads(payload) - run_id = request_json["end_point_id"] - inference_end_point_id = run_id - - try: - MLOpsConfigs.fetch_all_configs() - except Exception as e: - pass - - # Start log processor for current run - run_id = inference_end_point_id - self.args.run_id = run_id - self.args.edge_id = self.edge_id - MLOpsRuntimeLog(args=self.args).init_logs() - MLOpsRuntimeLogDaemon.get_instance(self.args).set_log_source( - ClientConstants.FEDML_LOG_SOURCE_TYPE_MODEL_END_POINT) - MLOpsRuntimeLogDaemon.get_instance(self.args).start_log_processor(run_id, self.edge_id) - - # self.ota_upgrade(payload, request_json) - - # Start client with multiprocessing mode - request_json["run_id"] = run_id - run_id_str = str(run_id) - self.request_json = request_json - self.running_request_json[run_id_str] = request_json - client_runner = FedMLClientRunner( - self.args, edge_id=self.edge_id, request_json=request_json, agent_config=self.agent_config, run_id=run_id - ) - client_runner.infer_host = self.get_ip_address(request_json) - self.run_process_event_map[run_id_str] = multiprocessing.Event() - self.run_process_event_map[run_id_str].clear() - client_runner.run_process_event = self.run_process_event_map[run_id_str] - self.run_process_completed_event_map[run_id_str] = multiprocessing.Event() - self.run_process_completed_event_map[run_id_str].clear() - client_runner.run_process_completed_event = self.run_process_completed_event_map[run_id_str] - self.model_runner_mapping[run_id_str] = client_runner - - # Replica Handler will be init for every deployment - replica_handler = FedMLDeviceReplicaHandler(self.edge_id, self.request_json) - client_runner.replica_handler = replica_handler - - self.run_id = run_id - self.run_process_map[run_id_str] = Process(target=client_runner.run, args=( - self.run_process_event_map[run_id_str], self.run_process_completed_event_map[run_id_str] - )) - - self.run_process_map[run_id_str].start() - ClientConstants.save_run_process(run_id, self.run_process_map[run_id_str].pid) - ClientConstants.save_runner_infos(self.args.device_id + "." + self.args.os_name, self.edge_id, run_id=run_id) - - def set_runner_stopped_event(self, run_id): - run_id_str = str(run_id) - client_runner = self.model_runner_mapping.get(run_id_str, None) - if client_runner is not None: - if client_runner.run_process_event is not None: - client_runner.run_process_event.set() - self.model_runner_mapping.pop(run_id_str) - - def set_runner_completed_event(self, run_id): - run_id_str = str(run_id) - client_runner = self.model_runner_mapping.get(run_id_str, None) - if client_runner is not None: - if client_runner.run_process_completed_event is not None: - client_runner.run_process_completed_event.set() - self.model_runner_mapping.pop(run_id_str) - - def callback_delete_deployment(self, topic, payload): - logging.info("[Worker] callback_delete_deployment") - - # Parse payload as the model message object. 
- model_msg_object = FedMLModelMsgObject(topic, payload) - - # Delete all replicas on this device - try: - ClientConstants.remove_deployment( - model_msg_object.end_point_name, model_msg_object.model_name, model_msg_object.model_version, - model_msg_object.run_id, model_msg_object.model_id, edge_id=self.edge_id) - except Exception as e: - logging.info(f"Exception when removing deployment {traceback.format_exc()}") - pass - - self.set_runner_stopped_event(model_msg_object.run_id) - - logging.info(f"[endpoint/device][{model_msg_object.run_id}/{self.edge_id}] " - f"Release gpu resource when the worker deployment deleted.") - JobRunnerUtils.get_instance().release_gpu_ids(model_msg_object.run_id, self.edge_id) - - if self.running_request_json.get(str(model_msg_object.run_id)) is not None: - try: - self.running_request_json.pop(str(model_msg_object.run_id)) - except Exception as e: - logging.error(f"Error when removing running_request_json: {traceback.format_exc()}") - pass - - FedMLClientDataInterface.get_instance().delete_job_from_db(model_msg_object.run_id) - FedMLModelDatabase.get_instance().delete_deployment_result_with_device_id( - model_msg_object.run_id, model_msg_object.end_point_name, model_msg_object.model_name, - self.edge_id) - - # Delete FEDML_GLOBAL_ENDPOINT_RUN_ID_MAP_TAG-${run_id} both in redis and local db - ComputeCacheManager.get_instance().gpu_cache.delete_endpoint_run_id_map(str(model_msg_object.run_id)) - - # Delete FEDML_EDGE_ID_MODEL_DEVICE_ID_MAP_TAG-${run_id} both in redis and local db - ComputeCacheManager.get_instance().gpu_cache.delete_edge_model_id_map(str(model_msg_object.run_id)) - - # Delete FEDML_GLOBAL_DEVICE_RUN_GPU_IDS_TAG-${run_id}-${device_id} both in redis and local db - ComputeCacheManager.get_instance().gpu_cache.delete_device_run_gpu_ids(str(self.edge_id), - str(model_msg_object.run_id)) - - # Delete FEDML_GLOBAL_DEVICE_RUN_NUM_GPUS_TAG-${run_id}-${device_id} both in redis and local db - ComputeCacheManager.get_instance().gpu_cache.delete_device_run_num_gpus(str(self.edge_id), - str(model_msg_object.run_id)) - - # Delete FEDML_MODEL_REPLICA_GPU_IDS_TAG-${run_id}-${end_point_name}-${model_name}-${device_id}-* - FedMLModelCache.get_instance().set_redis_params() - FedMLModelCache.get_instance().delete_all_replica_gpu_ids(model_msg_object.run_id, - model_msg_object.end_point_name, - model_msg_object.model_name, self.edge_id) - - def exit_run_with_exception_entry(self): - try: - self.setup_client_mqtt_mgr() - self.exit_run_with_exception() - except Exception as e: - self.release_client_mqtt_mgr() - sys.exit(1) - finally: - self.release_client_mqtt_mgr() - - def exit_run_with_exception(self): - logging.info("Exit run successfully.") - - ClientConstants.cleanup_learning_process(self.run_id) - ClientConstants.cleanup_run_process(self.run_id) - - self.mlops_metrics.report_client_id_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED, - is_from_model=True, run_id=self.run_id) - - time.sleep(1) - - def callback_exit_train_with_exception(self, topic, payload): - request_json = json.loads(payload) - is_retain = request_json.get("is_retain", False) - if is_retain: - return - run_id = request_json.get("runId", None) - if run_id is None: - run_id = request_json.get("run_id", None) - if run_id is None: - run_id = request_json.get("id", None) - - if run_id is None: - return - - # Stop client with multiprocessing mode - self.request_json = request_json - client_runner = FedMLClientRunner( - self.args, edge_id=self.edge_id, request_json=request_json, 
agent_config=self.agent_config, run_id=run_id - ) - try: - Process(target=client_runner.exit_run_with_exception_entry).start() - except Exception as e: - pass - - def cleanup_client_with_status(self): - self.setup_client_mqtt_mgr() - - if self.device_status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED: - self.cleanup_run_when_finished() - elif self.device_status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED: - self.cleanup_run_when_starting_failed() - - self.release_client_mqtt_mgr() - - def callback_runner_id_status(self, topic, payload): - # logging.info("callback_runner_id_status: topic = %s, payload = %s" % (topic, payload)) - - request_json = json.loads(payload) - run_id = request_json["run_id"] - edge_id = request_json["edge_id"] - status = request_json["status"] - - self.save_training_status(edge_id, status) - - if status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED or \ - status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED: - # Stop client with multiprocessing mode - self.request_json = request_json - client_runner = FedMLClientRunner( - self.args, - edge_id=self.edge_id, - request_json=request_json, - agent_config=self.agent_config, - run_id=run_id, - ) - client_runner.device_status = status - status_process = Process(target=client_runner.cleanup_client_with_status) - status_process.start() - status_process.join(15) - - # Stop log processor for current run - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, edge_id) - - def callback_report_current_status(self, topic, payload): - self.send_agent_active_msg() - - @staticmethod - def process_ota_upgrade_msg(): - os.system("pip install -U fedml") - - def callback_client_ota_msg(self, topic, payload): - request_json = json.loads(payload) - cmd = request_json["cmd"] - - if cmd == ClientConstants.FEDML_OTA_CMD_UPGRADE: - FedMLClientRunner.process_ota_upgrade_msg() - # Process(target=FedMLClientRunner.process_ota_upgrade_msg).start() - raise Exception("After upgraded, restart runner...") - elif cmd == ClientConstants.FEDML_OTA_CMD_RESTART: - raise Exception("Restart runner...") - - def save_training_status(self, edge_id, training_status): - self.current_training_status = training_status - ClientConstants.save_training_infos(edge_id, training_status) - - @staticmethod - def get_device_id(): - device_file_path = os.path.join(ClientConstants.get_data_dir(), - ClientConstants.LOCAL_RUNNER_INFO_DIR_NAME) - file_for_device_id = os.path.join(device_file_path, "devices.id") - if not os.path.exists(device_file_path): - os.makedirs(device_file_path) - elif os.path.exists(file_for_device_id): - with open(file_for_device_id, 'r', encoding='utf-8') as f: - device_id_from_file = f.readline() - if device_id_from_file is not None and device_id_from_file != "": - return device_id_from_file - - if platform.system() == "Darwin": - cmd_get_serial_num = "system_profiler SPHardwareDataType | grep Serial | awk '{gsub(/ /,\"\")}{print}' " \ - "|awk -F':' '{print $2}' " - device_id = os.popen(cmd_get_serial_num).read() - device_id = device_id.replace('\n', '').replace(' ', '') - if device_id is None or device_id == "": - device_id = hex(uuid.getnode()) - else: - device_id = "0x" + device_id - else: - if "nt" in os.name: - - def get_uuid(): - guid = "" - try: - cmd = "wmic csproduct get uuid" - guid = str(subprocess.check_output(cmd)) - pos1 = guid.find("\\n") + 2 - guid = guid[pos1:-15] - except Exception as ex: - pass - return str(guid) - - device_id = str(get_uuid()) - logging.info(device_id) - elif "posix" in 
os.name: - device_id = sys_utils.get_device_id_in_docker() - if device_id is None: - device_id = hex(uuid.getnode()) - else: - device_id = sys_utils.run_subprocess_open( - "hal-get-property --udi /org/freedesktop/Hal/devices/computer --key system.hardware.uuid".split() - ) - device_id = hex(device_id) - - if device_id is not None and device_id != "": - with open(file_for_device_id, 'w', encoding='utf-8') as f: - f.write(device_id) - else: - device_id = hex(uuid.uuid4()) - with open(file_for_device_id, 'w', encoding='utf-8') as f: - f.write(device_id) - - return device_id - - def get_ip_address(self, request_json): - # OPTION 1: Use local ip - ip = ClientConstants.get_local_ip() - - # OPTION 2: Auto detect public ip - if "parameters" in request_json and \ - ClientConstants.AUTO_DETECT_PUBLIC_IP in request_json["parameters"] and \ - request_json["parameters"][ClientConstants.AUTO_DETECT_PUBLIC_IP]: - ip = ClientConstants.get_public_ip() - logging.info("Auto detect public ip for worker: " + ip) - - # OPTION 3: Use user indicated ip - if self.infer_host is not None and self.infer_host != "127.0.0.1" and self.infer_host != "localhost": - ip = self.infer_host - - return ip - - def bind_account_and_device_id(self, url, account_id, device_id, os_name, role="md.on_premise_device"): - ip = requests.get('https://checkip.amazonaws.com').text.strip() - fedml_ver, exec_path, os_ver, cpu_info, python_ver, torch_ver, mpi_installed, \ - cpu_usage, available_mem, total_mem, gpu_info, gpu_available_mem, gpu_total_mem, \ - gpu_count, gpu_vendor, cpu_count, gpu_device_name = get_sys_runner_info() - host_name = sys_utils.get_host_name() - json_params = { - "accountid": account_id, - "deviceid": device_id, - "state": ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE, - "status": ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE, - "type": os_name, - "processor": cpu_info, - "core_type": cpu_info, - "network": "", - "role": role, - "os_ver": os_ver, - "memory": total_mem, - "ip": ip, - "extra_infos": {"fedml_ver": fedml_ver, "exec_path": exec_path, "os_ver": os_ver, - "cpu_info": cpu_info, "python_ver": python_ver, "torch_ver": torch_ver, - "mpi_installed": mpi_installed, "cpu_usage": cpu_usage, - "available_mem": available_mem, "total_mem": total_mem, - "cpu_count": cpu_count, "gpu_count": 0, "host_name": host_name} - } - if gpu_count > 0: - if gpu_total_mem is not None: - json_params["gpu"] = gpu_info if gpu_info is not None else "" + ", Total GPU Memory: " + gpu_total_mem - else: - json_params["gpu"] = gpu_info if gpu_info is not None else "" - json_params["extra_infos"]["gpu_info"] = gpu_info if gpu_info is not None else "" - if gpu_available_mem is not None: - json_params["extra_infos"]["gpu_available_mem"] = gpu_available_mem - if gpu_total_mem is not None: - json_params["extra_infos"]["gpu_total_mem"] = gpu_total_mem - - json_params["extra_infos"]["gpu_count"] = gpu_count - json_params["extra_infos"]["gpu_vendor"] = gpu_vendor - json_params["extra_infos"]["gpu_device_name"] = gpu_device_name - - gpu_available_id_list = sys_utils.get_available_gpu_id_list(limit=gpu_count) - gpu_available_count = len(gpu_available_id_list) if gpu_available_id_list is not None else 0 - gpu_list = sys_utils.get_gpu_list() - json_params["extra_infos"]["gpu_available_count"] = gpu_available_count - json_params["extra_infos"]["gpu_available_id_list"] = gpu_available_id_list - json_params["extra_infos"]["gpu_list"] = gpu_list - else: - json_params["gpu"] = "None" - json_params["extra_infos"]["gpu_available_count"] = 0 - 
json_params["extra_infos"]["gpu_available_id_list"] = [] - json_params["extra_infos"]["gpu_list"] = [] - - _, cert_path = MLOpsConfigs.get_request_params() - if cert_path is not None: - try: - requests.session().verify = cert_path - response = requests.post( - url, json=json_params, verify=True, - headers={"content-type": "application/json", "Connection": "close"} - ) - except requests.exceptions.SSLError as err: - MLOpsConfigs.install_root_ca_file() - response = requests.post( - url, json=json_params, verify=True, - headers={"content-type": "application/json", "Connection": "close"} - ) - else: - response = requests.post(url, json=json_params, headers={"Connection": "close"}) - edge_id = -1 - user_name = None - extra_url = None - if response.status_code != 200: - print(f"Binding to MLOps with response.status_code = {response.status_code}, " - f"response.content: {response.content}") - pass - else: - # print("url = {}, response = {}".format(url, response)) - status_code = response.json().get("code") - if status_code == "SUCCESS": - edge_id = response.json().get("data").get("id") - user_name = response.json().get("data").get("userName", None) - extra_url = response.json().get("data").get("url", None) - if edge_id is None or edge_id <= 0: - print(f"Binding to MLOps with response.status_code = {response.status_code}, " - f"response.content: {response.content}") - else: - if status_code == SchedulerConstants.BINDING_ACCOUNT_NOT_EXIST_ERROR: - raise SystemExit(SchedulerConstants.BINDING_ACCOUNT_NOT_EXIST_ERROR) - print(f"Binding to MLOps with response.status_code = {response.status_code}, " - f"response.content: {response.content}") - return -1, None, None - return edge_id, user_name, extra_url - - def fetch_configs(self): - return MLOpsConfigs.fetch_all_configs() - - def send_agent_active_msg(self): - active_topic = "flclient_agent/active" - status = MLOpsStatus.get_instance().get_client_agent_status(self.edge_id) - if ( - status is not None - and status != ClientConstants.MSG_MLOPS_CLIENT_STATUS_OFFLINE - and status != ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE - ): - return - - try: - current_job = FedMLClientDataInterface.get_instance().get_job_by_id(self.run_id) - except Exception as e: - current_job = None - if current_job is None: - if status is not None and status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_OFFLINE: - status = ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE - else: - return - else: - status = ClientConstants.get_device_state_from_run_edge_state(current_job.status) - active_msg = {"ID": self.edge_id, "status": status} - MLOpsStatus.get_instance().set_client_agent_status(self.edge_id, status) - self.mqtt_mgr.send_message_json(active_topic, json.dumps(active_msg)) - - def recover_start_deployment_msg_after_upgrading(self): - try: - current_job = FedMLClientDataInterface.get_instance().get_current_job() - if current_job is not None and \ - current_job.status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING: - logging.info("start deployment after upgrading.") - topic_start_deployment = "model_ops/model_device/start_deployment/{}".format(str(self.edge_id)) - self.callback_start_deployment(topic_start_deployment, current_job.running_json) - except Exception as e: - logging.info("recover starting deployment message after upgrading: {}".format(traceback.format_exc())) - - def on_agent_mqtt_connected(self, mqtt_client_object): - # The MQTT message topic format is as follows: // - - # Setup MQTT message listener for starting deployment - topic_start_deployment = 
"model_ops/model_device/start_deployment/{}".format(str(self.edge_id)) - self.mqtt_mgr.add_message_listener(topic_start_deployment, self.callback_start_deployment) - - # Setup MQTT message listener for delete deployment - topic_delete_deployment = "model_ops/model_device/delete_deployment/{}".format(str(self.edge_id)) - self.mqtt_mgr.add_message_listener(topic_delete_deployment, self.callback_delete_deployment) - - # Setup MQTT message listener for running failed - topic_exit_train_with_exception = "flserver_agent/" + str(self.edge_id) + "/exit_train_with_exception" - self.mqtt_mgr.add_message_listener(topic_exit_train_with_exception, self.callback_exit_train_with_exception) - - # Setup MQTT message listener for client status switching - topic_client_status = "fl_client/flclient_agent_" + str(self.edge_id) + "/status" - self.mqtt_mgr.add_message_listener(topic_client_status, self.callback_runner_id_status) - - # Setup MQTT message listener to report current device status. - topic_report_status = "mlops/report_device_status" - self.mqtt_mgr.add_message_listener(topic_report_status, self.callback_report_current_status) - - # Setup MQTT message listener to OTA messages from the MLOps. - topic_ota_msg = "mlops/flclient_agent_" + str(self.edge_id) + "/ota" - self.mqtt_mgr.add_message_listener(topic_ota_msg, self.callback_client_ota_msg) - - if self.mqtt_inference_obj is None: - self.mqtt_inference_obj = FedMLMqttInference(agent_config=self.agent_config, mqtt_mgr=self.mqtt_mgr) - self.mqtt_inference_obj.setup_listener_for_endpoint_inference_request(self.edge_id) - - # Subscribe topics for starting deployment, stopping deployment and fetching client status. - mqtt_client_object.subscribe(topic_start_deployment, qos=2) - mqtt_client_object.subscribe(topic_delete_deployment, qos=2) - mqtt_client_object.subscribe(topic_client_status, qos=2) - mqtt_client_object.subscribe(topic_report_status, qos=2) - mqtt_client_object.subscribe(topic_exit_train_with_exception, qos=2) - mqtt_client_object.subscribe(topic_ota_msg, qos=2) - - self.subscribed_topics.clear() - self.subscribed_topics.append(topic_start_deployment) - self.subscribed_topics.append(topic_delete_deployment) - self.subscribed_topics.append(topic_client_status) - self.subscribed_topics.append(topic_report_status) - self.subscribed_topics.append(topic_exit_train_with_exception) - self.subscribed_topics.append(topic_ota_msg) - - # Broadcast the first active message. 
- self.send_agent_active_msg() - - # Echo results - # print("\n\nCongratulations, your device is connected to the FedML MLOps platform successfully!") - # print( - # "Your FedML Edge ID is " + str(self.edge_id) + ", unique device ID is " - # + str(self.unique_device_id) - # + "\n" - # ) - - MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) - - def on_agent_mqtt_disconnected(self, mqtt_client_object): - MLOpsStatus.get_instance().set_client_agent_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_OFFLINE - ) - - try: - if self.mqtt_inference_obj is not None: - self.mqtt_inference_obj.remove_listener_for_endpoint_inference_request(self.edge_id) - except Exception as e: - pass - - def setup_agent_mqtt_connection(self, service_config): - # Setup MQTT connection - self.mqtt_mgr = MqttManager( - service_config["mqtt_config"]["BROKER_HOST"], - service_config["mqtt_config"]["BROKER_PORT"], - service_config["mqtt_config"]["MQTT_USER"], - service_config["mqtt_config"]["MQTT_PWD"], - service_config["mqtt_config"]["MQTT_KEEPALIVE"], - "FedML_ModelClientAgent_Daemon_@" + self.user_name + "@_" + self.args.current_device_id + str(uuid.uuid4()), - "flclient_agent/last_will_msg", - json.dumps({"ID": self.edge_id, "status": ClientConstants.MSG_MLOPS_CLIENT_STATUS_OFFLINE}) - ) - self.agent_config = service_config - - # Init local database - FedMLClientDataInterface.get_instance().create_job_table() - try: - FedMLModelDatabase.get_instance().set_database_base_dir(ClientConstants.get_database_dir()) - FedMLModelDatabase.get_instance().create_table() - except Exception as e: - pass - - client_api_cmd = "fedml.computing.scheduler.model_scheduler.device_client_api:api" - client_api_pids = RunProcessUtils.get_pid_from_cmd_line(client_api_cmd) - if client_api_pids is None or len(client_api_pids) <= 0: - # Start local API services - cur_dir = os.path.dirname(__file__) - fedml_base_dir = os.path.dirname(os.path.dirname(os.path.dirname(cur_dir))) - python_program = get_python_program() - self.local_api_process = ClientConstants.exec_console_with_script( - "{} -m uvicorn {} --host 0.0.0.0 --port {} --reload --reload-delay 3 --reload-dir {} " - "--log-level critical".format( - python_program, client_api_cmd, - ClientConstants.LOCAL_CLIENT_API_PORT, fedml_base_dir - ), - should_capture_stdout=False, - should_capture_stderr=False - ) - # if self.local_api_process is not None and self.local_api_process.pid is not None: - # print(f"Model worker local API process id {self.local_api_process.pid}") - - # MLOpsRuntimeLogDaemon.get_instance(self.args).stop_all_log_processor() - - # Setup MQTT connected listener - self.mqtt_mgr.add_connected_listener(self.on_agent_mqtt_connected) - self.mqtt_mgr.add_disconnected_listener(self.on_agent_mqtt_disconnected) - self.mqtt_mgr.connect() - - self.setup_client_mqtt_mgr() - self.mlops_metrics.report_client_training_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE, is_from_model=True) - MLOpsStatus.get_instance().set_client_agent_status(self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE) - - self.recover_start_deployment_msg_after_upgrading() - - def stop_agent(self): - if self.run_process_event is not None: - self.run_process_event.set() - - if self.mqtt_mgr is not None: - try: - for topic in self.subscribed_topics: - self.mqtt_mgr.unsubscribe_msg(topic) - except Exception as e: - pass - - self.mqtt_mgr.loop_stop() - self.mqtt_mgr.disconnect() - - self.release_client_mqtt_mgr() - - def start_agent_mqtt_loop(self, 
should_exit_sys=False): - # Start MQTT message loop - try: - self.mqtt_mgr.loop_forever() - except Exception as e: - if str(e) == "Restarting after upgraded...": - logging.info("Restarting after upgraded...") - else: - logging.info("Client tracing: {}".format(traceback.format_exc())) - finally: - self.stop_agent() - - if should_exit_sys: - time.sleep(5) - sys.exit(1) diff --git a/python/examples/deploy/quick_start/src/app/__init__.py b/python/fedml/computing/scheduler/model_scheduler/device_client_runner_deprecated.py old mode 100644 new mode 100755 similarity index 100% rename from python/examples/deploy/quick_start/src/app/__init__.py rename to python/fedml/computing/scheduler/model_scheduler/device_client_runner_deprecated.py diff --git a/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py b/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py index e711a9e6a6..28d50d5a50 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_http_inference_protocol.py @@ -1,13 +1,14 @@ -import traceback -from typing import Mapping -from urllib.parse import urlparse +import logging import httpx +import traceback from .device_client_constants import ClientConstants -import requests + from fastapi.responses import Response from fastapi.responses import StreamingResponse +from urllib.parse import urlparse +from typing import Mapping class FedMLHttpInference: @@ -15,15 +16,17 @@ def __init__(self): pass @staticmethod - async def is_inference_ready(inference_url, timeout=None): - ''' + async def is_inference_ready(inference_url, path="ready", timeout=None): + """ True: inference is ready False: cannot be reached, will try other protocols None: can be reached, but not ready - ''' + """ url_parsed = urlparse(inference_url) - ready_url = f"http://{url_parsed.hostname}:{url_parsed.port}/ready" + ready_url = f"http://{url_parsed.hostname}:{url_parsed.port}/{path}" response_ok = False + + # TODO (Raphael): Support more methods and return codes rules. 
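Aside: the `is_inference_ready` change above makes the probe path configurable and keeps a tri-state result. A condensed, standalone sketch of that behavior (not the project function itself), assuming a plain HTTP endpoint on the worker:

```python
import asyncio
from urllib.parse import urlparse

import httpx


async def is_inference_ready(inference_url, path="ready", timeout=None):
    """True: ready; False: cannot be reached; None: reachable but not ready."""
    parsed = urlparse(inference_url)
    ready_url = f"http://{parsed.hostname}:{parsed.port}/{path}"
    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(ready_url, timeout=timeout)
        return True if response.status_code == 200 else None
    except Exception:
        return False


if __name__ == "__main__":
    # Placeholder worker URL; 2345 matches the default container port used elsewhere.
    print(asyncio.run(is_inference_ready("http://127.0.0.1:2345/predict", timeout=2)))
```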
try: async with httpx.AsyncClient() as client: ready_response = await client.get(url=ready_url, timeout=timeout) @@ -47,9 +50,8 @@ async def is_inference_ready(inference_url, timeout=None): @staticmethod async def run_http_inference_with_curl_request( inference_url, inference_input_list, inference_output_list, - inference_type="default", engine_type="default", timeout=None + inference_type="default", engine_type="default", timeout=None, method="POST" ): - model_inference_result = {} if inference_type == "default": model_api_headers = {'Content-Type': 'application/json', 'Connection': 'close', 'Accept': 'application/json'} @@ -64,11 +66,10 @@ async def run_http_inference_with_curl_request( "outputs": inference_output_list } - response_ok = False try: if model_inference_json.get("stream", False): model_inference_result = StreamingResponse( - stream_generator(inference_url, input_json=model_inference_json), + stream_generator(inference_url, input_json=model_inference_json, method=method), media_type="text/event-stream", headers={ "Content-Type": model_api_headers.get("Accept", "text/event-stream"), @@ -77,8 +78,8 @@ async def run_http_inference_with_curl_request( ) response_ok = True else: - response_ok, model_inference_result = await redirect_request_to_worker( - inference_type, inference_url, model_api_headers, model_inference_json, timeout) + response_ok, model_inference_result = await redirect_non_stream_req_to_worker( + inference_type, inference_url, model_api_headers, model_inference_json, timeout, method=method) except Exception as e: response_ok = False model_inference_result = {"response": f"{traceback.format_exc()}"} @@ -86,21 +87,22 @@ async def run_http_inference_with_curl_request( return response_ok, model_inference_result -async def stream_generator(inference_url, input_json): +async def stream_generator(inference_url, input_json, method="POST"): async with httpx.AsyncClient() as client: - async with client.stream("POST", inference_url, json=input_json, + async with client.stream(method, inference_url, json=input_json, timeout=ClientConstants.WORKER_STREAM_API_TIMEOUT) as response: async for chunk in response.aiter_lines(): # we consumed a newline, need to put it back yield f"{chunk}\n" -async def redirect_request_to_worker(inference_type, inference_url, model_api_headers, model_inference_json, timeout=None): +async def redirect_non_stream_req_to_worker(inference_type, inference_url, model_api_headers, model_inference_json, + timeout=None, method="POST"): response_ok = True try: async with httpx.AsyncClient() as client: - response = await client.post( - url=inference_url, headers=model_api_headers, json=model_inference_json, timeout=timeout + response = await client.request( + method=method, url=inference_url, headers=model_api_headers, json=model_inference_json, timeout=timeout ) except Exception as e: response_ok = False @@ -108,13 +110,17 @@ async def redirect_request_to_worker(inference_type, inference_url, model_api_he return response_ok, model_inference_result if response.status_code == 200: - if inference_type == "default": - model_inference_result = response.json() - elif inference_type == "image/png": - binary_content: bytes = response.content - model_inference_result = Response(content=binary_content, media_type="image/png") - else: - model_inference_result = response.json() + try: + if inference_type == "image/png": + # wrapped media type for image + binary_content: bytes = response.content + model_inference_result = Response(content=binary_content, 
media_type=inference_type) + else: + model_inference_result = response.json() + except Exception as e: + response_ok = True + logging.warning(f"Status code 200, but cannot trans response to json due to: {e}.") + model_inference_result = {"response": f"{response.content}"} else: model_inference_result = {"response": f"{response.content}"} diff --git a/python/fedml/computing/scheduler/model_scheduler/device_http_proxy_inference_protocol.py b/python/fedml/computing/scheduler/model_scheduler/device_http_proxy_inference_protocol.py index 53f5a002eb..180b10994b 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_http_proxy_inference_protocol.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_http_proxy_inference_protocol.py @@ -50,9 +50,11 @@ async def run_http_proxy_inference_with_request( endpoint_id, inference_url, inference_input_list, inference_output_list, inference_type="default", timeout=None + # TODO(Raphael): Add support for GET and other methods ): inference_response = {} - http_proxy_url = f"http://{urlparse(inference_url).hostname}:{ClientConstants.LOCAL_CLIENT_API_PORT}/api/v1/predict" + worker_proxy_port = ClientConstants.get_inference_worker_proxy_port() + http_proxy_url = f"http://{urlparse(inference_url).hostname}:{worker_proxy_port}/api/v1/predict" if inference_type == "default": model_api_headers = {'Content-Type': 'application/json', 'Connection': 'close', 'Accept': 'application/json'} diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py index edcdf7d0f1..7a47c1961e 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_cache.py @@ -33,6 +33,8 @@ class FedMLModelCache(Singleton): FEDML_KEY_COUNT_PER_SCAN = 1000 + FEDML_PENDING_REQUESTS_COUNTER = "FEDML_PENDING_REQUESTS_COUNTER" + def __init__(self): if not hasattr(self, "redis_pool"): self.redis_pool = None @@ -110,7 +112,8 @@ def set_user_setting_replica_num(self, end_point_id, replica_num: int, enable_auto_scaling: bool = False, scale_min: int = 0, scale_max: int = 0, state: str = "UNKNOWN", target_queries_per_replica: int = 60, aggregation_window_size_seconds: int = 60, - scale_down_delay_seconds: int = 120 + scale_down_delay_seconds: int = 120, timeout_s: int = 30, + user_encrypted_api_key: str = "" ) -> bool: """ Key: FEDML_MODEL_ENDPOINT_REPLICA_USER_SETTING_TAG-- @@ -136,7 +139,9 @@ def set_user_setting_replica_num(self, end_point_id, "scale_min": scale_min, "scale_max": scale_max, "state": state, "target_queries_per_replica": target_queries_per_replica, "aggregation_window_size_seconds": aggregation_window_size_seconds, - "scale_down_delay_seconds": scale_down_delay_seconds + "scale_down_delay_seconds": scale_down_delay_seconds, + ServerConstants.INFERENCE_REQUEST_TIMEOUT_KEY: timeout_s, + ServerConstants.USER_ENCRYPTED_API_KEY: user_encrypted_api_key } try: self.redis_connection.set(self.get_user_setting_replica_num_key(end_point_id), json.dumps(replica_num_dict)) @@ -166,6 +171,15 @@ def update_user_setting_replica_num(self, end_point_id: str, state: str = "UNKNO return False return True + def get_user_encrypted_api_key(self, end_point_id: str) -> str: + try: + replica_num_dict = self.redis_connection.get(self.get_user_setting_replica_num_key(end_point_id)) + replica_num_dict = json.loads(replica_num_dict) + return replica_num_dict.get(ServerConstants.USER_ENCRYPTED_API_KEY, "") + except 
Exception as e: + logging.error(e) + return "" + def get_all_endpoints_user_setting(self) -> List[dict]: """ Return a list of dict, each dict is the user setting of an endpoint. @@ -236,7 +250,7 @@ def delete_deployment_result(self, element: str, end_point_id, end_point_name, m device_id, replica_no, _ = self.get_result_item_info(element) self.model_deployment_db.delete_deployment_result_with_device_id_and_rank( - end_point_id, end_point_name, model_name, device_id, replica_rank=replica_no-1) + end_point_id, end_point_name, model_name, device_id, replica_rank=replica_no - 1) return @@ -272,8 +286,9 @@ def get_deployment_result_list(self, end_point_id, end_point_name, model_name): result_list = self.model_deployment_db.get_deployment_result_list(end_point_id, end_point_name, model_name) try: for result in result_list: - self.redis_connection.rpush(self.get_deployment_result_key(end_point_id, end_point_name, model_name), - json.dumps(result)) + self.redis_connection.rpush( + self.get_deployment_result_key(end_point_id, end_point_name, model_name), + json.dumps(result)) except Exception as e: logging.info(e) pass @@ -288,7 +303,27 @@ def get_all_deployment_result_list(self): result_list.extend(self.redis_connection.lrange(key, 0, -1)) except Exception as e: logging.error(e) - # TODO(Raphael): Use Sqlite for the replica backup + + # Get cached results from the persist sqlite database + if len(result_list) <= 0: + db_result_list = list() + try: + db_result_list = self.model_deployment_db.get_all_deployment_results_list() + except Exception as e: + logging.error(f"Failed to get all deployment results from the database due to {e}") + pass + + for result in db_result_list: + try: + self.redis_connection.rpush(self.get_deployment_result_key( + result["end_point_id"], result["end_point_name"], result["model_name"]), + json.dumps(result["replica_info"])) + except Exception as e: + logging.error(e) + pass + + for result in db_result_list: + result_list.append(result["replica_info"]) return result_list @@ -298,7 +333,8 @@ def get_deployment_result_list_size(self, end_point_id, end_point_name, model_na def get_deployment_status_list(self, end_point_id, end_point_name, model_name): try: - status_list = self.redis_connection.lrange(self.get_deployment_status_key(end_point_id, end_point_name, model_name), 0, -1) + status_list = self.redis_connection.lrange( + self.get_deployment_status_key(end_point_id, end_point_name, model_name), 0, -1) except Exception as e: status_list = None @@ -306,8 +342,9 @@ def get_deployment_status_list(self, end_point_id, end_point_name, model_name): status_list = self.model_deployment_db.get_deployment_status_list(end_point_id, end_point_name, model_name) try: for status in status_list: - self.redis_connection.rpush(self.get_deployment_status_key(end_point_id, end_point_name, model_name), - json.dumps(status)) + self.redis_connection.rpush( + self.get_deployment_status_key(end_point_id, end_point_name, model_name), + json.dumps(status)) except Exception as e: pass return status_list @@ -316,7 +353,8 @@ def get_deployment_status_list_size(self, end_point_id, end_point_name, model_na status_list = self.get_deployment_status_list(end_point_id, end_point_name, model_name) return len(status_list) - def get_status_item_info(self, status_item): + @staticmethod + def get_status_item_info(status_item): status_item_json = json.loads(status_item) if isinstance(status_item_json, str): status_item_json = json.loads(status_item_json) @@ -327,13 +365,14 @@ def get_status_item_info(self, 
status_item): status_payload = status_item_json["status"] return device_id, status_payload - def get_result_item_info(self, result_item): + @staticmethod + def get_result_item_info(result_item): result_item_json = json.loads(result_item) if isinstance(result_item_json, str): result_item_json = json.loads(result_item_json) device_id = result_item_json["cache_device_id"] - replica_no = result_item_json.get("cache_replica_no", 1) # Compatible with the old version + replica_no = result_item_json.get("cache_replica_no", 1) # Compatible with the old version if isinstance(result_item_json["result"], str): result_payload = json.loads(result_item_json["result"]) @@ -341,9 +380,13 @@ def get_result_item_info(self, result_item): result_payload = result_item_json["result"] return device_id, replica_no, result_payload - def get_idle_device(self, end_point_id, end_point_name, - model_name, model_version, - check_end_point_status=True, limit_specific_model_version=False): + def get_idle_device(self, + end_point_id, + end_point_name, + model_name, + model_version, + check_end_point_status=True, + limit_specific_model_version=False): # Deprecated the model status logic, query directly from the deployment result list idle_device_list = list() @@ -360,15 +403,15 @@ def get_idle_device(self, end_point_id, end_point_name, found_model_name == model_name and (not limit_specific_model_version or found_model_version == model_version)): if "model_status" in result_payload and result_payload["model_status"] == "DEPLOYED": - idle_device_list.append({"device_id": device_id, "end_point_id": end_point_id}) + idle_device_list.append({"device_id": device_id, "result_payload": result_payload.copy()}) - logging.info(f"{len(idle_device_list)} devices has this model on it: {idle_device_list}") + logging.debug(f"{len(idle_device_list)} devices this model has on it: {idle_device_list}") if len(idle_device_list) <= 0: return None, None # # Randomly shuffle - # shuffle the list of deployed devices and get the first one as the target idle device. + # the list of deployed devices and get the first one as the target idle device. # if len(idle_device_list) <= 0: # return None, None # shuffle(idle_device_list) @@ -376,54 +419,21 @@ def get_idle_device(self, end_point_id, end_point_name, # Round Robin total_device_num = len(idle_device_list) - redis_round_robin_key = self.get_round_robin_prev_device(end_point_id, end_point_name, model_name, model_version) - - selected_device_index = 0 - try: - if self.redis_connection.exists(redis_round_robin_key): - selected_device_index = int(self.redis_connection.get(redis_round_robin_key)) - selected_device_index %= total_device_num - else: - selected_device_index = 0 - next_selected_device_index = (selected_device_index + 1) % total_device_num - self.redis_connection.set(redis_round_robin_key, str(next_selected_device_index)) - except Exception as e: - logging.info("Inference Device selection Failed:") - logging.info(e) - - logging.info(f"Using Round Robin, the device index is {selected_device_index}") - idle_device_dict = idle_device_list[selected_device_index] - - # Note that within the same endpoint_id, there could be one device with multiple same models - same_model_device_rank = 0 - start = selected_device_index - while(start != 0 and idle_device_list[start]["device_id"] == idle_device_list[start-1]["device_id"]): - start -= 1 - same_model_device_rank += 1 - - # Find deployment result from the target idle device. 
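Aside: the refactor in this hunk replaces the old per-device scan with a round-robin cursor stored in Redis (read the previous index, take it modulo the replica count, persist the next index). A minimal sketch of that selection step, assuming a redis-py client and an illustrative key name:

```python
import redis


def pick_round_robin(r: redis.Redis, key: str, replicas: list) -> dict:
    """Pick the next replica in rotation, persisting the cursor in Redis."""
    total = len(replicas)
    if total == 0:
        raise ValueError("no deployed replicas")
    raw = r.get(key)
    selected = int(raw) % total if raw is not None else 0
    r.set(key, (selected + 1) % total)
    return replicas[selected]


if __name__ == "__main__":
    r = redis.Redis(host="localhost", port=6379, decode_responses=True)
    replicas = [{"device_id": 1}, {"device_id": 2}, {"device_id": 3}]
    key = "FEDML_MODEL_ROUND_ROBIN_PREVIOUS_DEVICE_TAG-demo"  # illustrative key
    for _ in range(4):
        print(pick_round_robin(r, key, replicas))
```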
- try: - for result_item in result_list: - logging.info("enter the for loop") - device_id, _, result_payload = self.get_result_item_info(result_item) - found_end_point_id = result_payload["end_point_id"] - found_end_point_name = result_payload["end_point_name"] - found_model_status = result_payload["model_status"] + redis_round_robin_key = self.get_round_robin_prev_device(end_point_id, end_point_name, model_name, + model_version) + if self.redis_connection.exists(redis_round_robin_key): + selected_device_index = int(self.redis_connection.get(redis_round_robin_key)) + selected_device_index %= total_device_num + else: + selected_device_index = 0 - if found_model_status != "DEPLOYED": - continue + next_idx = (selected_device_index + 1) % total_device_num + self.redis_connection.set(redis_round_robin_key, next_idx) - if str(found_end_point_id) == str(idle_device_dict["end_point_id"]) \ - and device_id == idle_device_dict["device_id"]: - if same_model_device_rank > 0: - same_model_device_rank -= 1 - continue - logging.info(f"The chosen device is {device_id}") - return result_payload, device_id - except Exception as e: - logging.info(str(e)) + idle_device_info = idle_device_list[selected_device_index] + payload = idle_device_info["result_payload"] - return None, None + return payload, idle_device_info["device_id"] def get_latest_version(self, status_list): latest_version = None @@ -432,8 +442,8 @@ def get_latest_version(self, status_list): try: _, status_payload = self.get_status_item_info(status_item) model_version = status_payload["model_version"] - prefix = model_version.split("-")[0] # version-date - prefix_int = int(prefix[1:]) # v12 -> 12 + prefix = model_version.split("-")[0] # version-date + prefix_int = int(prefix[1:]) # v12 -> 12 if latest_version is None: latest_version = model_version @@ -528,32 +538,47 @@ def delete_end_point(self, end_point_id, end_point_name, model_name, model_versi # Device id is either deploy master or deploy worker try: logging.info("Will Delete the related redis keys permanently") - self.redis_connection.expire(self.get_deployment_result_key(end_point_id, end_point_name, model_name), ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) - self.redis_connection.expire(self.get_deployment_status_key(end_point_id, end_point_name, model_name), ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) - self.redis_connection.expire(self.get_monitor_metrics_key(end_point_id, end_point_name, model_name, model_version), ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) - self.redis_connection.expire(self.get_deployment_token_key(end_point_id, end_point_name, model_name), ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) - - any_version_round_robin_key = self.get_round_robin_prev_device_any_version(end_point_id, end_point_name, model_name) + self.redis_connection.expire(self.get_deployment_result_key(end_point_id, end_point_name, model_name), + ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) + self.redis_connection.expire(self.get_deployment_status_key(end_point_id, end_point_name, model_name), + ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) + self.redis_connection.expire( + self.get_monitor_metrics_key(end_point_id, end_point_name, model_name, model_version), + ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) + self.redis_connection.expire(self.get_deployment_token_key(end_point_id, end_point_name, model_name), + ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) + + any_version_round_robin_key = self.get_round_robin_prev_device_any_version(end_point_id, end_point_name, + model_name) for key in 
self.redis_connection.scan_iter(any_version_round_robin_key + "*"): self.redis_connection.expire(key, ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) - self.redis_connection.expire(self.get_deployment_device_info_key(end_point_id), ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) - self.redis_connection.expire(self.get_end_point_activation_key(end_point_id), ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) - self.redis_connection.expire(self.get_end_point_status_key(end_point_id), ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) - self.redis_connection.expire(self.get_user_setting_replica_num_key(end_point_id), ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) + self.redis_connection.expire(self.get_deployment_device_info_key(end_point_id), + ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) + self.redis_connection.expire(self.get_end_point_activation_key(end_point_id), + ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) + self.redis_connection.expire(self.get_end_point_status_key(end_point_id), + ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) + self.redis_connection.expire(self.get_user_setting_replica_num_key(end_point_id), + ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) # Delete all replicas gpu ids - matched_prefix_replica = self.get_replica_gpu_ids_key_all_replicas(end_point_id, end_point_name, model_name, device_id) + matched_prefix_replica = self.get_replica_gpu_ids_key_all_replicas(end_point_id, end_point_name, model_name, + device_id) for key in self.redis_connection.scan_iter(matched_prefix_replica + "*"): self.redis_connection.expire(key, ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) logging.info(f"Those keys are deleted: {key}") # Delete the compute gpu cache - self.redis_connection.expire(ComputeGpuCache.get_run_total_num_gpus_key(end_point_id), ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) - self.redis_connection.expire(ComputeGpuCache.get_run_total_num_gpus_key(end_point_id), ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) - self.redis_connection.expire(ComputeGpuCache.get_run_device_ids_key(end_point_id), ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) - self.redis_connection.expire(ComputeGpuCache.get_edge_model_id_map_key(end_point_id), ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) + self.redis_connection.expire(ComputeGpuCache.get_run_total_num_gpus_key(end_point_id), + ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) + self.redis_connection.expire(ComputeGpuCache.get_run_total_num_gpus_key(end_point_id), + ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) + self.redis_connection.expire(ComputeGpuCache.get_run_device_ids_key(end_point_id), + ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) + self.redis_connection.expire(ComputeGpuCache.get_edge_model_id_map_key(end_point_id), + ServerConstants.MODEL_CACHE_KEY_EXPIRE_TIME) logging.info(f"Those keys are deleted:" f"{ComputeGpuCache.get_endpoint_run_id_map_key(end_point_id)}, " @@ -663,7 +688,8 @@ def get_end_point_token(self, end_point_id, end_point_name, model_name): token = None try: if self.redis_connection.exists(self.get_deployment_token_key(end_point_id, end_point_name, model_name)): - token = self.redis_connection.get(self.get_deployment_token_key(end_point_id, end_point_name, model_name)) + token = self.redis_connection.get( + self.get_deployment_token_key(end_point_id, end_point_name, model_name)) except Exception as e: token = None @@ -671,7 +697,8 @@ def get_end_point_token(self, end_point_id, end_point_name, model_name): token = self.model_deployment_db.get_end_point_token(end_point_id, end_point_name, model_name) if token is not None: try: - 
self.redis_connection.set(self.get_deployment_token_key(end_point_id, end_point_name, model_name), token) + self.redis_connection.set(self.get_deployment_token_key(end_point_id, end_point_name, model_name), + token) except Exception as e: pass @@ -703,32 +730,41 @@ def get_endpoint_devices_replica_num(self, end_point_id): return replica_num - def get_deployment_result_key(self, end_point_id, end_point_name, model_name): - return "{}-{}-{}-{}".format(FedMLModelCache.FEDML_MODEL_DEPLOYMENT_RESULT_TAG, end_point_id, end_point_name, model_name) + @staticmethod + def get_deployment_result_key(end_point_id, end_point_name, model_name): + return "{}-{}-{}-{}".format(FedMLModelCache.FEDML_MODEL_DEPLOYMENT_RESULT_TAG, end_point_id, end_point_name, + model_name) - def get_deployment_status_key(self, end_point_id, end_point_name, model_name): - return "{}-{}-{}-{}".format(FedMLModelCache.FEDML_MODEL_DEPLOYMENT_STATUS_TAG, end_point_id, end_point_name, model_name) + @staticmethod + def get_deployment_status_key(end_point_id, end_point_name, model_name): + return "{}-{}-{}-{}".format(FedMLModelCache.FEDML_MODEL_DEPLOYMENT_STATUS_TAG, end_point_id, end_point_name, + model_name) - def get_end_point_status_key(self, end_point_id): + @staticmethod + def get_end_point_status_key(end_point_id): return "{}{}".format(FedMLModelCache.FEDML_MODEL_END_POINT_STATUS_TAG, end_point_id) @staticmethod def get_end_point_activation_key(end_point_id): return "{}{}".format(FedMLModelCache.FEDML_MODEL_END_POINT_ACTIVATION_TAG, end_point_id) - def get_deployment_device_info_key(self, end_point_id): + @staticmethod + def get_deployment_device_info_key(end_point_id): return "{}{}".format(FedMLModelCache.FEDML_MODEL_DEVICE_INFO_TAG, end_point_id) @staticmethod def get_deployment_token_key(end_point_id, end_point_name, model_name): - return "{}-{}-{}-{}".format(FedMLModelCache.FEDML_MODEL_END_POINT_TOKEN_TAG, end_point_id, end_point_name, model_name) + return "{}-{}-{}-{}".format(FedMLModelCache.FEDML_MODEL_END_POINT_TOKEN_TAG, end_point_id, end_point_name, + model_name) @staticmethod def get_deployment_token_key_eid(end_point_id): return "{}-{}".format(FedMLModelCache.FEDML_MODEL_END_POINT_TOKEN_TAG, end_point_id) - def get_round_robin_prev_device(self, end_point_id, end_point_name, model_name, version): - return "{}-{}-{}-{}-{}".format(FedMLModelCache.FEDML_MODEL_ROUND_ROBIN_PREVIOUS_DEVICE_TAG, end_point_id, end_point_name, model_name, version) + @staticmethod + def get_round_robin_prev_device(end_point_id, end_point_name, model_name, version): + return "{}-{}-{}-{}-{}".format(FedMLModelCache.FEDML_MODEL_ROUND_ROBIN_PREVIOUS_DEVICE_TAG, end_point_id, + end_point_name, model_name, version) @staticmethod def get_round_robin_prev_device_any_version(endpoint_id, endpoint_name, model_name): @@ -757,8 +793,9 @@ def set_monitor_metrics(self, end_point_id, end_point_name, "total_request_num": total_request_num, "current_qps": current_qps, "avg_qps": avg_qps, "timestamp": timestamp, "device_id": device_id} try: - self.redis_connection.rpush(self.get_monitor_metrics_key(end_point_id, end_point_name, model_name, model_version), - json.dumps(metrics_dict)) + self.redis_connection.rpush( + self.get_monitor_metrics_key(end_point_id, end_point_name, model_name, model_version), + json.dumps(metrics_dict)) except Exception as e: pass self.model_deployment_db.set_monitor_metrics(end_point_id, end_point_name, @@ -769,16 +806,20 @@ def set_monitor_metrics(self, end_point_id, end_point_name, def get_latest_monitor_metrics(self, end_point_id, 
end_point_name, model_name, model_version): try: - if self.redis_connection.exists(self.get_monitor_metrics_key(end_point_id, end_point_name, model_name, model_version)): - return self.redis_connection.lindex(self.get_monitor_metrics_key(end_point_id, end_point_name, model_name, model_version), -1) + if self.redis_connection.exists( + self.get_monitor_metrics_key(end_point_id, end_point_name, model_name, model_version)): + return self.redis_connection.lindex( + self.get_monitor_metrics_key(end_point_id, end_point_name, model_name, model_version), -1) except Exception as e: pass - metrics_dict = self.model_deployment_db.get_latest_monitor_metrics(end_point_id, end_point_name, model_name, model_version) + metrics_dict = self.model_deployment_db.get_latest_monitor_metrics(end_point_id, end_point_name, model_name, + model_version) if metrics_dict is not None: try: - self.redis_connection.rpush(self.get_monitor_metrics_key(end_point_id, end_point_name, model_name, model_version), - metrics_dict) + self.redis_connection.rpush( + self.get_monitor_metrics_key(end_point_id, end_point_name, model_name, model_version), + metrics_dict) except Exception as e: pass @@ -786,21 +827,25 @@ def get_latest_monitor_metrics(self, end_point_id, end_point_name, model_name, m def get_monitor_metrics_item(self, end_point_id, end_point_name, model_name, model_version, index): try: - if self.redis_connection.exists(self.get_monitor_metrics_key(end_point_id, end_point_name, model_name, model_version)): - metrics_item = self.redis_connection.lindex(self.get_monitor_metrics_key(end_point_id, end_point_name, model_name, - model_version), index) - return metrics_item, index+1 + if self.redis_connection.exists( + self.get_monitor_metrics_key(end_point_id, end_point_name, model_name, model_version)): + metrics_item = self.redis_connection.lindex( + self.get_monitor_metrics_key(end_point_id, end_point_name, model_name, + model_version), index) + return metrics_item, index + 1 except Exception as e: pass - metrics_dict = self.model_deployment_db.get_monitor_metrics_item(end_point_id, end_point_name, model_name, model_version, index) + metrics_dict = self.model_deployment_db.get_monitor_metrics_item(end_point_id, end_point_name, model_name, + model_version, index) if metrics_dict is not None: try: - self.redis_connection.rpush(self.get_monitor_metrics_key(end_point_id, end_point_name, model_name, model_version), - metrics_dict) + self.redis_connection.rpush( + self.get_monitor_metrics_key(end_point_id, end_point_name, model_name, model_version), + metrics_dict) except Exception as e: pass - return metrics_dict, index+1 + return metrics_dict, index + 1 return None, 0 @@ -824,38 +869,37 @@ def get_monitor_metrics_key(self, end_point_id, end_point_name, model_name, mode end_point_id, end_point_name, model_name, model_version) def get_endpoint_metrics(self, - endpoint_id, + end_point_id, k_recent=None) -> List[Any]: model_deployment_monitor_metrics = list() try: key_pattern = "{}*{}*".format( self.FEDML_MODEL_DEPLOYMENT_MONITOR_TAG, - endpoint_id) - model_deployment_monitor_endpoint_keys = \ + end_point_id) + model_deployment_monitor_endpoint_key = \ self.redis_connection.keys(pattern=key_pattern) # Since the reply is a list, we need to make sure the list # is non-empty otherwise the index will raise an error. 
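Aside: `get_endpoint_metrics` above fetches the k most recent entries of a Redis list by computing the start offset from `LLEN` and slicing with `LRANGE`. A small sketch of that access pattern with redis-py; the key name and demo payloads are illustrative only:

```python
import json

import redis


def get_recent_metrics(r: redis.Redis, key: str, k_recent=None):
    """Return the k most recent JSON entries of a Redis list (all if k_recent is None)."""
    num_records = r.llen(key)
    start = 0
    if k_recent and k_recent > 0:
        start = max(num_records - k_recent, 0)
    return [json.loads(item) for item in r.lrange(key, start, -1)]


if __name__ == "__main__":
    r = redis.Redis(decode_responses=True)
    demo_key = "FEDML_MODEL_DEPLOYMENT_MONITOR-demo"  # illustrative key
    for qps in (10, 20, 30):
        r.rpush(demo_key, json.dumps({"current_qps": qps}))
    print(get_recent_metrics(r, demo_key, k_recent=2))
```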
- if model_deployment_monitor_endpoint_keys: + if model_deployment_monitor_endpoint_key: model_deployment_monitor_endpoint_key = \ - model_deployment_monitor_endpoint_keys[0] - else: - raise Exception("Function `get_endpoint_metrics` Key {} does not exist." - .format(key_pattern)) - # Set start and end index depending on the size of the - # list and the requested number of most recent records. - num_records = self.redis_connection.llen(name=model_deployment_monitor_endpoint_key) - # if k_most_recent is None, then fetch all by default. - start, end = 0, -1 - # if k_most_recent is positive then fetch [-k_most_recent:] - if k_recent and k_recent > 0: - start = num_records - k_recent - model_deployment_monitor_metrics = \ - self.redis_connection.lrange( - name=model_deployment_monitor_endpoint_key, - start=start, - end=end) - model_deployment_monitor_metrics = [ - json.loads(m) for m in model_deployment_monitor_metrics] + model_deployment_monitor_endpoint_key[0] + + # Set start and end index depending on the size of the + # list and the requested number of most recent records. + num_records = self.redis_connection.llen( + name=model_deployment_monitor_endpoint_key) + # if k_most_recent is None, then fetch all by default. + start, end = 0, -1 + # if k_most_recent is positive then fetch [-k_most_recent:] + if k_recent and k_recent > 0: + start = num_records - k_recent + model_deployment_monitor_metrics = \ + self.redis_connection.lrange( + name=model_deployment_monitor_endpoint_key, + start=start, + end=end) + model_deployment_monitor_metrics = [ + json.loads(m) for m in model_deployment_monitor_metrics] except Exception as e: logging.error(e) @@ -868,24 +912,24 @@ def get_endpoint_replicas_results(self, endpoint_id) -> List[Any]: key_pattern = "{}*{}*".format( self.FEDML_MODEL_DEPLOYMENT_RESULT_TAG, endpoint_id) - model_deployment_result_key = \ + model_deployment_result_keys = \ self.redis_connection.keys(pattern=key_pattern) - if model_deployment_result_key: + if model_deployment_result_keys: model_deployment_result_key = \ - model_deployment_result_key[0] + model_deployment_result_keys[0] + replicas_results = \ + self.redis_connection.lrange( + name=model_deployment_result_key, + start=0, + end=-1) + # Format the result value to a properly formatted json. + for replica_idx, replica in enumerate(replicas_results): + replicas_results[replica_idx] = json.loads(replica) + replicas_results[replica_idx]["result"] = \ + json.loads(replicas_results[replica_idx]["result"]) else: raise Exception("Function `get_endpoint_replicas_results` Key {} does not exist." .format(key_pattern)) - replicas_results = \ - self.redis_connection.lrange( - name=model_deployment_result_key, - start=0, - end=-1) - - # Format the result value to a properly formatted json. 
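Aside: the replica-result formatting above needs two decode passes because each cached entry is a JSON object whose `result` field is itself a JSON string (the `cache_device_id` / `cache_replica_no` / `result` nesting also appears in the SQLite-backed `get_all_deployment_results_list` later in this patch). A tiny illustration with made-up data:

```python
import json

# Cached replica entries store the deployment result as a JSON string
# inside a JSON object, so two decode passes are needed (illustrative data).
cached_entry = json.dumps({
    "cache_device_id": 7,
    "cache_replica_no": 1,
    "result": json.dumps({"model_status": "DEPLOYED", "model_name": "demo_model"}),
})

entry = json.loads(cached_entry)                # outer object
entry["result"] = json.loads(entry["result"])   # nested result payload

print(entry["result"]["model_status"])          # -> DEPLOYED
```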
- for replica_idx, replica in enumerate(replicas_results): - replicas_results[replica_idx] = json.loads(replica) - replicas_results[replica_idx]["result"] = json.loads(replicas_results[replica_idx]["result"]) except Exception as e: logging.error(e) @@ -898,11 +942,16 @@ def get_endpoint_settings(self, endpoint_id) -> Dict: key_pattern = "{}*{}*".format( self.FEDML_MODEL_ENDPOINT_REPLICA_USER_SETTING_TAG, endpoint_id) - endpoint_settings = \ + + endpoint_settings_keys = \ self.redis_connection.keys(pattern=key_pattern) - if endpoint_settings: + + if len(endpoint_settings_keys) > 0: endpoint_settings = \ - json.load(endpoint_settings[0]) + self.redis_connection.get(endpoint_settings_keys[0]) + + if not isinstance(endpoint_settings, dict): + endpoint_settings = json.loads(endpoint_settings) else: raise Exception("Function `get_endpoint_settings` Key {} does not exist." .format(key_pattern)) @@ -966,3 +1015,22 @@ def delete_endpoint_scaling_down_decision_time(self, end_point_id) -> bool: return bool(self.redis_connection.hdel( self.FEDML_MODEL_ENDPOINT_SCALING_DOWN_DECISION_TIME_TAG, end_point_id)) + + def get_pending_requests_counter(self, end_point_id=None) -> int: + # If the endpoint does not exist inside the Hash collection, set its counter to 0. + if end_point_id and self.redis_connection.hexists(self.FEDML_PENDING_REQUESTS_COUNTER, end_point_id): + return int(self.redis_connection.hget(self.FEDML_PENDING_REQUESTS_COUNTER, end_point_id)) + return 0 + + def update_pending_requests_counter(self, end_point_id, increase=False, decrease=False) -> int: + if not self.redis_connection.hexists(self.FEDML_PENDING_REQUESTS_COUNTER, end_point_id): + self.redis_connection.hset(self.FEDML_PENDING_REQUESTS_COUNTER, mapping={end_point_id: 0}) + if increase: + self.redis_connection.hincrby(self.FEDML_PENDING_REQUESTS_COUNTER, end_point_id, 1) + if decrease: + # Careful on the negative, there is no native function for hash decreases. + self.redis_connection.hincrby(self.FEDML_PENDING_REQUESTS_COUNTER, end_point_id, -1) + # Making sure the counter never becomes negative! 
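Aside: the new pending-requests counter above lives in a single Redis hash and relies on `HINCRBY` for both increments and decrements, clamping at zero afterwards since Redis has no non-negative decrement. A compact sketch of that idea with redis-py (the endpoint id and demo usage are placeholders):

```python
import redis

PENDING_COUNTER_HASH = "FEDML_PENDING_REQUESTS_COUNTER"


def update_pending_requests(r: redis.Redis, end_point_id, delta: int) -> int:
    """Adjust the per-endpoint pending-request counter, clamping it at zero."""
    if not r.hexists(PENDING_COUNTER_HASH, end_point_id):
        r.hset(PENDING_COUNTER_HASH, end_point_id, 0)
    count = r.hincrby(PENDING_COUNTER_HASH, end_point_id, delta)
    if count < 0:
        # HINCRBY happily goes negative, so reset to zero explicitly.
        r.hset(PENDING_COUNTER_HASH, end_point_id, 0)
        count = 0
    return count


if __name__ == "__main__":
    r = redis.Redis(decode_responses=True)
    print(update_pending_requests(r, "endpoint-42", +1))  # request arrives
    print(update_pending_requests(r, "endpoint-42", -1))  # request completes
```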
+ if self.get_pending_requests_counter(end_point_id) < 0: + self.redis_connection.hset(self.FEDML_PENDING_REQUESTS_COUNTER, mapping={end_point_id: 0}) + return self.get_pending_requests_counter(end_point_id) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_cards.py b/python/fedml/computing/scheduler/model_scheduler/device_model_cards.py index 8feb757a63..8697d0a62c 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_cards.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_cards.py @@ -14,7 +14,6 @@ from fedml.core.common.singleton import Singleton from fedml.computing.scheduler.model_scheduler.modelops_configs import ModelOpsConfigs -from fedml.computing.scheduler.model_scheduler.device_model_deployment import get_model_info from fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants from fedml.computing.scheduler.model_scheduler.device_model_object import FedMLModelList, FedMLEndpointDetail from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants @@ -785,7 +784,7 @@ def push_model_to_s3(self, model_name, model_zip_path, user_id, show_progress=Tr args = {"config_version": self.config_version} _, s3_config = ModelOpsConfigs.get_instance(args).fetch_configs(self.config_version) s3_storage = S3Storage(s3_config) - model_dst_key = "{}@{}@{}".format(user_id, model_name, str(uuid.uuid4())) + model_dst_key = "{}@{}@{}.zip".format(user_id, model_name, str(uuid.uuid4())) model_storage_url = s3_storage.upload_file_with_progress(model_zip_path, model_dst_key, show_progress=show_progress, out_progress_to_err=True, diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_db.py b/python/fedml/computing/scheduler/model_scheduler/device_model_db.py index 1f43f719f3..606d8c010b 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_db.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_db.py @@ -1,6 +1,7 @@ import json import logging import os +import platform import time from fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants @@ -9,6 +10,7 @@ from sqlalchemy.ext.declarative import declarative_base from fedml.core.common.singleton import Singleton from sqlalchemy.sql import text +from typing import List, Dict Base = declarative_base() @@ -41,9 +43,11 @@ def set_deployment_status(self, end_point_id, end_point_name, model_name, model_ self.set_deployment_results_info(end_point_id, end_point_name, model_name, model_version, device_id, deployment_status=deployment_status, replica_no=replica_no) - def get_deployment_result_list(self, end_point_id, end_point_name, model_name, model_version=None): + def get_deployment_result_list(self, end_point_id, end_point_name, model_name, model_version=None) -> List[str]: """ - query from sqlite db using e_id + Get the orm use get_deployment_results_info, + but (1) nested results with cache_device_id, cache_replica_no. + (2) return a list of json string, so that redis can store it. 
""" result_list = self.get_deployment_results_info(end_point_id, end_point_name, model_name, model_version) ret_result_list = list() @@ -54,6 +58,39 @@ def get_deployment_result_list(self, end_point_id, end_point_name, model_name, m ret_result_list.append(json.dumps(result_dict)) return ret_result_list + def get_all_deployment_results_list(self) -> List[Dict]: + """ + Similar to _get_all_deployment_results_info, + but return a list of json string, so that redis can store it. + + return a list of dict, for each item: + [ + { + "end_point_id": "", + "end_point_name": "", + "model_name":"", + "replica_res": "" # Json string + }, + ] + value in the dict is a string that contains the deployment result. + """ + flat_ep_list = self._get_all_deployment_results_info() + ret_result_list = list() + for result in flat_ep_list: + result_dict = { + "end_point_id": result.end_point_id, + "end_point_name": result.end_point_name, + "model_name": result.model_name, + "replica_info": json.dumps( + { + "cache_device_id": result.device_id, + "cache_replica_no": int(result.replica_no), + "result": result.deployment_result + } + ) + } + ret_result_list.append(result_dict) + return ret_result_list def get_deployment_status_list(self, end_point_id, end_point_name, model_name, model_version=None): result_list = self.get_deployment_results_info(end_point_id, end_point_name, model_name, model_version) @@ -155,7 +192,8 @@ def delete_deployment_run_info(self, end_point_id): end_point_id=f'{end_point_id}').delete() self.db_connection.commit() - def get_result_item_info(self, result_item): + @staticmethod + def get_result_item_info(result_item): result_item_json = json.loads(result_item) if isinstance(result_item_json, dict): result_item_json = json.loads(result_item) @@ -168,7 +206,8 @@ def get_result_item_info(self, result_item): result_payload = result_item_json["result"] return device_id, replica_no, result_payload - def get_status_item_info(self, status_item): + @staticmethod + def get_status_item_info(status_item): status_item_json = json.loads(status_item) if isinstance(status_item_json, dict): status_item_json = json.loads(status_item) @@ -261,7 +300,10 @@ def open_job_db(self): self.db_base_dir = ServerConstants.get_database_dir() job_db_path = os.path.join(self.db_base_dir, FedMLModelDatabase.MODEL_DEPLOYMENT_DB) - self.db_engine = create_engine('sqlite:////{}'.format(job_db_path), echo=False) + if platform.system() == "Windows": + self.db_engine = create_engine('sqlite:///{}'.format(job_db_path), echo=False) + else: + self.db_engine = create_engine('sqlite:////{}'.format(job_db_path), echo=False) db_session_class = sessionmaker(bind=self.db_engine) self.db_connection = db_session_class() @@ -316,6 +358,11 @@ def get_deployment_results_info(self, end_point_id, end_point_name, model_name, FedMLDeploymentResultInfoModel.model_version == f'{model_version}')).all() return result_info + def _get_all_deployment_results_info(self): + self.open_job_db() + result_info = self.db_connection.query(FedMLDeploymentResultInfoModel).all() + return result_info + def set_deployment_results_info(self, end_point_id, end_point_name, model_name, model_version, device_id, deployment_result=None, deployment_status=None, replica_no=None): diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py index 8670633eeb..baee7a2973 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py +++ 
b/python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py @@ -1,40 +1,32 @@ +import fedml + import logging import os -import pickle -import platform -import shutil import time import traceback import yaml import datetime +import docker import requests import torch import torch.nn -import tritonclient.http as http_client import collections.abc -import fedml from fedml.computing.scheduler.comm_utils import sys_utils, security_utils -from fedml.computing.scheduler.comm_utils.container_utils import ContainerUtils +from fedml.computing.scheduler.comm_utils.hardware_utils import HardwareUtil from fedml.computing.scheduler.comm_utils.job_utils import JobRunnerUtils - -for type_name in collections.abc.__all__: - setattr(collections, type_name, getattr(collections.abc, type_name)) - from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants -import io - -import docker -from ..scheduler_core.compute_cache_manager import ComputeCacheManager +from fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants +from fedml.computing.scheduler.model_scheduler.device_model_cache import FedMLModelCache from ..scheduler_core.compute_utils import ComputeUtils from ..comm_utils.container_utils import ContainerUtils - from .device_http_inference_protocol import FedMLHttpInference -from fedml.computing.scheduler.model_scheduler.device_model_cache import FedMLModelCache +for type_name in collections.abc.__all__: + setattr(collections, type_name, getattr(collections.abc, type_name)) no_real_gpu_allocation = None @@ -68,284 +60,158 @@ def request_gpu_ids_on_deployment(edge_id, end_point_id, num_gpus=None, master_d def start_deployment(end_point_id, end_point_name, model_id, model_version, model_storage_local_path, inference_model_name, inference_engine, infer_host, master_ip, edge_id, master_device_id=None, replica_rank=0, - gpu_per_replica=1): + gpu_per_replica=1, request_json=None): + if request_json is None: + request_json = dict() logging.info("[Worker] Model deployment is starting...") # Real gpu per replica (container-level) num_gpus = gpu_per_replica gpu_ids, gpu_attach_cmd = None, "" + # Concatenate the full model name running_model_name = ClientConstants.get_running_model_name( end_point_name, inference_model_name, model_version, end_point_id, model_id, edge_id=edge_id) - # Parse the model config file and get the necessary information for the deployment + # Parse the model config file model_config_path = os.path.join(model_storage_local_path, "fedml_model_config.yaml") with open(model_config_path, 'r') as file: config = yaml.safe_load(file) + inference_type = "default" # Resource related - use_gpu = config.get('use_gpu', True) - num_gpus_frm_yml = config.get('num_gpus', None) - if not use_gpu: - num_gpus = 0 - else: - if num_gpus_frm_yml is not None: - num_gpus = int(num_gpus_frm_yml) - usr_indicated_wait_time = config.get('deploy_timeout', 900) - usr_indicated_worker_port = config.get('worker_port', "") - if usr_indicated_worker_port == "": - usr_indicated_worker_port = os.environ.get("FEDML_WORKER_PORT", "") - shm_size = config.get('shm_size', None) - storage_opt = config.get('storage_opt', None) - tmpfs = config.get('tmpfs', None) - cpus = config.get('cpus', None) - if cpus is not None: - cpus = int(cpus) - memory = config.get('memory', None) - - if usr_indicated_worker_port == "": - usr_indicated_worker_port = None - else: - 
usr_indicated_worker_port = int(usr_indicated_worker_port) + use_gpu, num_gpus, shm_size, storage_opt, tmpfs, cpus, memory, port_inside_container = \ + parse_resource_related_config(config, gpu_per_replica) - worker_port_env = os.environ.get("FEDML_WORKER_PORT", "") - worker_port_from_config = config.get('worker_port', "") - logging.info(f"usr_indicated_worker_port {usr_indicated_worker_port}, worker port env {worker_port_env}, " - f"worker port from config {worker_port_from_config}") + # Image related + inference_image_name, image_pull_policy, registry_name, registry_provider, \ + registry_user_name, registry_user_password = parse_image_registry_related_config(config) - usr_indicated_retry_cnt = max(int(usr_indicated_wait_time) // 10, 1) - inference_image_name = config.get('inference_image_name', - ClientConstants.INFERENCE_SERVER_CUSTOME_IMAGE) - image_pull_policy = config.get('image_pull_policy', SchedulerConstants.IMAGE_PULL_POLICY_IF_NOT_PRESENT) + # Service app related + dst_bootstrap_dir, dst_model_serving_dir, relative_entry_fedml_format, expose_subdomains, \ + customized_image_entry_cmd, customized_readiness_check, customized_liveliness_check, customized_uri = \ + handle_container_service_app(config, model_storage_local_path) - # Source code dir, bootstrap dir, data cache dir + # Storage related src_code_dir = os.path.join(model_storage_local_path, config.get('source_code_dir', "")) - - # Get the bootstrap and job commands inside the yaml file - bootstrap_cmds_str_frm_yaml = config.get('bootstrap', "") - job_cmds_str_frm_yaml = config.get('job', "") - - if bootstrap_cmds_str_frm_yaml != "" or job_cmds_str_frm_yaml != "": - auto_gen_bootstrap_file_name = "fedml-deploy-bootstrap-entry-auto-gen.sh" - src_bootstrap_file_path = os.path.join(model_storage_local_path, auto_gen_bootstrap_file_name) - with open(src_bootstrap_file_path, 'w') as f: - f.write("cd /home/fedml/models_serving/\n") - f.write(bootstrap_cmds_str_frm_yaml) - f.write("\n") - f.write("cd /home/fedml/models_serving/\n") - f.write(job_cmds_str_frm_yaml) - else: - src_bootstrap_file_path = "" - data_cache_dir_input = config.get('data_cache_dir', "") - request_input_example = config.get('request_input_example', None) - extra_envs = config.get('environment_variables', None) - - # Serving dir inside docker - dst_model_serving_dir = "/home/fedml/models_serving" - relative_entry = config.get('entry_point') - if src_bootstrap_file_path != "": - dst_bootstrap_dir = os.path.join(dst_model_serving_dir, auto_gen_bootstrap_file_name) - else: - dst_bootstrap_dir = "" + usr_customized_workspace_dst = config.get(ClientConstants.CUSTOMIZED_WORKSPACE_MOUNT_PATH_KEY, "") - # If using customized image, then bootstrap + job will be the entry point - enable_custom_image = config.get("enable_custom_image", False) - customized_image_entry_cmd = \ - "/bin/bash /home/fedml/models_serving/fedml-deploy-bootstrap-entry-auto-gen.sh" - - docker_registry_user_name = config.get("docker_registry_user_name", "") - docker_registry_user_password = config.get("docker_registry_user_password", "") - docker_registry = config.get("docker_registry", "") - - port_inside_container = int(config.get("port_inside_container", 2345)) - use_triton = config.get("use_triton", False) - if use_triton: - inference_type = "triton" - else: - inference_type = "default" + # Others + extra_envs = config.get('environment_variables', None) + usr_indicated_wait_time = config.get(ClientConstants.DEPLOY_TIMEOUT_SEC_KEY, + config.get("deploy_timeout", 
ClientConstants.DEPLOY_TIMEOUT_SEC_DEFAULT)) + usr_indicated_retry_cnt = max(int(usr_indicated_wait_time) // 10, 1) + request_input_example = config.get('request_input_example', None) - # Config check - if src_code_dir == "": - raise Exception("Please indicate source_code_dir in the fedml_model_config.yaml") - if relative_entry == "": - logging.warning("You missed main_entry in the fedml_model_config.yaml") + # Parameter's check + if inference_engine != ClientConstants.INFERENCE_ENGINE_TYPE_INT_DEFAULT: + raise Exception(f"inference engine {inference_engine} is not supported") - # Request the GPU ids for the deployment + # Request the GPU if num_gpus > 0: gpu_ids, gpu_attach_cmd = request_gpu_ids_on_deployment( edge_id, end_point_id, num_gpus=num_gpus, master_device_id=master_device_id) - - # set replica and their gpu ids FedMLModelCache.get_instance().set_redis_params() FedMLModelCache.get_instance().set_replica_gpu_ids( end_point_id, end_point_name, inference_model_name, edge_id, replica_rank+1, gpu_ids) logging.info("GPU ids allocated: {}".format(gpu_ids)) + # Create the model serving dir if not exists model_serving_dir = ClientConstants.get_model_serving_dir() if not os.path.exists(model_serving_dir): os.makedirs(model_serving_dir, exist_ok=True) - converted_model_path = os.path.join(model_storage_local_path, ClientConstants.FEDML_CONVERTED_MODEL_DIR_NAME) - if os.path.exists(converted_model_path): - model_file_list = os.listdir(converted_model_path) - for model_file in model_file_list: - src_model_file = os.path.join(converted_model_path, model_file) - dst_model_file = os.path.join(model_serving_dir, model_file) - if os.path.isdir(src_model_file): - if not os.path.exists(dst_model_file): - shutil.copytree(src_model_file, dst_model_file, copy_function=shutil.copy, - ignore_dangling_symlinks=True) - else: - if not os.path.exists(dst_model_file): - shutil.copyfile(src_model_file, dst_model_file) - - if inference_engine != ClientConstants.INFERENCE_ENGINE_TYPE_INT_DEFAULT: - raise Exception(f"inference engine {inference_engine} is not supported") - # Get the master device id - logging.info(f"master ip: {master_ip}, worker ip: {infer_host}") + # Determine whether to report public ip or localhost if infer_host == master_ip: logging.info("infer_host is the same as master ip, will use 127.0.0.1 to avoid firewall issue") infer_host = "127.0.0.1" + else: + logging.info("Master and worker are located in different machines, will use the public ip for inference") + # Init container interface client try: client = docker.from_env() - if enable_custom_image and docker_registry_user_name != "" and docker_registry_user_password != "" \ - and docker_registry != "": - client.login(username=docker_registry_user_name, password=docker_registry_user_password, - registry=docker_registry) + if registry_provider == "Docker" and registry_user_name != "" and registry_user_password != "" \ + and registry_name != "": + client.login(username=registry_user_name, password=registry_user_password, + registry=registry_name) except Exception: logging.error("Failed to connect to the docker daemon, please ensure that you have " "installed Docker Desktop or Docker Engine, and the docker is running") return "", "", None, None, None + # Pull the inference image + logging.info(f"Start pulling the inference image {inference_image_name}... 
with policy {image_pull_policy}") + ContainerUtils.get_instance().pull_image_with_policy(image_pull_policy, inference_image_name) + + # Remove if the container exists container_prefix = ("{}".format(ClientConstants.FEDML_DEFAULT_SERVER_CONTAINER_NAME_PREFIX) + "__" + security_utils.get_content_hash(running_model_name)) - default_server_container_name = container_prefix + "__" + str(replica_rank) - try: exist_container_obj = client.containers.get(default_server_container_name) except docker.errors.NotFound: exist_container_obj = None except docker.errors.APIError: raise Exception("Failed to get the container object") - # Allocate the GPU # TODO: Make sure no competition for each replica in a single deployment if exist_container_obj is not None: client.api.remove_container(exist_container_obj.id, v=True, force=True) - device_requests = [] - if no_real_gpu_allocation is not None: - use_gpu = not no_real_gpu_allocation - if use_gpu: - logging.info("Number of GPUs: {}".format(num_gpus)) - if gpu_ids is not None: - gpu_id_list = map(lambda x: str(x), gpu_ids) - device_requests.append( - docker.types.DeviceRequest(device_ids=list(gpu_id_list), capabilities=[['gpu']])) - else: - device_requests.append( - docker.types.DeviceRequest(count=num_gpus, capabilities=[['gpu']])) - logging.info(f"device_requests: {device_requests}") - # Pull the inference image - logging.info(f"Start pulling the inference image {inference_image_name}... with policy {image_pull_policy}") - ContainerUtils.get_instance().pull_image_with_policy(image_pull_policy, inference_image_name) - - volumns = [] + # Build host config + volumes = [] binds = {} environment = {} - # data_cache_dir mounting - assert type(data_cache_dir_input) == dict or type(data_cache_dir_input) == str - if type(data_cache_dir_input) == str: - # In this case, we mount to the same folder, if it has ~, we replace it with /home/fedml - src_data_cache_dir, dst_data_cache_dir = "", "" - if data_cache_dir_input != "": - if data_cache_dir_input[0] == "~": - src_data_cache_dir = os.path.expanduser(data_cache_dir_input) - dst_data_cache_dir = data_cache_dir_input.replace("~", "/home/fedml") - else: - # check if the data_cache_dir is a relative path - if data_cache_dir_input[0] != "/": - raise "data_cache_dir_input has to be an absolute path or start with ~" - else: - src_data_cache_dir = data_cache_dir_input - dst_data_cache_dir = data_cache_dir_input - logging.info(f"src_data_cache_dir: {src_data_cache_dir}, dst_data_cache_dir: {dst_data_cache_dir}") + # Handle the union volume mount + _handle_union_volume_mount(binds, volumes, environment, data_cache_dir_input) + + # Handle the default volume mount + handle_volume_mount(volumes, binds, environment, relative_entry_fedml_format, src_code_dir, + dst_model_serving_dir, usr_customized_workspace_dst) + + # Host config + host_config_dict = { + "binds": binds, + "port_bindings": { + port_inside_container: None + }, + "shm_size": shm_size, + "storage_opt": storage_opt, + "tmpfs": tmpfs, + "cpu_count": cpus, + "mem_limit": memory + } + + device_mapping = {} + if no_real_gpu_allocation is not None: + use_gpu = not no_real_gpu_allocation + if use_gpu: + logging.info("Number of GPUs: {}".format(num_gpus)) + device_mapping = HardwareUtil.get_docker_gpu_device_mapping(gpu_ids, num_gpus) + logging.info(f"device_mapping: {device_mapping}") - if type(src_data_cache_dir) == str and src_data_cache_dir != "": - logging.info("Start copying the data cache to the container...") - if os.path.exists(src_data_cache_dir): - 
volumns.append(src_data_cache_dir) - binds[src_data_cache_dir] = { - "bind": dst_data_cache_dir, - "mode": "rw" - } - environment["DATA_CACHE_FOLDER"] = dst_data_cache_dir - else: - for k, v in data_cache_dir_input.items(): - if os.path.exists(k): - volumns.append(v) - binds[k] = { - "bind": v, - "mode": "rw" - } - else: - logging.warning(f"{k} does not exist, skip mounting it to the container") - logging.info(f"Data cache mount: {volumns}, {binds}") - - # Default mounting - if not enable_custom_image or (enable_custom_image and relative_entry != ""): - logging.info("Start copying the source code to the container...") - volumns.append(src_code_dir) - binds[src_code_dir] = { - "bind": dst_model_serving_dir, - "mode": "rw" - } - environment["MAIN_ENTRY"] = relative_entry - - # Environment variables - if not enable_custom_image: - # For some image, the default user is root. Unified to fedml. - environment["HOME"] = "/home/fedml" + if device_mapping: + host_config_dict.update(device_mapping) - environment["BOOTSTRAP_DIR"] = dst_bootstrap_dir - environment["FEDML_CURRENT_RUN_ID"] = end_point_id - environment["FEDML_CURRENT_EDGE_ID"] = edge_id - environment["FEDML_REPLICA_RANK"] = replica_rank - environment["FEDML_CURRENT_VERSION"] = fedml.get_env_version() - environment["FEDML_ENV_VERSION"] = fedml.get_env_version() - environment["FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST"] = fedml.get_local_on_premise_platform_host() - environment["FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_PORT"] = fedml.get_local_on_premise_platform_port() - - if extra_envs is not None: - for key in extra_envs: - environment[key] = extra_envs[key] + # Handle the environment variables + handle_env_vars(environment, relative_entry_fedml_format, extra_envs, dst_bootstrap_dir, + end_point_id, edge_id, replica_rank, request_json) + # Create the container try: + host_config = client.api.create_host_config(**host_config_dict) new_container = client.api.create_container( image=inference_image_name, name=default_server_container_name, - volumes=volumns, + volumes=volumes, ports=[port_inside_container], # port open inside the container environment=environment, - host_config=client.api.create_host_config( - binds=binds, - port_bindings={ - port_inside_container: usr_indicated_worker_port # Could be either None or a port number - }, - device_requests=device_requests, - shm_size=shm_size, - storage_opt=storage_opt, - tmpfs=tmpfs, - cpu_count=cpus, - mem_limit=memory, - ), + host_config=host_config, detach=True, - command=customized_image_entry_cmd if enable_custom_image else None, - entrypoint=customized_image_entry_cmd if enable_custom_image else None + command=customized_image_entry_cmd, ) client.api.start(container=new_container.get("Id")) except Exception as e: @@ -357,88 +223,56 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version, while True: cnt += 1 try: - if usr_indicated_worker_port is not None: - inference_http_port = usr_indicated_worker_port - break - else: - # Find the random port - port_info = client.api.port(new_container.get("Id"), port_inside_container) - inference_http_port = port_info[0]["HostPort"] - logging.info("inference_http_port: {}".format(inference_http_port)) - break + # Find the random port + port_info = client.api.port(new_container.get("Id"), port_inside_container) + inference_http_port = port_info[0]["HostPort"] + logging.info("host port allocated: {}".format(inference_http_port)) + break except: if cnt >= 5: raise Exception("Failed to get the port allocation") time.sleep(3) - # Logging 
the info from the container when starting - log_deployment_result(end_point_id, model_id, default_server_container_name, + # Logging the info from the container when initializing + log_deployment_output(end_point_id, model_id, default_server_container_name, ClientConstants.CMD_TYPE_RUN_DEFAULT_SERVER, inference_model_name, inference_engine, inference_http_port, inference_type, retry_interval=10, deploy_attempt_threshold=usr_indicated_retry_cnt, request_input_example=request_input_example, infer_host=infer_host, - enable_custom_image=enable_custom_image) + readiness_check=customized_readiness_check) # Return the running model name and the inference output url inference_output_url, running_model_version, ret_model_metadata, ret_model_config = \ - get_model_info(inference_model_name, inference_engine, inference_http_port, - infer_host, False, inference_type, request_input_example=request_input_example, - enable_custom_image=enable_custom_image) + check_container_readiness(inference_http_port=inference_http_port, infer_host=infer_host, + readiness_check=customized_readiness_check, + request_input_example=request_input_example, + customized_uri=customized_uri) if inference_output_url == "": return running_model_name, "", None, None, None # Successfully get the result from the container model_metadata = ret_model_metadata + model_metadata["liveliness_check"] = customized_liveliness_check + model_metadata["readiness_check"] = customized_readiness_check + model_metadata[ClientConstants.EXPOSE_SUBDOMAINS_KEY] = expose_subdomains logging.info(f"[Worker][Replica{replica_rank}] Model deployment is successful with inference_output_url: " f"{inference_output_url}, model_metadata: {model_metadata}, model_config: {ret_model_config}") return running_model_name, inference_output_url, model_version, model_metadata, ret_model_config -def build_inference_req(end_point_name, model_name, token, in_model_metadata): - model_inputs = in_model_metadata["inputs"] - ret_inputs = list() - - for input_item in model_inputs: - ret_item = input_item - shape = ret_item["shape"] - data_type = ret_item["datatype"] - if ClientConstants.MODEL_DATA_TYPE_MAPPING[data_type] == ClientConstants.MODEL_DATA_TYPE_INT: - for i in range(len(shape)): - if shape[i] == -1: # if input shape is dynamic, we set a default value 1 - shape[i] = 1 - ret_item["data"] = torch.randint(0, 1, shape).tolist() - else: - for i in range(len(shape)): - if shape[i] == -1: # if input shape is dynamic, we set a default value 1 - shape[i] = 1 - ret_item["data"] = torch.zeros(shape).tolist() - ret_inputs.append(ret_item) - - input_json = {"end_point_name": end_point_name, - "model_name": model_name, - "token": str(token), - "inputs": ret_inputs, - "outputs": in_model_metadata["outputs"]} - output_json = in_model_metadata["outputs"] - - return input_json, output_json - - def should_exit_logs(end_point_id, model_id, cmd_type, model_name, inference_engine, inference_port, inference_type="default", request_input_example=None, infer_host="127.0.0.1", - enable_custom_image=False): + readiness_check=ClientConstants.READINESS_PROBE_DEFAULT): if cmd_type == ClientConstants.CMD_TYPE_RUN_DEFAULT_SERVER: # TODO: Exited Quickly if the container is Exited or Removed # If the container has exited, return True, means we should exit the logs - # container_name = "{}".format(ClientConstants.FEDML_DEFAULT_SERVER_CONTAINER_NAME_PREFIX) + "__" + \ - # security_utils.get_content_hash(model_name) try: inference_output_url, model_version, model_metadata, model_config = \ - 
get_model_info(model_name, inference_engine, inference_port, infer_host, - inference_type=inference_type, request_input_example=request_input_example, - enable_custom_image=enable_custom_image) + check_container_readiness(inference_http_port=inference_port, infer_host=infer_host, + readiness_check=readiness_check, + request_input_example=request_input_example) if inference_output_url != "": logging.info("Log test for deploying model successfully, inference url: {}, " "model metadata: {}, model config: {}". @@ -453,12 +287,12 @@ def should_exit_logs(end_point_id, model_id, cmd_type, model_name, inference_eng return False -def log_deployment_result(end_point_id, model_id, cmd_container_name, cmd_type, +def log_deployment_output(end_point_id, model_id, cmd_container_name, cmd_type, inference_model_name, inference_engine, inference_http_port, inference_type="default", retry_interval=10, deploy_attempt_threshold=10, request_input_example=None, infer_host="127.0.0.1", - enable_custom_image=False): + readiness_check=ClientConstants.READINESS_PROBE_DEFAULT): deploy_attempt = 0 last_log_time = datetime.datetime.now() @@ -507,14 +341,29 @@ def log_deployment_result(end_point_id, model_id, cmd_container_name, cmd_type, if container_obj.status == "exited": logging.info("Container {} has exited, automatically remove it".format(cmd_container_name)) + + # Save the failed log into ~/.fedml/fedml-model-client/fedml/logs/failed_logs/ + # $run_id/$container_name.log + try: + parent_dir = os.path.join(ClientConstants.get_deploy_failed_log_dir()) + os.makedirs(parent_dir, exist_ok=True) + error_logs_dir = os.path.join(ClientConstants.get_deploy_failed_log_dir(), str(end_point_id)) + os.makedirs(error_logs_dir, exist_ok=True) + error_log_file = os.path.join(error_logs_dir, f"{cmd_container_name}.log") + with open(error_log_file, "w") as f: + f.write(f"Container {cmd_container_name} has exited\n") + f.write(f"Error logs: {err_logs}\n") + f.write(f"Output logs: {out_logs}\n") + except Exception as e: + logging.error(f"Failed to save the error logs with exception {e}") + client.api.remove_container(container_obj.id, v=True, force=True) break - # should_exit_logs will ping the inference container - # return True if ready + # should_exit_logs will ping the inference container, return True if ready if should_exit_logs(end_point_id, model_id, cmd_type, inference_model_name, inference_engine, inference_http_port, inference_type, request_input_example, - infer_host, enable_custom_image=enable_custom_image): + infer_host, readiness_check=readiness_check): break # Not yet ready, retry @@ -536,12 +385,63 @@ def log_deployment_result(end_point_id, model_id, cmd_container_name, cmd_type, time.sleep(retry_interval) -def is_client_inference_container_ready(infer_url_host, inference_http_port, inference_model_name, local_infer_url, - inference_type="default", model_version="", request_input_example=None): - # logging.info(f"Inference type: {inference_type}, infer_url_host {infer_url_host}, \ - # inference_http_port: {inference_http_port}, local_infer_url {local_infer_url}") +def parse_resource_related_config(config, gpu_num_frm_platform=0): + use_gpu = config.get('use_gpu', True) + num_gpus_frm_yml = config.get('num_gpus', None) + + num_gpus = gpu_num_frm_platform + # Priority: num_gpus from yaml > num_gpus from platform + if use_gpu: + if num_gpus_frm_yml is not None: + num_gpus = int(num_gpus_frm_yml) + else: + num_gpus = 0 + + shm_size = config.get('shm_size', None) + # set shm_size to 8G if not specified + if not shm_size: 
+ shm_size = "8G" + + storage_opt = config.get('storage_opt', None) + tmpfs = config.get('tmpfs', None) + cpus = config.get('cpus', None) + if cpus is not None: + cpus = int(cpus) + memory = config.get('memory', None) + port_inside_container = int(config.get("port", 2345)) + + return use_gpu, num_gpus, shm_size, storage_opt, tmpfs, cpus, memory, port_inside_container + + +def parse_image_registry_related_config(config): + inference_image_name = config.get('inference_image_name', ClientConstants.INFERENCE_SERVER_CUSTOME_IMAGE) + image_pull_policy = config.get('image_pull_policy', SchedulerConstants.IMAGE_PULL_POLICY_IF_NOT_PRESENT) + + # Optional + registry_specs = config.get('registry_specs', {}) + registry_name = registry_specs.get("docker_registry_user_name", "") + registry_provider = registry_specs.get("registry_provider", "") + registry_user_name = config.get("registry_user_name", "") + registry_user_password = config.get("registry_user_password", "") + + return (inference_image_name, image_pull_policy, registry_name, registry_provider, + registry_user_name, registry_user_password) + + +def is_client_inference_container_ready(infer_url_host, inference_http_port, + readiness_check=ClientConstants.READINESS_PROBE_DEFAULT, + request_input_example=None, container_id=None, customized_uri=None): + # Construct the model metadata (input and output) + model_metadata = {} + if request_input_example is not None and len(request_input_example) > 0: + model_metadata["inputs"] = request_input_example + else: + model_metadata["inputs"] = {"text": "What is a good cure for hiccups?"} + model_metadata["outputs"] = [] + model_metadata["type"] = "default" - if inference_type == "default": + # Check the readiness of the container + if readiness_check == ClientConstants.READINESS_PROBE_DEFAULT: default_client_container_ready_url = "http://{}:{}/ready".format("0.0.0.0", inference_http_port) response = None try: @@ -551,275 +451,238 @@ def is_client_inference_container_ready(infer_url_host, inference_http_port, inf if not response or response.status_code != 200: return "", "", {}, {} - # Report the deployed model info - model_metadata = {} - if request_input_example is not None and len(request_input_example) > 0: - model_metadata["inputs"] = request_input_example - else: - model_metadata["inputs"] = {"text": "What is a good cure for hiccups?"} - model_metadata["outputs"] = [] - model_metadata["type"] = "default" return "http://{}:{}/predict".format(infer_url_host, inference_http_port), None, model_metadata, None else: - triton_server_url = "{}:{}".format(infer_url_host, inference_http_port) - if model_version == "" or model_version is None: - model_version = ClientConstants.INFERENCE_MODEL_VERSION - logging.info( - f"triton_server_url: {triton_server_url} model_version: {model_version} model_name: {inference_model_name}") - triton_client = http_client.InferenceServerClient(url=triton_server_url, verbose=False) - if not triton_client.is_model_ready( - model_name=inference_model_name, model_version=model_version - ): - return "", model_version, {}, {} - logging.info(f"Model {inference_model_name} is ready, start to get model metadata...") - model_metadata = triton_client.get_model_metadata(model_name=inference_model_name, model_version=model_version) - model_config = triton_client.get_model_config(model_name=inference_model_name, model_version=model_version) - version_list = model_metadata.get("versions", None) - if version_list is not None and len(version_list) > 0: - model_version = version_list[0] - else: - 
model_version = ClientConstants.INFERENCE_MODEL_VERSION + if not isinstance(readiness_check, dict): + logging.error(f"Unknown readiness check type: {readiness_check}") + return "", "", {}, {} - inference_output_url = "http://{}:{}/{}/models/{}/versions/{}/infer".format(infer_url_host, - inference_http_port, - ClientConstants.INFERENCE_INFERENCE_SERVER_VERSION, - inference_model_name, - model_version) + if "httpGet" in readiness_check: + if "path" in readiness_check["httpGet"]: + check_path = readiness_check["httpGet"]["path"] + if not isinstance(check_path, str): + logging.error(f"Invalid path type: {check_path}, expected str") + return "", "", {}, {} + else: + if not check_path.startswith("/"): + check_path = "/" + check_path + response = None + try: + response = requests.get(f"http://{infer_url_host}:{inference_http_port}{check_path}") + except: + pass + if not response or response.status_code != 200: + return "", "", {}, {} + else: + logging.error("'path' is not specified in httpGet readiness check") + return "", "", {}, {} + elif "exec" in readiness_check: + # TODO(raphael): Support arbitrary readiness check command by using container id and docker exec + pass + else: + # Ref K8S, if no readiness check, we assume the container is ready immediately + pass - return inference_output_url, model_version, model_metadata, model_config + # Construct the customized URI + path = "" + if customized_uri is not None: + if "httpPost" in customized_uri and "path" in customized_uri["httpPost"]: + path = customized_uri["httpPost"]["path"] + if not isinstance(path, str): + logging.error(f"Invalid path type: {path}, expected str") + return "", "", {}, {} + else: + if not path.startswith("/"): + path = "/" + path + # TODO(raphael): Finalized more customized URI types + readiness_check_url = f"http://{infer_url_host}:{inference_http_port}{path}" + return readiness_check_url, None, model_metadata, None -def get_model_info(model_name, inference_engine, inference_http_port, infer_host="127.0.0.1", is_hg_model=False, - inference_type="default", request_input_example=None, enable_custom_image=False): - if model_name is None: - return "", "", {}, {} - local_infer_url = "{}:{}".format(infer_host, inference_http_port) +def _handle_union_volume_mount(binds, volumes, environment, data_cache_dir_input=None): + """ + Private: data_cache_dir is the union folder on host machine, which will be shard across different containers, + the control of this folder should be handled by the platform. 
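The readiness handling added above accepts a Kubernetes-style probe spec and, for the `httpGet` form, normalizes the path and polls the container's mapped port until it answers 200. The snippet below is a simplified standalone sketch of that idea under stated assumptions: it only covers the `httpGet` form, treats a missing probe as "ready" as the code above does, and the helper name, host and port are made up for illustration.

```python
# Simplified sketch of a K8s-style `httpGet` readiness probe check against a
# locally mapped container port. Helper name, host and port are assumptions.
import requests


def http_get_probe_ready(host: str, port: int, readiness_check: dict, timeout_s: float = 3.0) -> bool:
    http_get = readiness_check.get("httpGet")
    if http_get is None:
        # No probe configured: assume the container is ready, as above.
        return True
    path = http_get.get("path")
    if not isinstance(path, str):
        # An httpGet probe without a usable path is treated as "not ready".
        return False
    if not path.startswith("/"):
        path = "/" + path
    try:
        response = requests.get(f"http://{host}:{port}{path}", timeout=timeout_s)
    except requests.RequestException:
        return False
    return response.status_code == 200


if __name__ == "__main__":
    probe = {"httpGet": {"path": "ready"}}  # the leading slash is added automatically
    print(http_get_probe_ready("127.0.0.1", 2345, probe))
```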
+ """ + if isinstance(data_cache_dir_input, str): + # In this case, we mount to the same folder, if it has ~, we replace it with /home/fedml + if data_cache_dir_input != "": + if data_cache_dir_input[0] == "~": + src_data_cache_dir = os.path.expanduser(data_cache_dir_input) + dst_data_cache_dir = data_cache_dir_input.replace("~", "/home/fedml") + else: + # check if the data_cache_dir is a relative path + if data_cache_dir_input[0] != "/": + raise "data_cache_dir_input has to be an absolute path or start with ~" + else: + src_data_cache_dir = data_cache_dir_input + dst_data_cache_dir = data_cache_dir_input + logging.info(f"src_data_cache_dir: {src_data_cache_dir}, dst_data_cache_dir: {dst_data_cache_dir}") - if is_hg_model: - inference_model_name = "{}_{}_inference".format(model_name, str(inference_engine)) + if isinstance(src_data_cache_dir, str) and src_data_cache_dir != "": + logging.info("Start copying the data cache to the container...") + if os.path.exists(src_data_cache_dir): + volumes.append(src_data_cache_dir) + binds[src_data_cache_dir] = { + "bind": dst_data_cache_dir, + "mode": "rw" + } + environment["DATA_CACHE_FOLDER"] = dst_data_cache_dir + elif isinstance(data_cache_dir_input, dict): + for k, v in data_cache_dir_input.items(): + if os.path.exists(k): + volumes.append(v) + binds[k] = { + "bind": v, + "mode": "rw" + } + else: + logging.warning(f"{k} does not exist, skip mounting it to the container") + logging.info(f"Data cache mount: {volumes}, {binds}") else: - inference_model_name = model_name - - response_from_client_container = is_client_inference_container_ready( - infer_host, inference_http_port, inference_model_name, local_infer_url, - inference_type, model_version="", request_input_example=request_input_example) - - return response_from_client_container + logging.info("data_cache_dir_input is not a string or a dictionary, skip mounting it to the container") + + +def handle_volume_mount(volumes, binds, environment, relative_entry_fedml_format="", src_code_dir="", + dst_model_serving_dir="", usr_customized_workspace_dst=""): + # If fedml format entry point is specified, inject the source code, e.g., main.py (FedMLPredictor inside) + volumes.append(src_code_dir) + dst_mount_dir = dst_model_serving_dir + + if usr_customized_workspace_dst != "" and relative_entry_fedml_format == "": + # We only allow user to indicate the workspace mount rule when they are using the custom image + dst_mount_dir = usr_customized_workspace_dst + + binds[src_code_dir] = { + "bind": dst_mount_dir, + "mode": "rw" + } + + logging.info(f"Mounting the source code to the container..., target: {dst_mount_dir}") + + if relative_entry_fedml_format != "": + environment["MAIN_ENTRY"] = relative_entry_fedml_format + + +def handle_container_service_app(config, model_storage_local_path): + # Bootstrap, job and entrypoint related + dst_model_serving_dir = config.get(ClientConstants.CUSTOMIZED_WORKSPACE_MOUNT_PATH_KEY, + "/home/fedml/models_serving") + bootstrap_cmds_str_frm_yaml = config.get('bootstrap', "") + job_cmds_str_frm_yaml = config.get('job', "") + + auto_gen_bootstrap_file_name = "fedml-deploy-bootstrap-entry-auto-gen.sh" + if bootstrap_cmds_str_frm_yaml != "" or job_cmds_str_frm_yaml != "": + src_bootstrap_file_path = os.path.join(model_storage_local_path, auto_gen_bootstrap_file_name) + with open(src_bootstrap_file_path, 'w') as f: + f.write(f"cd {dst_model_serving_dir}/\n") + f.write(bootstrap_cmds_str_frm_yaml) + f.write("\n") + f.write(f"cd {dst_model_serving_dir}/\n") + 
f.write(job_cmds_str_frm_yaml) + else: + src_bootstrap_file_path = "" + if src_bootstrap_file_path != "": + dst_bootstrap_dir = os.path.join(dst_model_serving_dir, auto_gen_bootstrap_file_name) -def run_http_inference_with_curl_request(inference_url, inference_input_list, inference_output_list, - inference_type="default", engine_type="default", timeout=None): - return FedMLHttpInference.run_http_inference_with_curl_request( - inference_url, inference_input_list, inference_output_list, - inference_type=inference_type, engine_type=engine_type, timeout=timeout) + # User could specify "workspace_mount_path", override the default path + if ClientConstants.CUSTOMIZED_WORKSPACE_MOUNT_PATH_KEY in config: + dst_bootstrap_dir = os.path.join(config[ClientConstants.CUSTOMIZED_WORKSPACE_MOUNT_PATH_KEY], + auto_gen_bootstrap_file_name) + else: + dst_bootstrap_dir = "" + + # If the entry point is in fedml format (e.g., "main.py") + relative_entry_fedml_format = config.get('entry_point', "") + + # User indicate either fedml format python main entry filename or entry command + expose_subdomains = config.get(ClientConstants.EXPOSE_SUBDOMAINS_KEY, False) + customized_image_entry_cmd = config.get('container_run_command', None) # Could be str or list + + if customized_image_entry_cmd is not None and relative_entry_fedml_format != "": + logging.warning("Both entry_point and container_run_command are specified, " + "entry_point will be ignored") + relative_entry_fedml_format = "" + + # if usr indicates both the customized_image_entry_cmd and bootstrap, we will inject the bootstrap into the entry + # Using /bin/bash to run the bootstrap script, there are three legal formats for the customized_image_entry_cmd + # However, only the third one is supported in this function + """ + 1. CMD ["executable","param1","param2"] (exec form) + e.g. + CMD ["python3", "/app/app.py", "--port", "8080"] + + 2. CMD ["param1","param2"] (exec form, as default parameters to ENTRYPOINT) + e.g. + ENTRYPOINT ["nginx"] + CMD ["-g", "daemon off;"] + + 3. CMD command param1 param2 + e.g. 
+ echo "Container is running" && curl http://example.com + """ + if dst_bootstrap_dir != "" and customized_image_entry_cmd is not None: + if isinstance(customized_image_entry_cmd, str): + if customized_image_entry_cmd == "": + # We do not know the original CMD in the Dockerfile and do not want to overwrite it + pass + else: + # TODO(Raphael): Try to fix the compatibility issue with the first two formats and + # also the restriction of /bin/bash + customized_image_entry_cmd = \ + f"/bin/bash -c '/bin/bash {dst_bootstrap_dir} && {customized_image_entry_cmd}'" + else: + logging.warning("The customized_image_entry_cmd is not a string, skip injecting the bootstrap script") + customized_readiness_check = config.get('readiness_probe', ClientConstants.READINESS_PROBE_DEFAULT) + customized_liveliness_check = config.get('liveness_probe', ClientConstants.LIVENESS_PROBE_DEFAULT) + customized_uri = config.get(ClientConstants.CUSTOMIZED_SERVICE_KEY, "") -def convert_model_to_onnx( - torch_model, output_path: str, dummy_input_list, input_size: int, input_is_tensor=True -) -> None: - from collections import OrderedDict - import torch - from torch.onnx import TrainingMode - - torch.onnx.export(torch_model, # model being run - dummy_input_list if input_is_tensor else tuple(dummy_input_list), - # model input (or a tuple for multiple inputs) - f=output_path, # where to save the model (can be a file or file-like object) - export_params=True, # store the trained parameter weights inside the model file - opset_version=11, # the ONNX version to export the model to - do_constant_folding=False, # whether to execute constant folding for optimization - input_names=["input1", "input2"], - # the model's input names - output_names=['output'], # the model's output names - training=TrainingMode.EVAL, - verbose=True, - dynamic_axes={"input1": {0: "batch_size"}, - "input2": {0: "batch_size"}, - "output": {0: "batch_size"}} - ) - - -def test_start_triton_server(model_serving_dir): - sudo_prefix = "sudo " - sys_name = platform.system() - if sys_name == "Darwin": - sudo_prefix = "" - gpu_attach_cmd = "" + return (dst_bootstrap_dir, dst_model_serving_dir, relative_entry_fedml_format, expose_subdomains, + customized_image_entry_cmd, customized_readiness_check, customized_liveliness_check, customized_uri) - triton_server_container_name = "{}".format(ClientConstants.FEDML_TRITON_SERVER_CONTAINER_NAME_PREFIX) - triton_server_cmd = "{}docker stop {}; {}docker rm {}; {}docker run --name {} {} -p{}:8000 " \ - "-p{}:8001 -p{}:8002 " \ - "--shm-size {} " \ - "-v {}:/models {} " \ - "bash -c \"pip install transformers && tritonserver --strict-model-config=false " \ - "--model-control-mode=poll --repository-poll-secs={} " \ - "--model-repository=/models\" ".format(sudo_prefix, triton_server_container_name, - sudo_prefix, triton_server_container_name, - sudo_prefix, triton_server_container_name, - gpu_attach_cmd, - ClientConstants.INFERENCE_HTTP_PORT, - ClientConstants.INFERENCE_GRPC_PORT, - 8002, - "4096m", - model_serving_dir, - ClientConstants.INFERENCE_SERVER_IMAGE, - ClientConstants.FEDML_MODEL_SERVING_REPO_SCAN_INTERVAL) - logging.info("Run triton inference server: {}".format(triton_server_cmd)) - triton_server_process = ClientConstants.exec_console_with_script(triton_server_cmd, - should_capture_stdout=False, - should_capture_stderr=False, - no_sys_out_err=True) - - -def test_convert_pytorch_model_to_onnx(model_net_file, model_bin_file, model_name, model_in_params): - torch_model = torch.jit.load(model_net_file) - with 
open(model_bin_file, 'rb') as model_pkl_file: - model_state_dict = pickle.load(model_pkl_file) - torch_model.load_state_dict(model_state_dict) - torch_model.eval() - - input_size = model_in_params["input_size"] - input_types = model_in_params["input_types"] - - dummy_input_list = [] - for index, input_i in enumerate(input_size): - if input_types[index] == "int": - this_input = torch.tensor(torch.randint(0, 1, input_i)) - else: - this_input = torch.tensor(torch.zeros(input_i)) - dummy_input_list.append(this_input) - onnx_model_dir = os.path.join(ClientConstants.get_model_cache_dir(), - ClientConstants.FEDML_CONVERTED_MODEL_DIR_NAME, - model_name, ClientConstants.INFERENCE_MODEL_VERSION) - if not os.path.exists(onnx_model_dir): - os.makedirs(onnx_model_dir, exist_ok=True) - onnx_model_path = os.path.join(onnx_model_dir, "model.onnx") +def handle_env_vars(environment, relative_entry_fedml_format, extra_envs, dst_bootstrap_dir, end_point_id, edge_id, + replica_rank, request_json): + enable_custom_image = False if relative_entry_fedml_format != "" else True + if not enable_custom_image: + # For some image, the default user is root. Unified to fedml. + environment["HOME"] = "/home/fedml" - convert_model_to_onnx(torch_model, onnx_model_path, dummy_input_list, input_size, - input_is_tensor=True) + if request_json and ServerConstants.USER_ENCRYPTED_API_KEY in request_json: + environment[ClientConstants.ENV_USER_ENCRYPTED_API_KEY] = request_json[ServerConstants.USER_ENCRYPTED_API_KEY] - model_serving_dir = os.path.join(ClientConstants.get_model_cache_dir(), - ClientConstants.FEDML_CONVERTED_MODEL_DIR_NAME) - return model_serving_dir + environment["BOOTSTRAP_DIR"] = dst_bootstrap_dir + environment["FEDML_CURRENT_RUN_ID"] = end_point_id + environment["FEDML_CURRENT_EDGE_ID"] = edge_id + environment["FEDML_REPLICA_RANK"] = replica_rank + environment["FEDML_CURRENT_VERSION"] = fedml.get_env_version() + environment["FEDML_ENV_VERSION"] = fedml.get_env_version() + environment["FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_HOST"] = fedml.get_local_on_premise_platform_host() + environment["FEDML_ENV_LOCAL_ON_PREMISE_PLATFORM_PORT"] = fedml.get_local_on_premise_platform_port() + if extra_envs is not None: + for key in extra_envs: + environment[key] = extra_envs[key] -def start_gpu_model_load_process(): - from multiprocessing import Process - import time - process = Process(target=load_gpu_model_to_cpu_device) - process.start() - while True: - time.sleep(1) +def check_container_readiness(inference_http_port, infer_host="127.0.0.1", request_input_example=None, + readiness_check=ClientConstants.READINESS_PROBE_DEFAULT, + customized_uri=None): + response_from_client_container = is_client_inference_container_ready( + infer_host, inference_http_port, readiness_check=readiness_check, + request_input_example=request_input_example, customized_uri=customized_uri) + return response_from_client_container -def load_gpu_model_to_cpu_device(): - import pickle - import io - import torch - class CPU_Unpickler(pickle.Unpickler): - def find_class(self, module, name): - if module == 'torch.storage' and name == '_load_from_bytes': - return lambda b: torch.load(io.BytesIO(b), map_location='cpu') - else: - return super().find_class(module, name) - - model_file = "/home/fedml/.fedml/fedml-client/fedml/models/theta_rec_auc_81_single_label/theta_rec_auc_81_single_label" - with open(model_file, "rb") as model_pkl_file: - if not torch.cuda.is_available(): - model = CPU_Unpickler(model_pkl_file).load() - if model is None: - print("Failed to load 
gpu model to cpu device") - else: - print("Succeeded to load gpu model to cpu device") +def run_http_inference_with_curl_request(inference_url, inference_input_list, inference_output_list, + inference_type="default", engine_type="default", timeout=None): + return FedMLHttpInference.run_http_inference_with_curl_request( + inference_url, inference_input_list, inference_output_list, + inference_type=inference_type, engine_type=engine_type, timeout=timeout) if __name__ == "__main__": - start_gpu_model_load_process() - - model_serving_dir = test_convert_pytorch_model_to_onnx("./sample-open-training-model-net", - "./sample-open-training-model", - "rec-model", - {"input_size": [[1, 24], [1, 2]], - "input_types": ["int", "float"]}) - - test_start_triton_server(model_serving_dir) - - # input_data = {"model_version": "v0-Sun Feb 05 12:17:16 GMT 2023", - # "model_name": "model_414_45_open-model-test_v0-Sun-Feb-05-12-17-16-GMT-2023", - # # "data": "file:///Users/alexliang/fedml_data/mnist-image.png", - # "data": "https://raw.githubusercontent.com/niyazed/triton-mnist-example/master/images/sample_image.png", - # "end_point_id": 414, "model_id": 45, "token": "a09a18a14c4c4d89a8d5f9515704c073"} - # - # data_list = list() - # data_list.append(input_data["data"]) - # run_http_inference_with_lib_http_api_with_image_data(input_data["model_name"], - # 5001, 1, data_list, "") - # - # - # class LogisticRegression(torch.nn.Module): - # def __init__(self, input_dim, output_dim): - # super(LogisticRegression, self).__init__() - # self.linear = torch.nn.Linear(input_dim, output_dim) - # - # def forward(self, x): - # outputs = torch.sigmoid(self.linear(x)) - # return outputs - # - # - # model = LogisticRegression(28 * 28, 10) - # checkpoint = {'model': model} - # model_net_file = "/Users/alexliang/fedml-client/fedml/models/open-model-test/model-net.pt" - # torch.save(checkpoint, model_net_file) - # - # with open("/Users/alexliang/fedml-client/fedml/models/open-model-test/open-model-test", 'rb') as model_pkl_file: - # model_params = pickle.load(model_pkl_file) - # # torch.save(model_params, "/Users/alexliang/fedml-client/fedml/models/open-model-test/a.pt") - # # model = torch.load("/Users/alexliang/fedml-client/fedml/models/open-model-test/a.pt") - # loaded_checkpoint = torch.load(model_net_file) - # loaded_model = loaded_checkpoint["model"] - # loaded_model.load_state_dict(model_params) - # for parameter in loaded_model.parameters(): - # parameter.requires_grad = False - # loaded_model.eval() - # input_names = {"x": 0} - # convert_model_to_onnx(loaded_model, "/Users/alexliang/fedml-client/fedml/models/open-model-test/a.onnx", - # input_names, 28 * 28) - - # parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - # parser.add_argument("--cf", "-c", help="config file") - # parser.add_argument("--role", "-r", type=str, default="client", help="role") - # parser.add_argument("--model_storage_local_path", "-url", type=str, default="/home/ubuntu", - # help="model storage local path") - # parser.add_argument("--inference_model_name", "-n", type=str, default="fedml-model", - # help="inference model name") - # parser.add_argument("--inference_engine", "-engine", type=str, default="ONNX", help="inference engine") - # parser.add_argument("--inference_http_port", "-http", type=int, default=8000, help="inference http port") - # parser.add_argument("--inference_grpc_port", "-gprc", type=int, default=8001, help="inference grpc port") - # parser.add_argument("--inference_metric_port", "-metric", 
type=int, default=8002, help="inference metric port") - # parser.add_argument("--inference_use_gpu", "-gpu", type=str, default="gpu", help="inference use gpu") - # parser.add_argument("--inference_memory_size", "-mem", type=str, default="256m", help="inference memory size") - # parser.add_argument("--inference_convertor_image", "-convertor", type=str, - # default=ClientConstants.INFERENCE_CONVERTOR_IMAGE, help="inference convertor image") - # parser.add_argument("--inference_server_image", "-server", type=str, - # default=ClientConstants.INFERENCE_SERVER_IMAGE, help="inference server image") - # args = parser.parse_args() - # args.user = args.user - # - # pip_source_dir = os.path.dirname(__file__) - # __running_model_name, __inference_output_url, __model_version, __model_metadata, __model_config = \ - # start_deployment( - # args.model_storage_local_path, - # args.inference_model_name, - # args.inference_engine, - # args.inference_http_port, - # args.inference_grpc_port, - # args.inference_metric_port, - # args.inference_use_gpu, - # args.inference_memory_size, - # args.inference_convertor_image, - # args.inference_server_image) - # print("Model deployment results, running model name: {}, url: {}, model metadata: {}, model config: {}".format( - # __running_model_name, __inference_output_url, __model_metadata, __model_config)) + pass diff --git a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py index b8d85edd31..9adc17538d 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_model_inference.py @@ -1,67 +1,101 @@ +import argparse +import json import logging import time import traceback -from urllib.parse import urlparse import os + from typing import Any, Mapping, MutableMapping, Union +from urllib.parse import urlparse -from fastapi import FastAPI, Request, Response, status -from fastapi.responses import StreamingResponse +from fastapi import FastAPI, Request, Response, status, APIRouter +from fastapi.responses import StreamingResponse, JSONResponse +import fedml +from fedml.api.modules.constants import ModuleConstants +from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants +from fedml.computing.scheduler.model_scheduler.device_client_constants import ClientConstants from fedml.computing.scheduler.model_scheduler.device_http_inference_protocol import FedMLHttpInference from fedml.computing.scheduler.model_scheduler.device_server_constants import ServerConstants from fedml.computing.scheduler.model_scheduler.device_model_monitor import FedMLModelMetrics from fedml.computing.scheduler.model_scheduler.device_model_cache import FedMLModelCache from fedml.computing.scheduler.model_scheduler.device_mqtt_inference_protocol import FedMLMqttInference from fedml.computing.scheduler.model_scheduler.device_http_proxy_inference_protocol import FedMLHttpProxyInference -from fedml.computing.scheduler.comm_utils import sys_utils - -try: - from pydantic import BaseSettings -except Exception as e: - pass -try: - from pydantic_settings import BaseSettings -except Exception as e: - pass - - -# class Settings(BaseSettings): -# redis_addr: str -# redis_port: str -# redis_password: str -# end_point_name: str -# model_name: str -# model_version: str -# model_infer_url: str -# version: str -# use_mqtt_inference: bool -# use_worker_gateway: bool -# ext_info: str -# -# -# settings = Settings() - 
-class settings: - redis_addr = "127.0.0.1" - redis_port = 6379 - redis_password = "fedml_default" - end_point_name = "" - model_name = "" - model_version = "" - model_infer_url = "127.0.0.1" - version = "dev" - use_mqtt_inference = False - use_worker_gateway = False - ext_info = "2b34303961245c4f175f2236282d7a272c040b0904747579087f6a760112030109010c215d54505707140005190a051c347f365c4a430c020a7d39120e26032a78730f797f7c031f0901657e75" +from fedml.computing.scheduler.comm_utils.network_util import replace_url_with_path +from fedml.core.mlops.mlops_configs import MLOpsConfigs +from fedml.core.mlops import MLOpsRuntimeLog, MLOpsRuntimeLogDaemon + + +class Settings: + server_name = "DEVICE_INFERENCE_GATEWAY" + fedml.load_env() + redis_addr = os.getenv(ModuleConstants.ENV_FEDML_INFER_REDIS_ADDR, SchedulerConstants.REDIS_ADDR) + redis_port = os.getenv(ModuleConstants.ENV_FEDML_INFER_REDIS_PORT, SchedulerConstants.REDIS_PORT) + redis_password = os.getenv(ModuleConstants.ENV_FEDML_INFER_REDIS_PASSWORD, SchedulerConstants.REDIS_PASSWORD) + model_infer_host = os.getenv(ModuleConstants.ENV_FEDML_INFER_HOST, SchedulerConstants.REDIS_INFER_HOST) + version = fedml.get_env_version() + mqtt_config = MLOpsConfigs.fetch_mqtt_config() api = FastAPI() +router = APIRouter() + +FEDML_MODEL_CACHE = FedMLModelCache.get_instance() +FEDML_MODEL_CACHE.set_redis_params(redis_addr=Settings.redis_addr, + redis_port=Settings.redis_port, + redis_password=Settings.redis_password) + + +@api.middleware("http") +async def auth_middleware(request: Request, call_next): + if "/inference" in request.url.path or "/api/v1/predict" in request.url.path: + try: + # Attempt to parse the JSON body. + request_json = await request.json() + except json.JSONDecodeError: + return JSONResponse( + {"error": True, "message": "Invalid JSON."}, + status_code=status.HTTP_400_BAD_REQUEST) + + # Get endpoint's total pending requests. + end_point_id = request_json.get("end_point_id", None) + pending_requests_num = FEDML_MODEL_CACHE.get_pending_requests_counter(end_point_id) + if pending_requests_num: + # Fetch metrics of the past k=3 requests. + pask_k_metrics = FEDML_MODEL_CACHE.get_endpoint_metrics( + end_point_id=end_point_id, + k_recent=3) + + # Get the request timeout from the endpoint settings. + request_timeout_s = FEDML_MODEL_CACHE.get_endpoint_settings(end_point_id) \ + .get(ServerConstants.INFERENCE_REQUEST_TIMEOUT_KEY, ServerConstants.INFERENCE_REQUEST_TIMEOUT_DEFAULT) + + # Only proceed if the past k metrics collection is not empty. + if pask_k_metrics: + # Measure the average latency in seconds(!), hence the 0.001 multiplier. + past_k_latencies_sec = \ + [float(j_obj["current_latency"]) * 0.001 for j_obj in pask_k_metrics] + mean_latency = sum(past_k_latencies_sec) / len(past_k_latencies_sec) + + # If timeout threshold is exceeded then cancel and return time out error. 
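The new gateway middleware above estimates how long a fresh request would wait by taking the mean latency of the last k completed requests (stored in milliseconds, hence the 0.001 factor) and multiplying it by the number of requests already pending; when that estimate exceeds the endpoint's request timeout it answers 504. Below is a toy, standalone version of just that arithmetic with made-up numbers; the function name and metric records are illustrative only.

```python
# Toy sketch of the load-shedding estimate used by the middleware above:
# mean of the last k request latencies (ms -> s) times the current queue depth,
# compared against the endpoint's request timeout. Values are illustrative.
def should_shed_request(past_k_metrics, pending_requests_num, request_timeout_s):
    if not past_k_metrics or not pending_requests_num:
        return False
    latencies_sec = [float(m["current_latency"]) * 0.001 for m in past_k_metrics]
    mean_latency = sum(latencies_sec) / len(latencies_sec)
    return (mean_latency * pending_requests_num) > request_timeout_s


if __name__ == "__main__":
    metrics = [{"current_latency": 800}, {"current_latency": 1200}, {"current_latency": 1000}]  # ms
    # Mean latency is 1.0 s; with 40 requests already queued the estimated wait is
    # about 40 s, so a 30 s timeout leads to shedding this request.
    print(should_shed_request(metrics, pending_requests_num=40, request_timeout_s=30))  # True
```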
+ should_block = (mean_latency * pending_requests_num) > request_timeout_s + if should_block: + return JSONResponse( + {"error": True, "message": "Request timed out."}, + status_code=status.HTTP_504_GATEWAY_TIMEOUT) + + response = await call_next(request) + return response + + +@api.on_event("startup") +async def startup_event(): + configure_logging() @api.get('/') async def root(): - return {'message': 'FedML Federated Inference Service!'} + return {'message': 'TensorOpera Inference Service!'} @api.get('/ready') @@ -105,7 +139,7 @@ async def predict_openai(end_point_id, request: Request): try: response = await _predict(end_point_id, input_json, header) except Exception as e: - response = {"error": True, "message": f"{traceback.format_exc()}"} + response = {"error": True, "message": f"{traceback.format_exc()}, exception {e}"} return response @@ -136,83 +170,128 @@ async def predict_with_end_point_id(end_point_id, request: Request, response: Re return inference_response +@router.api_route("/custom_inference/{end_point_id}/{path:path}", methods=["POST", "GET"]) +async def custom_inference(end_point_id, path: str, request: Request): + # Get json data + input_json = await request.json() + + # Get header + header = request.headers + + try: + inference_response = await _predict(end_point_id, input_json, header, path, request.method) + except Exception as e: + inference_response = {"error": True, "message": f"{traceback.format_exc()}"} + + return inference_response + +api.include_router(router) + + async def _predict( end_point_id, input_json, - header=None + header=None, + path=None, + request_method="POST" ) -> Union[MutableMapping[str, Any], Response, StreamingResponse]: - in_end_point_id = end_point_id - in_end_point_name = input_json.get("end_point_name", None) - in_model_name = input_json.get("model_name", None) - in_model_version = input_json.get("model_version", None) - in_end_point_token = input_json.get("token", None) - in_return_type = "default" - if header is not None: - in_return_type = header.get("Accept", "default") - - if in_model_version is None: - in_model_version = "*" # * | latest | specific version - - start_time = time.time_ns() - - # Allow missing end_point_name and model_name in the input parameters. - if in_model_name is None or in_end_point_name is None: - ret_endpoint_name, ret_model_name = retrieve_info_by_endpoint_id(in_end_point_id, in_end_point_name) - if in_model_name is None: - in_model_name = ret_model_name - if in_end_point_name is None: - in_end_point_name = ret_endpoint_name - - # Authenticate request token + # Always increase the pending requests counter on a new incoming request. 
+ FEDML_MODEL_CACHE.update_pending_requests_counter(end_point_id, increase=True) inference_response = {} - if auth_request_token(in_end_point_id, in_end_point_name, in_model_name, in_end_point_token): - # Check the endpoint is activated - if not is_endpoint_activated(in_end_point_id): - inference_response = {"error": True, "message": "endpoint is not activated."} - logging_inference_request(input_json, inference_response) - return inference_response - # Found idle inference device - idle_device, end_point_id, model_id, model_name, model_version, inference_host, inference_output_url = \ - found_idle_inference_device(in_end_point_id, in_end_point_name, in_model_name, in_model_version) - if idle_device is None or idle_device == "": - return {"error": True, "error_code": status.HTTP_404_NOT_FOUND, - "message": "can not found active inference worker for this endpoint."} - - # Start timing for model metrics - model_metrics = FedMLModelMetrics(end_point_id, in_end_point_name, - model_id, in_model_name, model_version, - settings.model_infer_url, - settings.redis_addr, settings.redis_port, settings.redis_password, - version=settings.version) - model_metrics.set_start_time(start_time) - - # Send inference request to idle device - logging.info("inference url {}.".format(inference_output_url)) - if inference_output_url != "": - input_list = input_json.get("inputs", input_json) - stream_flag = input_json.get("stream", False) - input_list["stream"] = input_list.get("stream", stream_flag) - output_list = input_json.get("outputs", []) - inference_response = await send_inference_request( - idle_device, end_point_id, inference_output_url, input_list, output_list, inference_type=in_return_type) - - # Calculate model metrics - try: - model_metrics.calc_metrics(end_point_id, in_end_point_name, - model_id, model_name, model_version, - inference_output_url, idle_device) - except Exception as e: - logging.info("Calculate Inference Metrics Exception: {}".format(traceback.format_exc())) - pass + try: + in_end_point_id = end_point_id + in_end_point_name = input_json.get("end_point_name", None) + in_model_name = input_json.get("model_name", None) + in_model_version = input_json.get("model_version", None) + in_end_point_token = input_json.get("token", None) + in_return_type = "default" + if header is not None: + in_return_type = header.get("Accept", "default") + + if in_model_version is None: + in_model_version = "*" # * | latest | specific version + + start_time = time.time_ns() + + # Allow missing end_point_name and model_name in the input parameters. 
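Based on the fields `_predict` reads from the request body above (`end_point_id`, `end_point_name`, `model_name`, `model_version`, `token`, `inputs`, `stream`) and from the `Accept` header, a client call to the gateway might look like the sketch below. The URL, port, ids and token are placeholders, and the exact route should be taken from the deployment documentation rather than from this example.

```python
# Hypothetical client call showing the request fields the gateway reads.
# URL, port, endpoint id and token below are placeholders, not real values.
import requests

GATEWAY_URL = "http://127.0.0.1:8888/inference/1234"  # placeholder route and id

payload = {
    "end_point_id": "1234",               # also checked by the auth middleware
    "end_point_name": "my_endpoint",      # optional: recovered from Redis if omitted
    "model_name": "my_model",             # optional: recovered from Redis if omitted
    "model_version": "*",                 # "*" | "latest" | a specific version
    "token": "REPLACE_WITH_ENDPOINT_TOKEN",
    "inputs": {"text": "What is a good cure for hiccups?"},
    "stream": False,
}

response = requests.post(GATEWAY_URL, json=payload, headers={"Accept": "application/json"})
print(response.status_code, response.text)
```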
+ if in_model_name is None or in_end_point_name is None: + ret_endpoint_name, ret_model_name = retrieve_info_by_endpoint_id(in_end_point_id, in_end_point_name) + if in_model_name is None: + in_model_name = ret_model_name + if in_end_point_name is None: + in_end_point_name = ret_endpoint_name + + # Authenticate request token + if auth_request_token(in_end_point_id, in_end_point_name, in_model_name, in_end_point_token): + # Check the endpoint is activated + if not is_endpoint_activated(in_end_point_id): + inference_response = {"error": True, "message": "endpoint is not activated."} + logging_inference_request(input_json, inference_response) + FEDML_MODEL_CACHE.update_pending_requests_counter(end_point_id, decrease=True) + return inference_response - logging_inference_request(input_json, inference_response) + # Found idle inference device + idle_device, end_point_id, model_id, model_name, model_version, inference_host, inference_output_url,\ + connectivity_type = \ + found_idle_inference_device(in_end_point_id, in_end_point_name, in_model_name, in_model_version) + if idle_device is None or idle_device == "": + FEDML_MODEL_CACHE.update_pending_requests_counter(end_point_id, decrease=True) + return {"error": True, "error_code": status.HTTP_404_NOT_FOUND, + "message": "can not found active inference worker for this endpoint."} + + # Start timing for model metrics + model_metrics = FedMLModelMetrics(end_point_id, in_end_point_name, + model_id, in_model_name, model_version, + Settings.model_infer_host, + Settings.redis_addr, + Settings.redis_port, + Settings.redis_password, + version=Settings.version) + # Setting time to the time before authentication and idle device discovery. + model_metrics.set_start_time(start_time) + + # Send inference request to idle device + logging.debug("inference url {}.".format(inference_output_url)) + if inference_output_url != "": + input_list = input_json.get("inputs", input_json) + stream_flag = input_json.get("stream", False) + input_list["stream"] = input_list.get("stream", stream_flag) + output_list = input_json.get("outputs", []) + + # main execution of redirecting the inference request to the idle device + inference_response = await send_inference_request( + idle_device, + end_point_id, + inference_output_url, + input_list, + output_list, + inference_type=in_return_type, + connectivity_type=connectivity_type, + path=path, request_method=request_method) + + # Calculate model metrics + try: + model_metrics.calc_metrics(end_point_id, in_end_point_name, + model_id, model_name, model_version, + inference_output_url, idle_device) + except Exception as e: + logging.info("Calculate Inference Metrics Exception: {}".format(traceback.format_exc())) + pass - return inference_response - else: - inference_response = {"error": True, "message": "token is not valid."} - logging_inference_request(input_json, inference_response) - return inference_response + logging_inference_request(input_json, inference_response) + FEDML_MODEL_CACHE.update_pending_requests_counter(end_point_id, decrease=True) + return inference_response + else: + inference_response = {"error": True, "message": "token is not valid."} + logging_inference_request(input_json, inference_response) + FEDML_MODEL_CACHE.update_pending_requests_counter(end_point_id, decrease=True) + return inference_response + + except Exception as e: + logging.error("Inference Exception: {}".format(traceback.format_exc())) + # Need to reduce the pending requests counter in whatever exception that may be raised. 
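As the comment above notes, every exception path (and every early return inside `_predict`) has to release the pending-requests counter taken on entry. Purely as an illustrative alternative pattern, not this file's actual code, a single `try/finally` around the prediction flow guarantees that; `model_cache` and `dispatch` are hypothetical stand-ins for `FEDML_MODEL_CACHE` and the auth-plus-forwarding logic.

```python
import logging
import traceback


async def predict_with_counter(model_cache, end_point_id, input_json, dispatch):
    """Wrap the prediction flow so the pending-requests counter is always released.

    `model_cache` is assumed to expose update_pending_requests_counter(end_point_id,
    increase=..., decrease=...); `dispatch` is a coroutine that performs authentication,
    idle-device selection, and the actual inference call.
    """
    model_cache.update_pending_requests_counter(end_point_id, increase=True)
    try:
        return await dispatch(end_point_id, input_json)
    except Exception:
        logging.error("Inference Exception: %s", traceback.format_exc())
        return {"error": True, "message": "internal error during inference"}
    finally:
        # Runs on normal returns, early returns inside dispatch(), and exceptions alike,
        # so no code path can leak a pending-request increment.
        model_cache.update_pending_requests_counter(end_point_id, decrease=True)
```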
+ FEDML_MODEL_CACHE.update_pending_requests_counter(end_point_id, decrease=True) def retrieve_info_by_endpoint_id(end_point_id, in_end_point_name=None, in_model_name=None, @@ -221,15 +300,14 @@ def retrieve_info_by_endpoint_id(end_point_id, in_end_point_name=None, in_model_ We allow missing end_point_name and model_name in the input parameters. return end_point_name, model_name """ - FedMLModelCache.get_instance().set_redis_params(settings.redis_addr, settings.redis_port, settings.redis_password) - redis_key = FedMLModelCache.get_instance(settings.redis_addr, settings.redis_port). \ - get_end_point_full_key_by_id(end_point_id) + redis_key = FEDML_MODEL_CACHE.get_end_point_full_key_by_id(end_point_id) if redis_key is not None: end_point_name = "" model_name = "" if in_end_point_name is not None: end_point_name = in_end_point_name - model_name = redis_key[len(f"{FedMLModelCache.FEDML_MODEL_DEPLOYMENT_STATUS_TAG}-{end_point_id}-{in_end_point_name}-"):] + model_name = redis_key[ + len(f"{FedMLModelCache.FEDML_MODEL_DEPLOYMENT_STATUS_TAG}-{end_point_id}-{in_end_point_name}-"):] else: # e.g. FEDML_MODEL_DEPLOYMENT_STATUS--1234-dummy_endpoint_name-dummy_model_name try: @@ -253,73 +331,81 @@ def found_idle_inference_device(end_point_id, end_point_name, in_model_name, in_ inference_host = "" inference_output_url = "" model_version = "" + connectivity_type = "" + # Found idle device (TODO: optimize the algorithm to search best device for inference) - FedMLModelCache.get_instance().set_redis_params(settings.redis_addr, settings.redis_port, settings.redis_password) - payload, idle_device = FedMLModelCache.get_instance(settings.redis_addr, settings.redis_port). \ + payload, idle_device = FEDML_MODEL_CACHE. \ get_idle_device(end_point_id, end_point_name, in_model_name, in_model_version) - if payload is not None: - logging.info("found idle deployment result {}".format(payload)) - deployment_result = payload - model_name = deployment_result["model_name"] - model_version = deployment_result["model_version"] - model_id = deployment_result["model_id"] - end_point_id = deployment_result["end_point_id"] - inference_output_url = deployment_result["model_url"] + if payload: + model_name = payload["model_name"] + model_version = payload["model_version"] + model_id = payload["model_id"] + end_point_id = payload["end_point_id"] + inference_output_url = payload["model_url"] + connectivity_type = \ + payload.get("connectivity_type", + ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT) url_parsed = urlparse(inference_output_url) inference_host = url_parsed.hostname else: logging.info("not found idle deployment result") - return idle_device, end_point_id, model_id, model_name, model_version, inference_host, inference_output_url + res = (idle_device, end_point_id, model_id, model_name, model_version, inference_host, inference_output_url, + connectivity_type) + logging.debug(f"found idle device with metrics: {res}") + return res -async def send_inference_request(idle_device, endpoint_id, inference_url, input_list, output_list, - inference_type="default", has_public_ip=True): - try: - http_infer_available = os.getenv("FEDML_INFERENCE_HTTP_AVAILABLE", True) - if not http_infer_available: - if http_infer_available == "False" or http_infer_available == "false": - http_infer_available = False - - if http_infer_available: - response_ok = await FedMLHttpInference.is_inference_ready( - inference_url, timeout=os.getenv("FEDML_GATEWAY_HTTP_READY_TIMEOUT", 20)) - if response_ok: - response_ok, inference_response = await 
FedMLHttpInference.run_http_inference_with_curl_request( - inference_url, input_list, output_list, inference_type=inference_type) - logging.info(f"Use http inference. return {response_ok}") - return inference_response - response_ok = await FedMLHttpProxyInference.is_inference_ready( - inference_url, timeout=os.getenv("FEDML_GATEWAY_HTTP_PROXY_READY_TIMEOUT", 20)) - if response_ok: +async def send_inference_request(idle_device, end_point_id, inference_url, input_list, output_list, + inference_type="default", + connectivity_type=ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT, + path=None, request_method="POST"): + request_timeout_sec = FEDML_MODEL_CACHE.get_endpoint_settings(end_point_id) \ + .get("request_timeout_sec", ClientConstants.INFERENCE_REQUEST_TIMEOUT) + + inference_url = replace_url_with_path(inference_url, path) + + try: + if connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP: + response_ok, inference_response = await FedMLHttpInference.run_http_inference_with_curl_request( + inference_url, + input_list, + output_list, + inference_type=inference_type, + timeout=request_timeout_sec, + method=request_method) + logging.debug(f"Use http inference. return {response_ok}") + return inference_response + elif connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_HTTP_PROXY: + logging.debug("Use http proxy inference.") response_ok, inference_response = await FedMLHttpProxyInference.run_http_proxy_inference_with_request( - endpoint_id, inference_url, input_list, output_list, inference_type=inference_type) - logging.info(f"Use http proxy inference. return {response_ok}") + end_point_id, + inference_url, + input_list, + output_list, + inference_type=inference_type, + timeout=request_timeout_sec) + logging.debug(f"Use http proxy inference. return {response_ok}") return inference_response - - if not has_public_ip: - connect_str = "@FEDML@" - random_out = sys_utils.random2(settings.ext_info, "FEDML@9999GREAT") - config_list = random_out.split(connect_str) - agent_config = dict() - agent_config["mqtt_config"] = dict() - agent_config["mqtt_config"]["BROKER_HOST"] = config_list[0] - agent_config["mqtt_config"]["BROKER_PORT"] = int(config_list[1]) - agent_config["mqtt_config"]["MQTT_USER"] = config_list[2] - agent_config["mqtt_config"]["MQTT_PWD"] = config_list[3] - agent_config["mqtt_config"]["MQTT_KEEPALIVE"] = int(config_list[4]) - mqtt_inference = FedMLMqttInference(agent_config=agent_config, run_id=endpoint_id) - response_ok = mqtt_inference.run_mqtt_health_check_with_request( - idle_device, endpoint_id, inference_url) - inference_response = {"error": True, "message": "Failed to use http, http-proxy and mqtt for inference."} - if response_ok: - response_ok, inference_response = mqtt_inference.run_mqtt_inference_with_request( - idle_device, endpoint_id, inference_url, input_list, output_list, inference_type=inference_type) - - logging.info(f"Use mqtt inference. return {response_ok}.") + elif connectivity_type == ClientConstants.WORKER_CONNECTIVITY_TYPE_MQTT: + logging.debug("Use mqtt inference.") + agent_config = {"mqtt_config": Settings.mqtt_config} + mqtt_inference = FedMLMqttInference( + agent_config=agent_config, + run_id=end_point_id) + response_ok, inference_response = mqtt_inference.run_mqtt_inference_with_request( + idle_device, + end_point_id, + inference_url, + input_list, + output_list, + inference_type=inference_type, + timeout=request_timeout_sec) + logging.debug(f"Use mqtt inference. 
return {response_ok}.") return inference_response - return {"error": True, "message": "Failed to use http, http-proxy for inference, no response from replica."} + else: + return {"error": True, "message": "Failed to use http, http-proxy for inference, no response from replica."} except Exception as e: inference_response = {"error": True, "message": f"Exception when using http, http-proxy and mqtt " @@ -331,23 +417,17 @@ async def send_inference_request(idle_device, endpoint_id, inference_url, input_ def auth_request_token(end_point_id, end_point_name, model_name, token): if token is None: return False - - FedMLModelCache.get_instance().set_redis_params(settings.redis_addr, settings.redis_port, settings.redis_password) - cached_token = FedMLModelCache.get_instance(settings.redis_addr, settings.redis_port). \ + cached_token = FEDML_MODEL_CACHE. \ get_end_point_token(end_point_id, end_point_name, model_name) if cached_token is not None and str(cached_token) == str(token): return True - return False def is_endpoint_activated(end_point_id): if end_point_id is None: return False - - FedMLModelCache.get_instance().set_redis_params(settings.redis_addr, settings.redis_port, settings.redis_password) - activated = FedMLModelCache.get_instance(settings.redis_addr, settings.redis_port).get_end_point_activation( - end_point_id) + activated = FEDML_MODEL_CACHE.get_end_point_activation(end_point_id) return activated @@ -357,13 +437,33 @@ def logging_inference_request(request, response): try: log_dir = ServerConstants.get_log_file_dir() - if not os.path.exists(log_dir): - os.makedirs(log_dir, exist_ok=True) inference_log_file = os.path.join(log_dir, "inference.log") with open(inference_log_file, "a") as f: f.writelines([f"request: {request}, response: {response}\n"]) except Exception as ex: - logging.info("failed to log inference request and response to file.") + logging.info(f"failed to log inference request and response to file with exception {ex}") + + +def configure_logging(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + args = parser.parse_args([]) + + setattr(args, "log_file_dir", ServerConstants.get_log_file_dir()) + setattr(args, "run_id", -1) + setattr(args, "role", "server") + setattr(args, "using_mlops", True) + setattr(args, "config_version", fedml.get_env_version()) + + runner_info = ServerConstants.get_runner_infos() + if not (runner_info and "edge_id" in runner_info): + raise Exception("Inference gateway couldn't be started as edge_id couldn't be parsed from runner_infos.yaml") + setattr(args, "edge_id", int(runner_info.get("edge_id"))) + + MLOpsRuntimeLog.get_instance(args).init_logs(log_level=logging.INFO) + MLOpsRuntimeLogDaemon.get_instance(args).start_log_processor(log_run_id=args.run_id, log_device_id=args.edge_id, + log_source=Settings.server_name, + log_file_prefix=Settings.server_name) + logging.info("start the log processor for inference gateway") if __name__ == "__main__": diff --git a/python/fedml/computing/scheduler/model_scheduler/device_mqtt_inference_protocol.py b/python/fedml/computing/scheduler/model_scheduler/device_mqtt_inference_protocol.py index b0bff261a4..9cd5c1e9a2 100755 --- a/python/fedml/computing/scheduler/model_scheduler/device_mqtt_inference_protocol.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_mqtt_inference_protocol.py @@ -10,7 +10,6 @@ import asyncio -from ..comm_utils.constants import SchedulerConstants from ....core.distributed.communication.mqtt.mqtt_manager import MqttManager from 
.device_http_inference_protocol import FedMLHttpInference @@ -105,7 +104,7 @@ def run_mqtt_inference_with_request( only_do_health_check=only_do_health_check, timeout=timeout ) - allowed_inference_timeout = SchedulerConstants.MQTT_INFERENCE_TIMEOUT if timeout is None else timeout + allowed_inference_timeout = timeout if timeout else -1 sleep_time_interval = 0.05 total_sleep_time = 0 while True: diff --git a/python/fedml/computing/scheduler/model_scheduler/device_replica_controller.py b/python/fedml/computing/scheduler/model_scheduler/device_replica_controller.py index 667d57c4f4..ea19efb8b6 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_replica_controller.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_replica_controller.py @@ -67,7 +67,9 @@ def __init__(self, master_id, request_json: dict): def calc_total_gpu_num(self): total_gpu_num = 0 for device_id, gpu_num in self.devices_avail_gpus.items(): - total_gpu_num += gpu_num + if type(gpu_num) is not int: + logging.warning(f"The value in gpu_topology should be int, but got {type(gpu_num)}. Try to convert it.") + total_gpu_num += int(gpu_num) return total_gpu_num def init_id_replica_num(self): @@ -77,6 +79,11 @@ def init_id_replica_num(self): """ id_replica_num = {} for id, avail_num in self.devices_avail_gpus.items(): + if type(avail_num) is not int: + logging.warning(f"The value in gpu_topology should be int, " + f"but got {type(avail_num)}. Try to convert it.") + avail_num = int(avail_num) + if avail_num % self.gpu_per_replica != 0: raise ValueError("The number of gpus for each device should be divisible by gpu_per_replica") id_replica_num[str(id)] = avail_num // self.gpu_per_replica diff --git a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py index 86c7aac992..44eaeb9371 100644 --- a/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py +++ b/python/fedml/computing/scheduler/model_scheduler/device_server_constants.py @@ -5,6 +5,7 @@ import subprocess import sys from os.path import expanduser +from pathlib import Path import psutil import yaml @@ -102,7 +103,13 @@ class ServerConstants(object): AUTO_DETECT_PUBLIC_IP = "auto_detect_public_ip" MODEL_INFERENCE_DEFAULT_PORT = 2203 + ENV_MASTER_INFERENCE_PORT_KEY = "FEDML_MASTER_INFERENCE_GATEWAY_PORT" MODEL_CACHE_KEY_EXPIRE_TIME = 1 * 10 + + INFERENCE_REQUEST_TIMEOUT_KEY = "request_timeout_sec" + INFERENCE_REQUEST_TIMEOUT_DEFAULT = 30 + + USER_ENCRYPTED_API_KEY = "encrypted_api_key" # -----End----- MODEL_DEPLOYMENT_STAGE1 = {"index": 1, "text": "ReceivedRequest"} @@ -142,6 +149,10 @@ class ServerConstants(object): DEVICE_DIFF_ADD_OPERATION = "op: add" DEVICE_DIFF_DELETE_OPERATION = "op: delete" DEVICE_DIFF_REPLACE_OPERATION = "op: replace" + + # Worker comfig yaml related + EXPOSE_SUBDOMAINS_KEY = "expose_subdomains" + @staticmethod def get_fedml_home_dir(): home_dir = expanduser("~") @@ -295,9 +306,10 @@ def get_public_ip(): return ip @staticmethod - def cleanup_run_process(run_id): + def cleanup_run_process(run_id, not_kill_subprocess=False): RunProcessUtils.cleanup_run_process( - run_id, ServerConstants.get_data_dir(), ServerConstants.LOCAL_RUNNER_INFO_DIR_NAME) + run_id, ServerConstants.get_data_dir(), ServerConstants.LOCAL_RUNNER_INFO_DIR_NAME, + not_kill_subprocess=not_kill_subprocess) @staticmethod def save_run_process(run_id, process_id): @@ -328,9 +340,32 @@ def save_bootstrap_process(run_id, process_id): run_id, 
process_id, ServerConstants.get_data_dir(), ServerConstants.LOCAL_RUNNER_INFO_DIR_NAME, info_file_prefix=SchedulerConstants.RUN_PROCESS_TYPE_BOOTSTRAP_PROCESS) + @staticmethod + def get_runner_infos(): + local_pkg_data_dir = ServerConstants.get_data_dir() + os.makedirs(local_pkg_data_dir, exist_ok=True) + os.makedirs(os.path.join(local_pkg_data_dir, ServerConstants.LOCAL_RUNNER_INFO_DIR_NAME), exist_ok=True) + + runner_info_file = os.path.join(local_pkg_data_dir, ServerConstants.LOCAL_RUNNER_INFO_DIR_NAME, + "runner_infos.yaml") + runner_info = {} + try: + runner_info = yaml.safe_load(Path(runner_info_file).read_text()) + except Exception as e: + logging.error(f"Failed to parse runner info: {e}") + return runner_info + + @staticmethod + def get_inference_master_gateway_port(): + # Use dotenv to load the environment variables + fedml.load_env() + master_inference_port = int(os.getenv(ServerConstants.ENV_MASTER_INFERENCE_PORT_KEY, + default=ServerConstants.MODEL_INFERENCE_DEFAULT_PORT)) + return master_inference_port + + @staticmethod def save_runner_infos(unique_device_id, edge_id, run_id=None): - home_dir = expanduser("~") local_pkg_data_dir = ServerConstants.get_data_dir() os.makedirs(local_pkg_data_dir, exist_ok=True) os.makedirs(os.path.join(local_pkg_data_dir, ServerConstants.LOCAL_RUNNER_INFO_DIR_NAME), exist_ok=True) diff --git a/python/fedml/computing/scheduler/model_scheduler/device_server_runner.py b/python/fedml/computing/scheduler/model_scheduler/device_server_runner.py deleted file mode 100755 index 89e74bbd74..0000000000 --- a/python/fedml/computing/scheduler/model_scheduler/device_server_runner.py +++ /dev/null @@ -1,1999 +0,0 @@ -import copy -import json -import logging -import multiprocessing -import platform -import sys - -from multiprocessing import Process -import os -import shutil -import subprocess -import threading - -import time -import traceback -import urllib -import uuid -import zipfile -from os import listdir - -import requests -import torch - -import fedml -from fedml.computing.scheduler.comm_utils.run_process_utils import RunProcessUtils -from fedml.core.mlops.mlops_runtime_log import MLOpsFormatter - -from ..comm_utils import sys_utils -from .device_server_data_interface import FedMLServerDataInterface -from ..scheduler_core.endpoint_sync_protocol import FedMLEndpointSyncProtocol -from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog - -from ....core.distributed.communication.mqtt.mqtt_manager import MqttManager -from ..comm_utils.yaml_utils import load_yaml_config -from .device_client_constants import ClientConstants -from .device_server_constants import ServerConstants - -from ....core.mlops.mlops_metrics import MLOpsMetrics - -from ....core.mlops.mlops_configs import MLOpsConfigs -from ....core.mlops.mlops_runtime_log_daemon import MLOpsRuntimeLogDaemon -from ....core.mlops.mlops_status import MLOpsStatus -from ..comm_utils.sys_utils import get_sys_runner_info, get_python_program -from .device_model_cache import FedMLModelCache -from .device_model_msg_object import FedMLModelMsgObject -from ....core.mlops.mlops_utils import MLOpsUtils -from ..comm_utils.constants import SchedulerConstants -from .device_model_db import FedMLModelDatabase -from .device_replica_controller import FedMLDeviceReplicaController - - -class RunnerError(BaseException): - """ Runner failed. """ - pass - - -class RunnerCompletedError(Exception): - """ Runner completed. 
""" - pass - - -class FedMLServerRunner: - FEDML_CLOUD_SERVER_PREFIX = "fedml-server-run-" - - def __init__(self, args, run_id=0, request_json=None, agent_config=None, edge_id=0): - self.inference_gateway_process = None - self.local_api_process = None - self.run_process_event = None - self.run_process_event_map = dict() - self.run_process_completed_event = None - self.run_process_completed_event_map = dict() - self.run_as_cloud_agent = False - self.run_as_cloud_server = False - self.run_as_edge_server_and_agent = False - self.run_as_cloud_server_and_agent = False - self.fedml_packages_base_dir = None - self.fedml_packages_unzip_dir = None - self.mqtt_mgr = None - self.running_request_json = dict() - self.run_id = run_id - self.client_mqtt_mgr = None - self.client_mqtt_is_connected = False - self.client_mqtt_lock = None - self.unique_device_id = None - self.edge_id = edge_id - self.server_agent_id = 0 - if request_json is not None: - self.server_agent_id = request_json.get("server_id", 0) - self.process = None - self.args = args - self.request_json = copy.deepcopy(request_json) - self.version = args.version - self.device_id = args.device_id - self.cur_dir = os.path.split(os.path.realpath(__file__))[0] - if args.current_running_dir is not None: - self.cur_dir = args.current_running_dir - - self.agent_config = agent_config - self.fedml_data_base_package_dir = os.path.join("/", "fedml", "data") - self.fedml_data_local_package_dir = os.path.join("/", "fedml", "fedml-package", "fedml", "data") - self.fedml_data_dir = self.fedml_data_base_package_dir - self.fedml_config_dir = os.path.join("/", "fedml", "conf") - - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES = {} - - self.mlops_metrics = None - self.run_status = None - self.infer_host = "127.0.0.1" - self.redis_addr = "local" - self.redis_port = "6379" - self.redis_password = "fedml_default" - - self.slave_deployment_statuses_mapping = dict() - self.slave_deployment_results_mapping = dict() - self.slave_update_result_mapping = dict() - - self.model_runner_mapping = dict() - self.ntp_offset = MLOpsUtils.get_ntp_offset() - - self.subscribed_topics = list() - self.user_name = None - - self.replica_controller = None - self.deployed_replica_payload = None - - self.autoscaler_launcher = None - - def build_dynamic_constrain_variables(self, run_id, run_config): - pass - - def unzip_file(self, zip_file, unzip_file_path): - unziped_file_name = "" - if zipfile.is_zipfile(zip_file): - with zipfile.ZipFile(zip_file, "r") as zipf: - zipf.extractall(unzip_file_path) - unziped_file_name = zipf.namelist()[0] - - return unziped_file_name - - def package_download_progress(self, count, blksize, filesize): - self.check_runner_stop_event() - - downloaded = count * blksize - downloaded = filesize if downloaded > filesize else downloaded - progress = (downloaded / filesize * 100) if filesize != 0 else 0 - progress_int = int(progress) - downloaded_kb = format(downloaded / 1024, '.2f') - - # since this hook function is stateless, we need a state to avoid printing progress repeatedly - if count == 0: - self.prev_download_progress = 0 - if progress_int != self.prev_download_progress and progress_int % 5 == 0: - self.prev_download_progress = progress_int - logging.info("package downloaded size {} KB, progress {}%".format(downloaded_kb, progress_int)) - - def retrieve_and_unzip_package(self, package_name, package_url): - local_package_path = ServerConstants.get_model_package_dir() - if not os.path.exists(local_package_path): - os.makedirs(local_package_path, exist_ok=True) - 
local_package_file = "{}.zip".format(os.path.join(local_package_path, package_name)) - if os.path.exists(local_package_file): - os.remove(local_package_file) - - # Download without renaming - urllib.request.urlretrieve(package_url, filename=None, reporthook=self.package_download_progress) - - unzip_package_path = ServerConstants.get_model_dir() - self.fedml_packages_base_dir = unzip_package_path - try: - shutil.rmtree( - os.path.join(unzip_package_path, package_name), ignore_errors=True - ) - except Exception as e: - pass - logging.info("local_package_file {}, unzip_package_path {}".format( - local_package_file, unzip_package_path)) - package_name = self.unzip_file(local_package_file, unzip_package_path) - unzip_package_path = os.path.join(unzip_package_path, package_name) - return unzip_package_path - - def update_local_fedml_config(self, run_id, run_config): - model_config = run_config - model_name = model_config["model_name"] - model_storage_url = model_config["model_storage_url"] - scale_min = model_config.get("instance_scale_min", 0) - scale_max = model_config.get("instance_scale_max", 0) - inference_engine = model_config.get("inference_engine", 0) - inference_end_point_id = run_id - - # Copy config file from the client - unzip_package_path = self.retrieve_and_unzip_package( - model_name, model_storage_url - ) - fedml_local_config_file = os.path.join(unzip_package_path, "fedml_model_config.yaml") - - # Load the above config to memory - package_conf_object = {} - if os.path.exists(fedml_local_config_file): - package_conf_object = load_yaml_config(fedml_local_config_file) - - return unzip_package_path, package_conf_object - - def get_usr_indicated_token(self, request_json) -> str: - usr_indicated_token = "" - if "parameters" in request_json and "authentication_token" in request_json["parameters"]: - usr_indicated_token = request_json["parameters"]["authentication_token"] - return usr_indicated_token - - def build_dynamic_args(self, run_config, package_conf_object, base_dir): - pass - - def run(self, process_event, completed_event): - # print(f"Model master runner process id {os.getpid()}, run id {self.run_id}") - - if platform.system() != "Windows": - os.setsid() - - os.environ['PYTHONWARNINGS'] = 'ignore:semaphore_tracker:UserWarning' - os.environ.setdefault('PYTHONWARNINGS', 'ignore:semaphore_tracker:UserWarning') - - self.run_process_event = process_event - self.run_process_completed_event = completed_event - run_id = self.request_json.get("end_point_id") - - try: - MLOpsUtils.set_ntp_offset(self.ntp_offset) - - self.setup_client_mqtt_mgr() - - self.run_impl() - except RunnerError: - logging.info("Runner stopped.") - self.mlops_metrics.report_server_training_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED, - is_from_model=True, edge_id=self.edge_id) - except RunnerCompletedError: - logging.info("Runner completed.") - except Exception as e: - logging.error("Runner exits with exceptions.") - logging.error(traceback.format_exc()) - logging.error(e) - self.mlops_metrics.report_server_training_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, - is_from_model=True, edge_id=self.edge_id) - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id) - if self.mlops_metrics is not None: - self.mlops_metrics.stop_sys_perf() - time.sleep(3) - sys.exit(1) - finally: - logging.info("[Master] Deployment finished, release resources.") - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id) - if 
self.mlops_metrics is not None: - self.mlops_metrics.stop_sys_perf() - time.sleep(3) - if not self.run_as_cloud_server: - self.release_client_mqtt_mgr() - - def parse_model_run_params(self, running_json): - run_id = running_json["end_point_id"] - end_point_name = running_json["end_point_name"] - token = running_json["token"] - user_id = running_json["user_id"] - user_name = running_json["user_name"] - device_ids = running_json["device_ids"] - device_objs = running_json["device_objs"] - - model_config = running_json["model_config"] - model_name = model_config["model_name"] - model_id = model_config["model_id"] - model_storage_url = model_config["model_storage_url"] - scale_min = model_config.get("instance_scale_min", 0) - scale_max = model_config.get("instance_scale_max", 0) - inference_engine = model_config.get("inference_engine", 0) - model_is_from_open = model_config["is_from_open"] - inference_end_point_id = run_id - use_gpu = "gpu" # TODO: Get GPU from device infos - memory_size = "256m" # TODO: Get Memory size for each instance - model_version = model_config["model_version"] - model_config_parameters = running_json.get("parameters", {}) - - inference_port = model_config_parameters.get("server_internal_port", # Internal port is for the gateway - ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) - inference_port_external = model_config_parameters.get("server_external_port", inference_port) - - return run_id, end_point_name, token, user_id, user_name, device_ids, device_objs, model_config, model_name, \ - model_id, model_storage_url, scale_min, scale_max, inference_engine, model_is_from_open, \ - inference_end_point_id, use_gpu, memory_size, model_version, inference_port - - def inference_run(self): - # run_id, end_point_name, token, user_id, user_name, device_ids, device_objs, model_config, model_name, \ - # model_id, model_storage_url, scale_min, scale_max, inference_engine, model_is_from_open, \ - # inference_end_point_id, use_gpu, memory_size, model_version, inference_port = - # self.parse_model_run_params(self.request_json) - # - # inference_server = FedMLModelServingServer(self.args, - # end_point_name, - # model_name, - # model_version, - # inference_request=self.request_json) - # inference_server.run() - pass - - def run_impl(self): - run_id, end_point_name, token, user_id, user_name, device_ids, device_objs, model_config, model_name, \ - model_id, model_storage_url, scale_min, scale_max, inference_engine, model_is_from_open, \ - inference_end_point_id, use_gpu, memory_size, model_version, inference_port = self.parse_model_run_params( - self.request_json) - - # TODO(Raphael): This measurement is for the host machine. 
Change to container's metrics - self.mlops_metrics.report_sys_perf(self.args, self.agent_config["mqtt_config"], run_id=run_id) - - self.check_runner_stop_event() - - # Send stage: MODEL_DEPLOYMENT_STAGE4 = "ForwardRequest2Slave" - self.send_deployment_stages(self.run_id, model_name, model_id, - "", - ServerConstants.MODEL_DEPLOYMENT_STAGE4["index"], - ServerConstants.MODEL_DEPLOYMENT_STAGE4["text"], - ServerConstants.MODEL_DEPLOYMENT_STAGE4["text"]) - - self.args.run_id = self.run_id - MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) - - # Report server running status - self.check_runner_stop_event() - self.mlops_metrics.report_server_training_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_STARTING, - is_from_model=True, running_json=json.dumps(self.request_json), edge_id=self.edge_id) - self.send_deployment_status(self.run_id, end_point_name, - model_name, "", - ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYING) - - # Start unified inference gateway if it has not started - self.start_device_inference_gateway( - run_id, end_point_name, model_id, model_name, model_version, inference_port=inference_port) - - # (re)Start inference monitor server - self.stop_device_inference_monitor(run_id, end_point_name, model_id, model_name, model_version) - self.start_device_inference_monitor(run_id, end_point_name, model_id, model_name, model_version) - - # Changed the master's status to "IDLE" - self.mlops_metrics.broadcast_server_training_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED, - is_from_model=True, edge_id=self.edge_id) - - # Forward deployment request to slave devices - self.check_runner_stop_event() - - # Handle "op:add" && "op:remove" - devices_sent_add_or_remove_msg = self.send_deployment_start_request_to_edges() - - # Handle "op:update" - try: - devices_sent_update_remove_msg = self.send_first_scroll_update_msg() - - if len(devices_sent_add_or_remove_msg) == 0 and len(devices_sent_update_remove_msg) == 0: - # No device is added, updated or removed - logging.info("No device is added, updated or removed. No action needed for reconciliation.") - ip = self.get_ip_address(self.request_json) - master_port = os.getenv("FEDML_MASTER_PORT", None) - if master_port is not None: - inference_port = int(master_port) - model_inference_port = inference_port - if ip.startswith("http://") or ip.startswith("https://"): - model_inference_url = "{}/api/v1/predict".format(ip) - else: - model_inference_url = "http://{}:{}/api/v1/predict".format(ip, model_inference_port) - - self.set_runner_completed_event(run_id) - - self.send_deployment_status(run_id, end_point_name, - model_name, - model_inference_url, - ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED) - - # Set setting to "DEPLOYED" for autoscaling service reference - FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) - FedMLModelCache.get_instance(self.redis_addr, self.redis_port). 
\ - update_user_setting_replica_num(end_point_id=run_id, state="DEPLOYED") - - return - except Exception as e: - logging.error(f"Failed to send first scroll update message due to {e}.") - logging.error(f"Exception traceback {traceback.format_exc()}.") - - logging.info("Start waiting for result callback from workers ...") - - while True: - # Wait for all devices to finish the add / delete / update operation - self.check_runner_stop_event() - time.sleep(3) - - def check_runner_stop_event(self): - if self.run_process_event is not None and self.run_process_event.is_set(): - logging.info("Received stopping event.") - raise RunnerError("Runner stopped") - - if self.run_process_completed_event is not None and self.run_process_completed_event.is_set(): - logging.info("Received completed event.") - raise RunnerCompletedError("Runner completed") - - def start_device_inference_gateway( - self, run_id, end_point_name, model_id, - model_name, model_version, inference_port=ServerConstants.MODEL_INFERENCE_DEFAULT_PORT): - # start unified inference server - running_model_name = ServerConstants.get_running_model_name(end_point_name, - model_name, model_version, run_id, model_id) - python_program = get_python_program() - master_port = os.getenv("FEDML_MASTER_PORT", None) - if master_port is not None: - inference_port = int(master_port) - if not ServerConstants.is_running_on_k8s(): - logging.info(f"start the model inference gateway, end point {run_id}, " - f"model name {model_name} at port {inference_port}...") - self.check_runner_stop_event() - - use_mqtt_inference = os.getenv("FEDML_USE_MQTT_INFERENCE", "False") - use_mqtt_inference = True if use_mqtt_inference.lower() == 'true' else False - use_worker_gateway = os.getenv("FEDML_USE_WORKER_GATEWAY", "False") - use_worker_gateway = True if use_worker_gateway.lower() == 'true' else False - inference_gw_cmd = "fedml.computing.scheduler.model_scheduler.device_model_inference:api" - inference_gateway_pids = RunProcessUtils.get_pid_from_cmd_line(inference_gw_cmd) - if inference_gateway_pids is None or len(inference_gateway_pids) <= 0: - cur_dir = os.path.dirname(__file__) - fedml_base_dir = os.path.dirname(os.path.dirname(os.path.dirname(cur_dir))) - connect_str = "@FEDML@" - ext_info = sys_utils.random1( - self.agent_config["mqtt_config"]["BROKER_HOST"] + connect_str + - str(self.agent_config["mqtt_config"]["BROKER_PORT"]) + connect_str + - self.agent_config["mqtt_config"]["MQTT_USER"] + connect_str + - self.agent_config["mqtt_config"]["MQTT_PWD"] + connect_str + - str(self.agent_config["mqtt_config"]["MQTT_KEEPALIVE"]), "FEDML@9999GREAT") - self.inference_gateway_process = ServerConstants.exec_console_with_script( - "REDIS_ADDR=\"{}\" REDIS_PORT=\"{}\" REDIS_PASSWORD=\"{}\" " - "END_POINT_NAME=\"{}\" " - "MODEL_NAME=\"{}\" MODEL_VERSION=\"{}\" MODEL_INFER_URL=\"{}\" VERSION=\"{}\" " - "USE_MQTT_INFERENCE={} USE_WORKER_GATEWAY={} EXT_INFO={} " - "{} -m uvicorn {} --host 0.0.0.0 --port {} --reload --reload-delay 3 --reload-dir {} " - "--log-level critical".format( - self.redis_addr, self.redis_port, self.redis_password, - end_point_name, - model_name, model_version, "", self.args.version, - use_mqtt_inference, use_worker_gateway, ext_info, - python_program, inference_gw_cmd, str(inference_port), fedml_base_dir - ), - should_capture_stdout=False, - should_capture_stderr=False - ) - - def start_device_inference_monitor(self, run_id, end_point_name, - model_id, model_name, model_version, check_stopped_event=True): - # start inference monitor server - # Will report 
the qps related metrics to the MLOps - logging.info(f"start the model inference monitor, end point {run_id}, model name {model_name}...") - if check_stopped_event: - self.check_runner_stop_event() - run_id_str = str(run_id) - pip_source_dir = os.path.dirname(__file__) - monitor_file = os.path.join(pip_source_dir, "device_model_monitor.py") - python_program = get_python_program() - running_model_name = ServerConstants.get_running_model_name(end_point_name, - model_name, model_version, run_id, model_id) - self.monitor_process = ServerConstants.exec_console_with_shell_script_list( - [ - python_program, - monitor_file, - "-v", - self.args.version, - "-ep", - run_id_str, - "-epn", - str(end_point_name), - "-mi", - str(model_id), - "-mn", - model_name, - "-mv", - model_version, - "-iu", - "infer_url", - "-ra", - self.redis_addr, - "-rp", - self.redis_port, - "-rpw", - self.redis_password - ], - should_capture_stdout=False, - should_capture_stderr=False - ) - - def stop_device_inference_monitor(self, run_id, end_point_name, model_id, model_name, model_version): - # stop inference monitor server - logging.info(f"stop the model inference monitor, end point {run_id}, model name {model_name}...") - sys_utils.cleanup_model_monitor_processes(run_id, end_point_name, - model_id, model_name, model_version) - - def cleanup_run_when_finished(self): - logging.info("Cleanup run successfully when finished.") - - self.mlops_metrics.broadcast_server_training_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED, - is_from_model=True, edge_id=self.edge_id - ) - - try: - self.mlops_metrics.stop_sys_perf() - except Exception as ex: - pass - - time.sleep(1) - - try: - local_package_path = ServerConstants.get_package_download_dir() - for package_file in listdir(local_package_path): - if os.path.basename(package_file).startswith("run_" + str(self.run_id)): - shutil.rmtree(os.path.join(local_package_path, package_file), ignore_errors=True) - except Exception as e: - pass - - def cleanup_run_when_starting_failed(self): - logging.info("Cleanup run successfully when starting failed.") - - self.mlops_metrics.broadcast_server_training_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, - is_from_model=True, edge_id=self.edge_id) - - try: - self.mlops_metrics.stop_sys_perf() - except Exception as ex: - pass - - time.sleep(1) - - try: - local_package_path = ServerConstants.get_package_download_dir() - for package_file in listdir(local_package_path): - if os.path.basename(package_file).startswith("run_" + str(self.run_id)): - shutil.rmtree(os.path.join(local_package_path, package_file), ignore_errors=True) - except Exception as e: - pass - - def cleanup_run_when_deploy_failed(self): - topic = f"model_ops/model_device/delete_deployment/{self.edge_id}" - self.callback_delete_deployment(topic, payload=json.dumps(self.request_json)) - - def callback_deployment_result_message(self, topic=None, payload=None): - """ - This method is called when a deployment result is received from a worker device. 
- """ - # Save deployment result to local cache - topic_splits = str(topic).split('/') - device_id = topic_splits[-1] - payload_json = json.loads(payload) - end_point_id = payload_json["end_point_id"] - end_point_name = payload_json["end_point_name"] - model_id = payload_json["model_id"] - model_name = payload_json["model_name"] - model_version = payload_json["model_version"] - model_status = payload_json["model_status"] - replica_no = payload_json.get("replica_no", None) # "no" Idx start from 1 - run_id_str = str(end_point_id) - - # HotFix(Raphael): logging service cross talk - # Change the handler since each handler need to write to different log files - try: - # Remove the existing file handler - root_logger = logging.getLogger() - for handler in root_logger.handlers: - if isinstance(handler, logging.FileHandler): - root_logger.removeHandler(handler) - - # Correct log path: ~/.fedml/fedml-model-server/fedml/logs/fedml-run-$rid-edge-$eid.log - log_file = os.path.join(ServerConstants.get_log_file_dir(), - f"fedml-run-{run_id_str}-edge-{self.edge_id}.log") - - filehandler = logging.FileHandler(log_file, "a") - - program_prefix = "FedML-Server @device-id-{}".format(self.edge_id) - formatter = MLOpsFormatter(fmt="[" + program_prefix + "] [%(asctime)s] [%(levelname)s] " - "[%(filename)s:%(lineno)d:%(funcName)s] %(" - "message)s") - - filehandler.setFormatter(formatter) - root_logger.addHandler(filehandler) - except Exception as e: - logging.warning(f"Failed to change the logging handler due to {e}.") - - assert run_id_str in self.model_runner_mapping, (f"Run id {run_id_str} is not in the model runner mapping." - f"Current mapping {self.model_runner_mapping}.") - - logging.info("========== callback_deployment_result_message ==========\n") - # Identify the operation for this run (add, remove, update) - if run_id_str not in self.running_request_json: - logging.error(f"Run id {run_id_str} is not in the running request json.") - return - - # The rolling update and scale out / in operation should not happen at the same time - assert not ("replica_num_diff" in self.running_request_json[run_id_str] and - len(self.running_request_json[run_id_str]["replica_num_diff"]) > 0 and - "replica_version_diff" in self.running_request_json[run_id_str]) - - if "replica_version_diff" in self.running_request_json[run_id_str]: - run_operation = "UPDATE" - elif "replica_num_diff" in self.running_request_json[run_id_str] and \ - len(self.running_request_json[run_id_str]["replica_num_diff"]) > 0: - run_operation = "ADD_OR_REMOVE" - else: - logging.error(f"Unsupported operation for run id {run_id_str}. 
and request json " - f"{self.running_request_json[run_id_str]}") - return - - logging.info(f"End point {end_point_id}; Device {device_id}; replica {replica_no}; " - f"run_operation {run_operation} model status {model_status}.") - - # OPTIONAL DEBUG PARAMS - # this_run_controller = self.model_runner_mapping[run_id_str].replica_controller - # logging.info(f"The current replica controller state is " - # f"Total version diff num {this_run_controller.total_replica_version_diff_num}") - # logging.info(f"self.request_json now {self.request_json}") # request_json will be deprecated - # this_run_request_json = self.running_request_json.get(run_id_str, None) - # logging.info(f"self.running_request_json now {this_run_request_json}") - - # Set redis + sqlite deployment result - FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) - - # Deal with different model status - if model_status == ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DELETED: - # remove - FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ - delete_deployment_result_with_device_id_and_replica_no( - end_point_id, end_point_name, model_name, device_id, replica_no) - elif model_status == ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED: - # add or update or update-failed-rollback - FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ - set_deployment_result(end_point_id, end_point_name, - model_name, model_version, - device_id, payload, replica_no) - - # Note: To display the result in the UI, we need to save successful deployment result to the database - self.model_runner_mapping[run_id_str].deployed_replica_payload = copy.deepcopy(payload_json) - else: - if model_status != ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED: - logging.error(f"Unsupported model status {model_status}.") - - # Avoid endless loop, if the rollback also failed, we should report the failure to the MLOps - if self.model_runner_mapping[run_id_str].replica_controller.under_rollback: - self.send_deployment_status( - end_point_id, end_point_name, payload_json["model_name"], "", - ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED) - return - - # Failure handler, send the rollback message to the worker devices only if it has not been rollback - if run_operation == "ADD_OR_REMOVE": - # During Scale out / in, - # the worker that already been scaled out / in should be sent the rollback message - rollback_dict = self.model_runner_mapping[run_id_str].replica_controller.rollback_add_or_remove_replica( - device_id=device_id, replica_no=replica_no, op_type=run_operation - ) - self.model_runner_mapping[run_id_str].replica_controller.under_rollback = True - - if rollback_dict is not None and len(rollback_dict) > 0: - self.send_deployment_status( - end_point_id, end_point_name, payload_json["model_name"], "", - ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_ABORTING) - self.send_rollback_add_remove_op(run_id_str, rollback_dict) - return - else: - pass # This is the last worker that failed, so we should continue to "ABORTED" status - elif run_operation == "UPDATE": - # Overwrite the json with the rollback version diff - rollback_version_diff = \ - self.model_runner_mapping[run_id_str].replica_controller.rollback_get_replica_version_diff( - device_id_trigger=device_id, replica_no_trigger=replica_no) - - # Change the target version to the start version - self.model_runner_mapping[run_id_str].replica_controller.rollback_setback_target_replica_version() - - 
self.running_request_json[run_id_str]["replica_version_diff"] = copy.deepcopy(rollback_version_diff) - - # Send the rollback message to the worker devices - self.send_rollback_msg(run_id_str) - - # Set the deployment status to ABORTING - self.send_deployment_status( - end_point_id, end_point_name, payload_json["model_name"], "", - ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_ABORTING) - - # TODO(Raphael): Check if resource left not cleaned up - return - else: - logging.error(f"Unsupported operation {run_operation}.") - return - - # Move to the next state (rolling update, finish the deployment, etc.) - # Notify the replica number controller - (self.model_runner_mapping[run_id_str]. - replica_controller.callback_update_curr_replica_num_state(device_id, replica_no, model_status)) - - # Notify the replica version controller, which might trigger the next rolling update - self.send_next_scroll_update_msg(run_id_str, device_id, replica_no) - - # Update the global deployment result mapping - if run_id_str not in self.slave_deployment_results_mapping: - self.slave_deployment_results_mapping[run_id_str] = dict() - if str(device_id) not in self.slave_deployment_results_mapping[run_id_str]: - self.slave_deployment_results_mapping[run_id_str][str(device_id)] = dict() - self.slave_deployment_results_mapping[run_id_str][str(device_id)][str(replica_no)] = model_status - - logging.info("callback_deployment_result_message: topic {}, payload {}, result mapping {}.".format( - topic, payload, self.slave_deployment_results_mapping[run_id_str])) - - request_json = self.running_request_json.get(run_id_str, None) - if request_json is None: - logging.error(f"The endpoint {end_point_id} is no longer running.") - self.send_deployment_status( - end_point_id, end_point_name, payload_json["model_name"], "", - ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED) - return - - # Wait for all replica-level's result, not device-level - if (self.model_runner_mapping[run_id_str].replica_controller.is_all_replica_num_reconciled() and - self.model_runner_mapping[run_id_str].replica_controller.is_all_replica_version_reconciled()): - """ - When all the devices have finished the add / delete / update operation - """ - # Generate one unified inference api - # Note that here we use the gateway port instead of the inference port that is used by the slave device - model_config_parameters = request_json["parameters"] - inference_port = model_config_parameters.get("server_internal_port", - ServerConstants.MODEL_INFERENCE_DEFAULT_PORT) - inference_port_external = model_config_parameters.get("server_external_port", inference_port) - ip = self.get_ip_address(request_json) - - if ip.startswith("http://") or ip.startswith("https://"): - model_inference_url = "{}/inference/{}".format(ip, end_point_id) - else: - model_inference_url = "http://{}:{}/inference/{}".format(ip, inference_port_external, end_point_id) - - # Send stage: MODEL_DEPLOYMENT_STAGE5 = "StartInferenceIngress" - self.send_deployment_stages(end_point_id, model_name, model_id, - model_inference_url, - ServerConstants.MODEL_DEPLOYMENT_STAGE5["index"], - ServerConstants.MODEL_DEPLOYMENT_STAGE5["text"], - "inference url: {}".format(model_inference_url)) - - # Send the result to MLOps - if self.model_runner_mapping[run_id_str].deployed_replica_payload is not None: - payload_json = self.model_runner_mapping[run_id_str].deployed_replica_payload - model_slave_url = payload_json["model_url"] - payload_json["model_url"] = model_inference_url - payload_json["port"] = 
inference_port_external - token = FedMLModelCache.get_instance(self.redis_addr, self.redis_port).get_end_point_token( - end_point_id, end_point_name, model_name) - - model_metadata = payload_json["model_metadata"] - model_inputs = model_metadata["inputs"] - ret_inputs = list() - if "type" in model_metadata and model_metadata["type"] == "default": - payload_json["input_json"] = {"end_point_name": end_point_name, - "model_name": model_name, - "token": str(token), - "inputs": model_inputs, - "outputs": []} - payload_json["output_json"] = model_metadata["outputs"] - else: - raise Exception(f"Unsupported model metadata type {model_metadata['type']}") - - self.send_deployment_results_with_payload( - end_point_id, end_point_name, payload_json, - self.model_runner_mapping[run_id_str].replica_controller.target_replica_ids) - - payload_json_saved = payload_json - payload_json_saved["model_slave_url"] = model_slave_url - FedMLServerDataInterface.get_instance().save_job_result(end_point_id, self.edge_id, - json.dumps(payload_json_saved)) - else: - # Arrive here because only contains remove ops, so we do not need to update the model metadata - pass - - # For auto-scaling, should update the state to "DEPLOYED" - FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ - update_user_setting_replica_num(end_point_id=end_point_id, state="DEPLOYED") - - if self.model_runner_mapping[run_id_str].replica_controller.under_rollback: - # If first time failed (Still might need rollback), then send failed message to the MLOps - if not (FedMLModelCache.get_instance(self.redis_addr, self.redis_port). - get_end_point_activation(end_point_id)): - self.send_deployment_status( - end_point_id, end_point_name, payload_json["model_name"], "", - ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED) - else: - self.send_deployment_status(end_point_id, end_point_name, - payload_json["model_name"], - model_inference_url, - ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_ABORTED) - self.model_runner_mapping[run_id_str].replica_controller.under_rollback = False - else: - # Set the end point activation status to True, for scaling out / in and rolling update - FedMLModelCache.get_instance(self.redis_addr, self.redis_port). 
\ - set_end_point_activation(end_point_id, end_point_name, True) - - self.send_deployment_status(end_point_id, end_point_name, - payload_json["model_name"], - model_inference_url, - ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED) - - self.slave_deployment_results_mapping[run_id_str] = dict() - - time.sleep(3) - self.set_runner_completed_event(end_point_id) - - def callback_deployment_status_message(self, topic=None, payload=None): - # [Deprecated] Merge the logic into callback_deployment_result_message - logging.info("[Deprecated] callback_deployment_status_message: topic {}, payload {}.".format( - topic, payload)) - pass - - def send_deployment_start_request_to_edges(self, in_request_json=None): - if in_request_json is not None: - self.request_json = in_request_json - - # Iterate through replica_num_diff, both add and replace should be sent to the edge devices - if "replica_num_diff" not in self.request_json or self.request_json["replica_num_diff"] is None: - return [] - - edge_id_list = [] - for device_id in self.request_json["replica_num_diff"].keys(): - edge_id_list.append(device_id) - - self.request_json["master_node_ip"] = self.get_ip_address(self.request_json) - should_added_devices = [] - for edge_id in edge_id_list: - if edge_id == self.edge_id: - continue - should_added_devices.append(edge_id) - # send start deployment request to each device - self.send_deployment_start_request_to_edge(edge_id, self.request_json) - return should_added_devices - - def send_deployment_start_request_to_edge(self, edge_id, res_json): - topic_start_deployment = "model_ops/model_device/start_deployment/{}".format(str(edge_id)) - logging.info("start_deployment: send topic " + topic_start_deployment + f" to client {edge_id}...") - self.client_mqtt_mgr.send_message_json(topic_start_deployment, json.dumps(res_json)) - - def get_ip_address(self, request_json): - # OPTION 1: Use local ip - ip = ServerConstants.get_local_ip() - - # OPTION 2: Auto detect public ip - if "parameters" in request_json and \ - ServerConstants.AUTO_DETECT_PUBLIC_IP in request_json["parameters"] and \ - request_json["parameters"][ServerConstants.AUTO_DETECT_PUBLIC_IP]: - ip = ServerConstants.get_public_ip() - - # OPTION 3: Use user indicated ip - if self.infer_host is not None and self.infer_host != "127.0.0.1" and self.infer_host != "localhost": - ip = self.infer_host - - return ip - - def send_deployment_delete_request_to_edges(self, payload, model_msg_object): - edge_id_list_to_delete = model_msg_object.device_ids - - # Remove the model master node id from the list using index 0 - edge_id_list_to_delete = edge_id_list_to_delete[1:] - - logging.info("Device ids to be deleted: " + str(edge_id_list_to_delete)) - - for edge_id in edge_id_list_to_delete: - if edge_id == self.edge_id: - continue - # send delete deployment request to each model device - topic_delete_deployment = "model_ops/model_device/delete_deployment/{}".format(str(edge_id)) - logging.info("delete_deployment: send topic " + topic_delete_deployment + " to client...") - self.client_mqtt_mgr.send_message_json(topic_delete_deployment, payload) - - def ota_upgrade(self, payload, request_json): - run_id = request_json["end_point_id"] - force_ota = False - ota_version = None - - try: - parameters = request_json.get("parameters", None) - common_args = parameters.get("common_args", None) - force_ota = common_args.get("force_ota", False) - ota_version = common_args.get("ota_version", None) - except Exception as e: - pass - - if force_ota and ota_version is not None: - 
should_upgrade = True if ota_version != fedml.__version__ else False - upgrade_version = ota_version - else: - try: - fedml_is_latest_version, local_ver, remote_ver = sys_utils.check_fedml_is_latest_version(self.version) - except Exception as e: - return - - should_upgrade = False if fedml_is_latest_version else True - upgrade_version = remote_ver - - if should_upgrade: - job_obj = FedMLServerDataInterface.get_instance().get_job_by_id(run_id) - if job_obj is None: - FedMLServerDataInterface.get_instance(). \ - save_started_job(run_id, self.edge_id, time.time(), - ServerConstants.MSG_MLOPS_SERVER_STATUS_UPGRADING, - ServerConstants.MSG_MLOPS_SERVER_STATUS_UPGRADING, - payload) - - logging.info(f"Upgrade to version {upgrade_version} ...") - - sys_utils.do_upgrade(self.version, upgrade_version) - - raise Exception("Restarting after upgraded...") - - def callback_start_deployment(self, topic, payload): - try: - MLOpsConfigs.fetch_all_configs() - except Exception as e: - pass - - # Get deployment params - request_json = json.loads(payload) - run_id = request_json["end_point_id"] - end_point_name = request_json["end_point_name"] - token = request_json["token"] - user_id = request_json["user_id"] - user_name = request_json["user_name"] - device_ids = request_json["device_ids"] - device_objs = request_json["device_objs"] - - model_config = request_json["model_config"] - model_name = model_config["model_name"] - model_version = model_config["model_version"] - model_id = model_config["model_id"] - model_storage_url = model_config["model_storage_url"] - scale_min = model_config.get("instance_scale_min", 0) - scale_max = model_config.get("instance_scale_max", 0) - inference_engine = model_config.get("inference_engine", 0) - enable_auto_scaling = request_json.get("enable_auto_scaling", False) - desired_replica_num = request_json.get("desired_replica_num", 1) - - target_queries_per_replica = request_json.get("target_queries_per_replica", 10) - aggregation_window_size_seconds = request_json.get("aggregation_window_size_seconds", 60) - scale_down_delay_seconds = request_json.get("scale_down_delay_seconds", 120) - - inference_end_point_id = run_id - - logging.info("[Master] received start deployment request for end point {}.".format(run_id)) - - # Set redis config - FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) - - # Save the user setting (about replica number) of this run to Redis, if existed, update it - FedMLModelCache.get_instance(self.redis_addr, self.redis_port).set_user_setting_replica_num( - end_point_id=run_id, end_point_name=end_point_name, model_name=model_name, model_version=model_version, - replica_num=desired_replica_num, enable_auto_scaling=enable_auto_scaling, - scale_min=scale_min, scale_max=scale_max, state="DEPLOYING", - aggregation_window_size_seconds=aggregation_window_size_seconds, - target_queries_per_replica=target_queries_per_replica, - scale_down_delay_seconds=int(scale_down_delay_seconds) - ) - - # Start log processor for current run - self.args.run_id = run_id - self.args.edge_id = self.edge_id - MLOpsRuntimeLog(args=self.args).init_logs() - MLOpsRuntimeLogDaemon.get_instance(self.args).set_log_source( - ServerConstants.FEDML_LOG_SOURCE_TYPE_MODEL_END_POINT) - MLOpsRuntimeLogDaemon.get_instance(self.args).start_log_processor(run_id, self.edge_id) - - # # Deprecated - # self.ota_upgrade(payload, request_json) - - # Add additional parameters to the request_json - run_id = inference_end_point_id - self.args.run_id = run_id - 
self.run_id = run_id - request_json["run_id"] = run_id - self.request_json = request_json - run_id_str = str(run_id) - self.running_request_json[run_id_str] = request_json - self.request_json["master_node_ip"] = self.get_ip_address(self.request_json) - - # Set the target status of the devices to redis - FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ - set_end_point_device_info(request_json["end_point_id"], end_point_name, json.dumps(device_objs)) - - # Setup Token - usr_indicated_token = self.get_usr_indicated_token(request_json) - if usr_indicated_token != "": - logging.info(f"Change Token from{token} to {usr_indicated_token}") - token = usr_indicated_token - FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ - set_end_point_token(run_id, end_point_name, model_name, token) - - self.subscribe_slave_devices_message(request_json) - - # Report stage to mlops: MODEL_DEPLOYMENT_STAGE1 = "Received" - self.send_deployment_stages(self.run_id, model_name, model_id, - "", - ServerConstants.MODEL_DEPLOYMENT_STAGE1["index"], - ServerConstants.MODEL_DEPLOYMENT_STAGE1["text"], - "Received request for endpoint {}".format(run_id)) - - # Report stage to mlops: MODEL_DEPLOYMENT_STAGE2 = "Initializing" - self.send_deployment_stages(self.run_id, model_name, model_id, - "", - ServerConstants.MODEL_DEPLOYMENT_STAGE2["index"], - ServerConstants.MODEL_DEPLOYMENT_STAGE2["text"], - ServerConstants.MODEL_DEPLOYMENT_STAGE2["text"]) - - ServerConstants.save_runner_infos(self.args.device_id + "." + self.args.os_name, self.edge_id, run_id=run_id) - - if self.run_as_edge_server_and_agent: - # Replica Controller is per deployment - replica_controller = FedMLDeviceReplicaController(self.edge_id, self.request_json) - - # Prepare num diff - new_request_with_num_diff = replica_controller.generate_diff_to_request_json() - self.running_request_json[run_id_str] = new_request_with_num_diff - request_json = new_request_with_num_diff - - # Listen to extra worker topics, especially when worker's replica remove to zero, - # In this case, currently Java will NOT send those worker ids to the master, but still need to listen to it. 
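Before the request is forwarded, the replica controller attaches a per-device `replica_num_diff`; the check that immediately follows subscribes to workers whose replica count is scaled down to zero. A minimal sketch of that structure and check, with placeholder device ids:

```python
# replica_num_diff is keyed by device id; each entry follows the shape noted
# in the code below: {"op": ..., "curr_num": ..., "target_num": ...}.
request_json = {
    "replica_num_diff": {
        "201": {"op": "add", "curr_num": 1, "target_num": 2},
        "202": {"op": "remove", "curr_num": 1, "target_num": 0},
    }
}

# Workers whose replicas drop to zero may not be included in later messages,
# so the master still subscribes to their result topics explicitly.
devices_needing_extra_subscription = [
    device_id
    for device_id, diff in request_json.get("replica_num_diff", {}).items()
    if diff["op"] == "remove" and diff["target_num"] == 0
]

print(devices_needing_extra_subscription)  # -> ['202']
```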
- if "replica_num_diff" in request_json and len(request_json["replica_num_diff"]) > 0: - for device_id in request_json["replica_num_diff"].keys(): - # {"op": "remove", "curr_num": 1, "target_num": 0} - if request_json["replica_num_diff"][device_id]["op"] == "remove" and \ - request_json["replica_num_diff"][device_id]["target_num"] == 0: - self.subscribe_spec_device_message(run_id, device_id) - - # Prepare version diff - new_request_with_version_diff = replica_controller.init_first_update_device_replica_mapping() - self.running_request_json[run_id_str] = new_request_with_version_diff - request_json = new_request_with_version_diff - - # Init the model runner - server_runner = FedMLServerRunner( - self.args, run_id=run_id, request_json=request_json, agent_config=self.agent_config - ) - server_runner.run_as_edge_server_and_agent = self.run_as_edge_server_and_agent - server_runner.edge_id = self.edge_id - server_runner.infer_host = self.infer_host - server_runner.redis_addr = self.redis_addr - server_runner.redis_port = self.redis_port - server_runner.redis_password = self.redis_password - server_runner.replica_controller = replica_controller - - logging.info(f"[Master] new request for id {run_id_str}") - logging.info(f"[Master] model runner mapping before: {self.model_runner_mapping.items()}") - - self.run_process_event_map[run_id_str] = multiprocessing.Event() - self.run_process_event_map[run_id_str].clear() - server_runner.run_process_event = self.run_process_event_map[run_id_str] - self.run_process_completed_event_map[run_id_str] = multiprocessing.Event() - self.run_process_completed_event_map[run_id_str].clear() - server_runner.run_process_completed_event = self.run_process_completed_event_map[run_id_str] - self.model_runner_mapping[run_id_str] = server_runner - - logging.info(f"[Master] model runner mapping after: {self.model_runner_mapping.items()}") - - # This subprocess will copy the server_runner and run it, but they are not the same object - server_process = Process(target=server_runner.run, args=( - self.run_process_event_map[run_id_str], self.run_process_completed_event_map[run_id_str] - )) - server_process.start() - ServerConstants.save_run_process(run_id, server_process.pid) - - # Send stage: MODEL_DEPLOYMENT_STAGE3 = "StartRunner" - self.send_deployment_stages(self.run_id, model_name, model_id, - "", - ServerConstants.MODEL_DEPLOYMENT_STAGE3["index"], - ServerConstants.MODEL_DEPLOYMENT_STAGE3["text"], - ServerConstants.MODEL_DEPLOYMENT_STAGE3["text"]) - - def send_first_scroll_update_msg(self): - """ - Replica-level rolling update. 
- Delete the record of the replaced device and send the deployment msg to the devices - """ - if "replica_version_diff" not in self.request_json or self.request_json["replica_version_diff"] is None: - return [] - - first_chunk_dict = self.request_json["replica_version_diff"] - - # Delete the record of the replaced device - self.delete_device_replica_info_on_master( - self.request_json["end_point_id"], self.request_json["end_point_name"], - self.request_json["model_config"]["model_name"], first_chunk_dict) - - logging.info(f"Send the first scroll update msg to the device {first_chunk_dict} ") - - # Send the deployment msg to the devices, (we reuse the start_deployment msg) - for edge_id in first_chunk_dict.keys(): - if edge_id == self.edge_id: - continue - # send start deployment request to each device - self.send_deployment_start_request_to_edge(edge_id, self.request_json) - return list(first_chunk_dict.keys()) - - def send_rollback_msg(self, run_id_str): - # Avoid using the old request_json - self.delete_device_replica_info_on_master( - self.running_request_json[run_id_str]["end_point_id"], - self.running_request_json[run_id_str]["end_point_name"], - self.running_request_json[run_id_str]["model_config"]["model_name"], - self.running_request_json[run_id_str]["replica_version_diff"]) - - # Send the deployment msg to the devices, (we reuse the start_deployment msg) - for edge_id in self.running_request_json[run_id_str]["replica_version_diff"].keys(): - if edge_id == self.edge_id: - continue - # send start deployment request to each device - self.send_deployment_start_request_to_edge(edge_id, self.running_request_json[run_id_str]) - - def delete_device_replica_info_on_master(self, endpoint_id, endpoint_name, model_name, edge_id_replica_no_dict): - FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) - # Remove the record of the replaced device - # [Deprecated] deployment status & device info - # Delete the result in deployment result list in Redis / SQLite - device_result_list = FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ - get_deployment_result_list(endpoint_id, endpoint_name, model_name) - - delete_device_result_list = [] - for device_result in device_result_list: - device_result_dict = json.loads(device_result) - if (str(device_result_dict["cache_device_id"]) in edge_id_replica_no_dict.keys() and - str(device_result_dict["cache_replica_no"]) in - edge_id_replica_no_dict[str(device_result_dict["cache_device_id"])]): - delete_device_result_list.append(device_result) - - for delete_item in delete_device_result_list: - FedMLModelCache.get_instance(self.redis_addr, self.redis_port).delete_deployment_result( - delete_item, endpoint_id, endpoint_name, model_name - ) - - logging.info(f"Deleted the replica record on master: {edge_id_replica_no_dict}") - - def send_next_scroll_update_msg(self, run_id_str, device_id, replica_no): - """ - Send the next scroll update msg to the devices if needed. - If there is no need for the next scroll update, directly return. 
- """ - if replica_no is None: - return - - replica_controller = self.model_runner_mapping[run_id_str].replica_controller - - if replica_controller.total_replica_version_diff_num == 0: - return - - if replica_controller.under_rollback: - replica_controller.intermediate_replica_version[device_id][replica_no] = replica_controller.start_version - return - - logging.info(f"Curr updating window: {replica_controller.curr_replica_updating_window} " - f"Curr version diff num: {replica_controller.total_replica_version_diff_num}") - - replica_controller.callback_update_updating_window(device_id, replica_no) - - # Decide whether to send the next scroll update - next_chunk_dict = replica_controller.get_next_chunk_devices_replica() - - if next_chunk_dict: - logging.info(f"The next scroll update for end point {run_id_str} is {next_chunk_dict}") - # Update curr updating window - replica_controller.curr_replica_updating_window = copy.deepcopy(next_chunk_dict) - - # Use global deployment result mapping to decide whether to send the next scroll update - self.running_request_json[run_id_str]["replica_version_diff"] = next_chunk_dict - - # Avoid using the old request_json - self.delete_device_replica_info_on_master( - self.running_request_json[run_id_str]["end_point_id"], - self.running_request_json[run_id_str]["end_point_name"], - self.running_request_json[run_id_str]["model_config"]["model_name"], - next_chunk_dict) - - # Send the deployment msg to the devices, (we reuse the start_deployment msg) - for edge_id in next_chunk_dict.keys(): - if edge_id == self.edge_id: - continue - # send start deployment request to each device - self.send_deployment_start_request_to_edge(edge_id, self.running_request_json[run_id_str]) - return - - def send_rollback_add_remove_op(self, run_id, rollback_replica_dict): - """ - This method is used when the original add op failed, we need to rollback by delete the existed replicas - Input example: - rollback_replica_dict = {'96684': {'curr_num': 2, 'op': 'remove', 'target_num': 1}} - """ - existed_request_json = self.running_request_json[str(run_id)] - updated_request_json = copy.deepcopy(existed_request_json) - - # Reverse the replica_num_diff - updated_request_json["replica_num_diff"] = rollback_replica_dict - - self.send_deployment_start_request_to_edges(in_request_json=updated_request_json) - - def callback_activate_deployment(self, topic, payload): - logging.info("callback_activate_deployment: topic = %s, payload = %s" % (topic, payload)) - - # Parse payload as the model message object. - model_msg_object = FedMLModelMsgObject(topic, payload) - - # Get the previous deployment status. - FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) - endpoint_status = FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ - get_end_point_status(model_msg_object.inference_end_point_id) - if endpoint_status != ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED: - return - - # Set end point as activated status - FedMLModelCache.get_instance(self.redis_addr, self.redis_port).set_end_point_activation( - model_msg_object.inference_end_point_id, model_msg_object.end_point_name, True) - - def callback_deactivate_deployment(self, topic, payload): - logging.info("callback_deactivate_deployment: topic = %s, payload = %s" % (topic, payload)) - - # Parse payload as the model message object. 
- model_msg_object = FedMLModelMsgObject(topic, payload) - - # Get the endpoint status - FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) - endpoint_status = FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ - get_end_point_status(model_msg_object.inference_end_point_id) - if endpoint_status != ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED: - return - - # Set end point as deactivated status - FedMLModelCache.get_instance(self.redis_addr, self.redis_port).set_end_point_activation( - model_msg_object.inference_end_point_id, model_msg_object.model_name, False) - - def set_runner_stopped_event(self, run_id): - run_id_str = str(run_id) - server_runner = self.model_runner_mapping.get(run_id_str, None) - if server_runner is not None: - if server_runner.run_process_event is not None: - server_runner.run_process_event.set() - self.model_runner_mapping.pop(run_id_str) - - def set_runner_completed_event(self, run_id): - run_id_str = str(run_id) - server_runner = self.model_runner_mapping.get(run_id_str, None) - if server_runner is not None: - if server_runner.run_process_completed_event is not None: - server_runner.run_process_completed_event.set() - self.model_runner_mapping.pop(run_id_str) - - def callback_delete_deployment(self, topic, payload): - logging.info("[Master] callback_delete_deployment") - # Parse payload as the model message object. - model_msg_object = FedMLModelMsgObject(topic, payload) - - # Delete SQLite records - FedMLServerDataInterface.get_instance().delete_job_from_db(model_msg_object.run_id) - FedMLModelDatabase.get_instance().delete_deployment_result( - model_msg_object.run_id, model_msg_object.end_point_name, model_msg_object.model_name, - model_version=model_msg_object.model_version) - FedMLModelDatabase.get_instance().delete_deployment_run_info( - end_point_id=model_msg_object.inference_end_point_id) - - # Delete Redis Records - FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) - FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ - set_end_point_activation(model_msg_object.inference_end_point_id, - model_msg_object.end_point_name, False) - FedMLModelCache.get_instance(self.redis_addr, self.redis_port). 
\ - delete_end_point(model_msg_object.inference_end_point_id, model_msg_object.end_point_name, - model_msg_object.model_name, model_msg_object.model_version) - - # Send delete deployment request to the edge devices - self.send_deployment_delete_request_to_edges(payload, model_msg_object) - - # Stop processes on master - self.set_runner_stopped_event(model_msg_object.run_id) - self.stop_device_inference_monitor(model_msg_object.run_id, model_msg_object.end_point_name, - model_msg_object.model_id, model_msg_object.model_name, - model_msg_object.model_version) - - def send_deployment_results_with_payload(self, end_point_id, end_point_name, payload, replica_id_list=None): - self.send_deployment_results(end_point_id, end_point_name, - payload["model_name"], payload["model_url"], - payload["model_version"], payload["port"], - payload["inference_engine"], - payload["model_metadata"], - payload["model_config"], - payload["input_json"], - payload["output_json"], - replica_id_list=replica_id_list) - - def send_deployment_results(self, end_point_id, end_point_name, - model_name, model_inference_url, - model_version, inference_port, inference_engine, - model_metadata, model_config, input_json, output_json, replica_id_list=None): - deployment_results_topic_prefix = "model_ops/model_device/return_deployment_result" - deployment_results_topic = "{}/{}".format(deployment_results_topic_prefix, end_point_id) - deployment_results_payload = {"end_point_id": end_point_id, "end_point_name": end_point_name, - "model_name": model_name, "model_url": model_inference_url, - "version": model_version, "port": inference_port, - "inference_engine": inference_engine, - "model_metadata": model_metadata, - "model_config": model_config, - "input_json": input_json, - "output_json": output_json, - "timestamp": int(format(time.time_ns() / 1000.0, '.0f')), - "replica_ids": replica_id_list} - logging.info(f"[Master] deployment_results_payload is sent to mlops: {deployment_results_payload}") - - self.client_mqtt_mgr.send_message_json(deployment_results_topic, json.dumps(deployment_results_payload)) - self.client_mqtt_mgr.send_message_json(deployment_results_topic_prefix, json.dumps(deployment_results_payload)) - - def send_deployment_status(self, end_point_id, end_point_name, model_name, model_inference_url, model_status): - deployment_status_topic_prefix = "model_ops/model_device/return_deployment_status" - deployment_status_topic = "{}/{}".format(deployment_status_topic_prefix, end_point_id) - deployment_status_payload = {"end_point_id": end_point_id, "end_point_name": end_point_name, - "model_name": model_name, - "model_url": model_inference_url, - "model_status": model_status, - "timestamp": int(format(time.time_ns() / 1000.0, '.0f'))} - logging.info(f"[Master] deployment_status_payload is sent to mlops: {deployment_status_payload}") - - self.client_mqtt_mgr.send_message_json(deployment_status_topic, json.dumps(deployment_status_payload)) - self.client_mqtt_mgr.send_message_json(deployment_status_topic_prefix, json.dumps(deployment_status_payload)) - - def send_deployment_stages(self, end_point_id, model_name, model_id, model_inference_url, - model_stages_index, model_stages_title, model_stage_detail): - deployment_stages_topic_prefix = "model_ops/model_device/return_deployment_stages" - deployment_stages_topic = "{}/{}".format(deployment_stages_topic_prefix, end_point_id) - deployment_stages_payload = {"model_name": model_name, - "model_id": model_id, - "model_url": model_inference_url, - "end_point_id": end_point_id, - 
"model_stage_index": model_stages_index, - "model_stage_title": model_stages_title, - "model_stage_detail": model_stage_detail, - "timestamp": int(format(time.time_ns() / 1000.0, '.0f'))} - - self.client_mqtt_mgr.send_message_json(deployment_stages_topic, json.dumps(deployment_stages_payload)) - self.client_mqtt_mgr.send_message_json(deployment_stages_topic_prefix, json.dumps(deployment_stages_payload)) - - logging.info(f"-------- Stages has been sent to mlops with stage {model_stages_index} and " - f"payload {deployment_stages_payload}") - time.sleep(2) - - def on_client_mqtt_disconnected(self, mqtt_client_object): - if self.client_mqtt_lock is None: - self.client_mqtt_lock = threading.Lock() - - self.client_mqtt_lock.acquire() - self.client_mqtt_is_connected = False - self.client_mqtt_lock.release() - - logging.info("on_client_mqtt_disconnected: {}.".format(self.client_mqtt_is_connected)) - - def on_client_mqtt_connected(self, mqtt_client_object): - if self.mlops_metrics is None: - self.mlops_metrics = MLOpsMetrics() - - self.mlops_metrics.set_messenger(self.client_mqtt_mgr) - self.mlops_metrics.run_id = self.run_id - self.mlops_metrics.edge_id = self.edge_id - self.mlops_metrics.server_agent_id = self.server_agent_id - - if self.client_mqtt_lock is None: - self.client_mqtt_lock = threading.Lock() - - self.client_mqtt_lock.acquire() - self.client_mqtt_is_connected = True - self.client_mqtt_lock.release() - - # logging.info("on_client_mqtt_connected: {}.".format(self.client_mqtt_is_connected)) - - def setup_client_mqtt_mgr(self): - if self.client_mqtt_mgr is not None: - return - - if self.client_mqtt_lock is None: - self.client_mqtt_lock = threading.Lock() - - # logging.info( - # "server agent config: {},{}".format( - # self.agent_config["mqtt_config"]["BROKER_HOST"], self.agent_config["mqtt_config"]["BROKER_PORT"] - # ) - # ) - - self.client_mqtt_mgr = MqttManager( - self.agent_config["mqtt_config"]["BROKER_HOST"], - self.agent_config["mqtt_config"]["BROKER_PORT"], - self.agent_config["mqtt_config"]["MQTT_USER"], - self.agent_config["mqtt_config"]["MQTT_PWD"], - self.agent_config["mqtt_config"]["MQTT_KEEPALIVE"], - "FedML_ModelServerAgent_Metrics_@{}@_{}_{}_{}".format(self.user_name, self.args.current_device_id, - str(os.getpid()), - str(uuid.uuid4())) - ) - self.client_mqtt_mgr.add_connected_listener(self.on_client_mqtt_connected) - self.client_mqtt_mgr.add_disconnected_listener(self.on_client_mqtt_disconnected) - self.client_mqtt_mgr.connect() - self.client_mqtt_mgr.loop_start() - - if self.mlops_metrics is None: - self.mlops_metrics = MLOpsMetrics() - self.mlops_metrics.set_messenger(self.client_mqtt_mgr) - self.mlops_metrics.run_id = self.run_id - self.mlops_metrics.edge_id = self.edge_id - self.mlops_metrics.server_agent_id = self.server_agent_id - - def release_client_mqtt_mgr(self): - try: - if self.client_mqtt_mgr is not None: - self.client_mqtt_mgr.loop_stop() - self.client_mqtt_mgr.disconnect() - - self.client_mqtt_lock.acquire() - if self.client_mqtt_mgr is not None: - self.client_mqtt_is_connected = False - self.client_mqtt_mgr = None - self.client_mqtt_lock.release() - except Exception: - pass - - def send_deployment_stop_request_to_edges(self, edge_id_list, payload): - for edge_id in edge_id_list: - topic_stop_deployment = "model_ops/model_device/stop_deployment/{}".format(str(self.edge_id)) - logging.info("stop_deployment: send topic " + topic_stop_deployment) - self.client_mqtt_mgr.send_message_json(topic_stop_deployment, payload) - - def 
send_exit_train_with_exception_request_to_edges(self, edge_id_list, payload): - for edge_id in edge_id_list: - topic_exit_train = "flserver_agent/" + str(edge_id) + "/exit_train_with_exception" - logging.info("exit_train_with_exception: send topic " + topic_exit_train) - self.client_mqtt_mgr.send_message_json(topic_exit_train, payload) - - def exit_run_with_exception_entry(self): - try: - self.setup_client_mqtt_mgr() - self.exit_run_with_exception() - except Exception as e: - self.release_client_mqtt_mgr() - sys_utils.cleanup_all_fedml_server_login_processes( - ServerConstants.SERVER_LOGIN_PROGRAM, clean_process_group=False) - sys.exit(1) - finally: - self.release_client_mqtt_mgr() - - def exit_run_with_exception(self): - logging.info("Exit run successfully.") - - ServerConstants.cleanup_learning_process(self.run_id) - ServerConstants.cleanup_run_process(self.run_id) - - self.mlops_metrics.report_server_id_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id) - - time.sleep(1) - - def callback_exit_train_with_exception(self, topic, payload): - # logging.info("callback_exit_train_with_exception: topic = %s, payload = %s" % (topic, payload)) - - request_json = json.loads(payload) - is_retain = request_json.get("is_retain", False) - if is_retain: - return - run_id = request_json.get("runId", None) - if run_id is None: - run_id = request_json.get("run_id", None) - if run_id is None: - run_id = request_json.get("id", None) - - if run_id is None: - return - - edge_ids = request_json.get("edgeids", None) - - self.send_exit_train_with_exception_request_to_edges(edge_ids, payload) - - # Stop server with multiprocessing mode - self.request_json = request_json - server_runner = FedMLServerRunner( - self.args, edge_id=self.edge_id, request_json=request_json, agent_config=self.agent_config, run_id=run_id - ) - try: - Process(target=server_runner.exit_run_with_exception_entry).start() - except Exception as e: - pass - - def callback_client_exit_train_with_exception(self, topic, payload): - # logging.info("callback_client_exit_train_with_exception: topic = %s, payload = %s" % (topic, payload)) - - request_json = json.loads(payload) - run_id = request_json.get("run_id", None) - edge_id = request_json.get("edge_id", None) - if run_id is None: - logging.info("callback_client_exit_train_with_exception run id is none") - return - - job = FedMLServerDataInterface.get_instance().get_job_by_id(run_id) - if job is not None and job.running_json is not None and job.running_json != "": - job_json_obj = json.loads(job.running_json) - edge_ids = job_json_obj.get("edgeids", None) - - self.mlops_metrics.broadcast_server_training_status( - run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, - is_from_model=True, edge_id=edge_id) - - self.send_exit_train_with_exception_request_to_edges(edge_ids, job.running_json) - - self.exit_run_with_exception() - - def callback_runner_id_status(self, topic, payload): - logging.info("callback_runner_id_status: topic = %s, payload = %s" % (topic, payload)) - - request_json = json.loads(payload) - is_retain = request_json.get("is_retain", False) - if is_retain: - return - run_id = request_json["run_id"] - status = request_json["status"] - edge_id = request_json["edge_id"] - run_id_str = str(run_id) - - if ( - status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED - or status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED - ): - # Stop server with multiprocessing mode - stop_request_json = self.running_request_json.get(run_id_str, None) - 
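The exception callback above accepts several key names for the run id in the incoming payload; a tiny self-contained sketch of that lookup (the payloads are placeholders):

```python
import json

def extract_run_id(payload: str):
    """Try the alternative key names checked in callback_exit_train_with_exception
    ("runId", "run_id", "id"); return None if none of them is present."""
    request_json = json.loads(payload)
    for key in ("runId", "run_id", "id"):
        run_id = request_json.get(key)
        if run_id is not None:
            return run_id
    return None

print(extract_run_id(json.dumps({"runId": 987, "edgeids": [201, 202]})))  # -> 987
print(extract_run_id(json.dumps({"status": "FAILED"})))                   # -> None
```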
if stop_request_json is None: - stop_request_json = request_json - if self.run_as_edge_server_and_agent: - server_runner = FedMLServerRunner( - self.args, run_id=run_id, request_json=stop_request_json, agent_config=self.agent_config - ) - server_runner.edge_id = self.edge_id - server_runner.run_as_edge_server_and_agent = self.run_as_edge_server_and_agent - server_runner.run_status = status - status_process = Process(target=server_runner.cleanup_client_with_status) - status_process.start() - status_process.join(10) - - # Stop log processor for current run - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, self.edge_id) - - def cleanup_client_with_status(self): - if self.run_status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED: - logging.info("received to finished status.") - self.cleanup_run_when_finished() - elif self.run_status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED: - logging.info("received to failed status.") - self.cleanup_run_when_starting_failed() - - def callback_report_current_status(self, topic, payload): - request_json = json.loads(payload) - if self.run_as_edge_server_and_agent: - self.send_agent_active_msg() - elif self.run_as_cloud_agent: - self.send_agent_active_msg() - elif self.run_as_cloud_server: - pass - - @staticmethod - def process_ota_upgrade_msg(): - os.system("pip install -U fedml") - - def callback_server_ota_msg(self, topic, payload): - request_json = json.loads(payload) - cmd = request_json["cmd"] - - if cmd == ServerConstants.FEDML_OTA_CMD_UPGRADE: - try: - self.process_ota_upgrade_msg() - # Process(target=FedMLServerRunner.process_ota_upgrade_msg).start() - raise Exception("After upgraded, restart runner...") - except Exception as e: - pass - elif cmd == ServerConstants.FEDML_OTA_CMD_RESTART: - raise Exception("Restart runner...") - - @staticmethod - def get_device_id(): - device_file_path = os.path.join(ServerConstants.get_data_dir(), ServerConstants.LOCAL_RUNNER_INFO_DIR_NAME) - file_for_device_id = os.path.join(device_file_path, "devices.id") - if not os.path.exists(device_file_path): - os.makedirs(device_file_path) - elif os.path.exists(file_for_device_id): - with open(file_for_device_id, 'r', encoding='utf-8') as f: - device_id_from_file = f.readline() - if device_id_from_file is not None and device_id_from_file != "": - return device_id_from_file - - if platform.system() == "Darwin": - cmd_get_serial_num = "system_profiler SPHardwareDataType | grep Serial | awk '{gsub(/ /,\"\")}{print}' " \ - "|awk -F':' '{print $2}' " - device_id = os.popen(cmd_get_serial_num).read() - device_id = device_id.replace('\n', '').replace(' ', '') - if device_id is None or device_id == "": - device_id = hex(uuid.getnode()) - else: - device_id = "0x" + device_id - else: - if "nt" in os.name: - - def get_uuid(): - guid = "" - try: - cmd = "wmic csproduct get uuid" - guid = str(subprocess.check_output(cmd)) - pos1 = guid.find("\\n") + 2 - guid = guid[pos1:-15] - except Exception as ex: - pass - return str(guid) - - device_id = str(get_uuid()) - elif "posix" in os.name: - device_id = sys_utils.get_device_id_in_docker() - if device_id is None: - device_id = hex(uuid.getnode()) - else: - device_id = sys_utils.run_subprocess_open( - "hal-get-property --udi /org/freedesktop/Hal/devices/computer --key system.hardware.uuid".split() - ) - device_id = hex(device_id) - - if device_id is not None and device_id != "": - with open(file_for_device_id, 'w', encoding='utf-8') as f: - f.write(device_id) - else: - device_id = hex(uuid.uuid4()) - with 
open(file_for_device_id, 'w', encoding='utf-8') as f: - f.write(device_id) - - return device_id - - def bind_account_and_device_id(self, url, account_id, device_id, os_name): - role = ServerConstants.login_role_list[ServerConstants.LOGIN_MODE_ON_PREMISE_MASTER_INDEX] - if self.run_as_edge_server_and_agent: - role = ServerConstants.login_role_list[ServerConstants.LOGIN_MODE_ON_PREMISE_MASTER_INDEX] - elif self.run_as_cloud_agent: - role = ServerConstants.login_role_list[ServerConstants.LOGIN_MODE_FEDML_CLOUD_MASTER_INDEX] - elif self.run_as_cloud_server: - role = ServerConstants.login_role_list[ServerConstants.LOGIN_MODE_INFERENCE_INSTANCE_INDEX] - - ip = requests.get('https://checkip.amazonaws.com').text.strip() - fedml_ver, exec_path, os_ver, cpu_info, python_ver, torch_ver, mpi_installed, \ - cpu_usage, available_mem, total_mem, gpu_info, gpu_available_mem, gpu_total_mem, \ - gpu_count, gpu_vendor, cpu_count, gpu_device_name = get_sys_runner_info() - host_name = sys_utils.get_host_name() - json_params = { - "accountid": account_id, - "deviceid": device_id, - "type": os_name, - "state": ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE, - "status": ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE, - "processor": cpu_info, - "core_type": cpu_info, - "network": "", - "role": role, - "os_ver": os_ver, - "memory": total_mem, - "ip": ip, - "extra_infos": {"fedml_ver": fedml_ver, "exec_path": exec_path, "os_ver": os_ver, - "cpu_info": cpu_info, "python_ver": python_ver, "torch_ver": torch_ver, - "mpi_installed": mpi_installed, "cpu_usage": cpu_usage, - "available_mem": available_mem, "total_mem": total_mem, - "cpu_count": cpu_count, "gpu_count": 0, "host_name": host_name} - } - if gpu_count > 0: - if gpu_total_mem is not None: - json_params["gpu"] = gpu_info if gpu_info is not None else "" + ", Total GPU Memory: " + gpu_total_mem - else: - json_params["gpu"] = gpu_info if gpu_info is not None else "" - json_params["extra_infos"]["gpu_info"] = gpu_info if gpu_info is not None else "" - if gpu_available_mem is not None: - json_params["extra_infos"]["gpu_available_mem"] = gpu_available_mem - if gpu_total_mem is not None: - json_params["extra_infos"]["gpu_total_mem"] = gpu_total_mem - - json_params["extra_infos"]["gpu_count"] = gpu_count - json_params["extra_infos"]["gpu_vendor"] = gpu_vendor - json_params["extra_infos"]["gpu_device_name"] = gpu_device_name - - gpu_available_id_list = sys_utils.get_available_gpu_id_list(limit=gpu_count) - gpu_available_count = len(gpu_available_id_list) if gpu_available_id_list is not None else 0 - gpu_list = sys_utils.get_gpu_list() - json_params["extra_infos"]["gpu_available_count"] = gpu_available_count - json_params["extra_infos"]["gpu_available_id_list"] = gpu_available_id_list - json_params["extra_infos"]["gpu_list"] = gpu_list - else: - json_params["gpu"] = "None" - json_params["extra_infos"]["gpu_available_count"] = 0 - json_params["extra_infos"]["gpu_available_id_list"] = [] - json_params["extra_infos"]["gpu_list"] = [] - - _, cert_path = MLOpsConfigs.get_request_params() - if cert_path is not None: - try: - requests.session().verify = cert_path - response = requests.post( - url, json=json_params, verify=True, - headers={"content-type": "application/json", "Connection": "close"} - ) - except requests.exceptions.SSLError as err: - MLOpsConfigs.install_root_ca_file() - response = requests.post( - url, json=json_params, verify=True, - headers={"content-type": "application/json", "Connection": "close"} - ) - else: - response = requests.post(url, json=json_params, 
headers={"Connection": "close"}) - edge_id = -1 - user_name = None - extra_url = None - if response.status_code != 200: - print(f"Binding to MLOps with response.status_code = {response.status_code}, " - f"response.content: {response.content}") - pass - else: - # print("url = {}, response = {}".format(url, response)) - status_code = response.json().get("code") - if status_code == "SUCCESS": - edge_id = response.json().get("data").get("id") - user_name = response.json().get("data").get("userName", None) - extra_url = response.json().get("data").get("url", None) - if edge_id is None or edge_id <= 0: - print(f"Binding to MLOps with response.status_code = {response.status_code}, " - f"response.content: {response.content}") - else: - if status_code == SchedulerConstants.BINDING_ACCOUNT_NOT_EXIST_ERROR: - raise SystemExit(SchedulerConstants.BINDING_ACCOUNT_NOT_EXIST_ERROR) - print(f"Binding to MLOps with response.status_code = {response.status_code}, " - f"response.content: {response.content}") - return -1, None, None - return edge_id, user_name, extra_url - - def fetch_configs(self): - return MLOpsConfigs.fetch_all_configs() - - def send_agent_active_msg(self): - active_topic = "flserver_agent/active" - status = MLOpsStatus.get_instance().get_server_agent_status(self.edge_id) - if ( - status is not None - and status != ServerConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE - and status != ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE - ): - return - - status = ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE - active_msg = {"ID": self.edge_id, "status": status} - MLOpsStatus.get_instance().set_server_agent_status(self.edge_id, status) - self.mqtt_mgr.send_message_json(active_topic, json.dumps(active_msg)) - - def subscribe_slave_devices_message(self, request_json): - if request_json is None: - return - run_id = request_json["run_id"] - edge_id_list = request_json["device_ids"] - for edge_id in edge_id_list: - if str(edge_id) == str(self.edge_id): - continue - - # subscribe deployment result message for each model device - deployment_results_topic = "model_device/model_device/return_deployment_result/{}/{}".format( - run_id, edge_id) - - self.mqtt_mgr.add_message_listener(deployment_results_topic, self.callback_deployment_result_message) - self.mqtt_mgr.subscribe_msg(deployment_results_topic) - - def subscribe_spec_device_message(self, run_id, device_id): - if device_id == self.edge_id: - return - - # subscribe deployment result message for each model device - deployment_results_topic = "model_device/model_device/return_deployment_result/{}/{}".format( - run_id, device_id) - - self.mqtt_mgr.add_message_listener(deployment_results_topic, self.callback_deployment_result_message) - self.mqtt_mgr.subscribe_msg(deployment_results_topic) - - def on_agent_mqtt_connected(self, mqtt_client_object): - # The MQTT message topic format is as follows: // - - # Setup MQTT message listener for starting deployment - server_agent_id = self.edge_id - topic_start_deployment = "model_ops/model_device/start_deployment/{}".format(str(self.edge_id)) - self.mqtt_mgr.add_message_listener(topic_start_deployment, self.callback_start_deployment) - - # Setup MQTT message listener for activating deployment - topic_activate_deployment = "model_ops/model_device/activate_deployment/{}".format(str(self.edge_id)) - self.mqtt_mgr.add_message_listener(topic_activate_deployment, self.callback_activate_deployment) - - # Setup MQTT message listener for deactivating deployment - topic_deactivate_deployment = 
"model_ops/model_device/deactivate_deployment/{}".format(str(self.edge_id)) - self.mqtt_mgr.add_message_listener(topic_deactivate_deployment, self.callback_deactivate_deployment) - - # Setup MQTT message listener for delete deployment - topic_delete_deployment = "model_ops/model_device/delete_deployment/{}".format(str(self.edge_id)) - self.mqtt_mgr.add_message_listener(topic_delete_deployment, self.callback_delete_deployment) - - # Setup MQTT message listener for server status switching - topic_server_status = "fl_server/flserver_agent_" + str(server_agent_id) + "/status" - self.mqtt_mgr.add_message_listener(topic_server_status, self.callback_runner_id_status) - - # Setup MQTT message listener to report current device status. - topic_report_status = "mlops/report_device_status" - self.mqtt_mgr.add_message_listener(topic_report_status, self.callback_report_current_status) - - # Setup MQTT message listener to OTA messages from the MLOps. - topic_ota_msg = "mlops/flserver_agent_" + str(server_agent_id) + "/ota" - self.mqtt_mgr.add_message_listener(topic_ota_msg, self.callback_server_ota_msg) - - # Subscribe topics for starting train, stopping train and fetching client status. - mqtt_client_object.subscribe(topic_start_deployment, qos=2) - mqtt_client_object.subscribe(topic_activate_deployment, qos=2) - mqtt_client_object.subscribe(topic_deactivate_deployment, qos=2) - mqtt_client_object.subscribe(topic_delete_deployment, qos=2) - mqtt_client_object.subscribe(topic_server_status, qos=2) - mqtt_client_object.subscribe(topic_report_status, qos=2) - mqtt_client_object.subscribe(topic_ota_msg, qos=2) - - self.subscribed_topics.clear() - self.subscribed_topics.append(topic_start_deployment) - self.subscribed_topics.append(topic_activate_deployment) - self.subscribed_topics.append(topic_deactivate_deployment) - self.subscribed_topics.append(topic_delete_deployment) - self.subscribed_topics.append(topic_server_status) - self.subscribed_topics.append(topic_report_status) - self.subscribed_topics.append(topic_ota_msg) - - self.endpoint_sync_protocol = FedMLEndpointSyncProtocol(agent_config=self.agent_config, mqtt_mgr=self.mqtt_mgr) - self.endpoint_sync_protocol.setup_listener_for_sync_device_info(self.edge_id) - - # Broadcast the first active message. 
- self.send_agent_active_msg() - - # Echo results - # print("\n\nCongratulations, your device is connected to the FedML MLOps platform successfully!") - # print( - # "Your FedML Edge ID is " + str(self.edge_id) + ", unique device ID is " - # + str(self.unique_device_id) - # + "\n" - # ) - - MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) - - def on_agent_mqtt_disconnected(self, mqtt_client_object): - MLOpsStatus.get_instance().set_server_agent_status( - self.edge_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE - ) - - def recover_inference_and_monitor(self): - try: - history_jobs = FedMLServerDataInterface.get_instance().get_history_jobs() - for job in history_jobs.job_list: - if job.running_json is None: - continue - - if job.deployment_result == "": - continue - - run_id, end_point_name, token, user_id, user_name, device_ids, device_objs, model_config, model_name, \ - model_id, model_storage_url, scale_min, scale_max, inference_engine, model_is_from_open, \ - inference_end_point_id, use_gpu, memory_size, model_version, inference_port = \ - self.parse_model_run_params(json.loads(job.running_json)) - - FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) - is_activated = FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ - get_end_point_activation(run_id) - if not is_activated: - continue - - self.start_device_inference_gateway(run_id, end_point_name, model_id, model_name, model_version, - inference_port=inference_port) - - self.stop_device_inference_monitor(run_id, end_point_name, model_id, model_name, model_version) - self.start_device_inference_monitor(run_id, end_point_name, model_id, model_name, model_version) - except Exception as e: - logging.info("recover inference and monitor: {}".format(traceback.format_exc())) - - def recover_start_deployment_msg_after_upgrading(self): - try: - current_job = FedMLServerDataInterface.get_instance().get_current_job() - if current_job is not None and \ - current_job.status == ServerConstants.MSG_MLOPS_SERVER_STATUS_UPGRADING: - FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) - is_activated = FedMLModelCache.get_instance(self.redis_addr, self.redis_port). 
\ - get_end_point_activation(current_job.job_id) - if not is_activated: - return - logging.info("start deployment after upgrading.") - topic_start_deployment = "model_ops/model_device/start_deployment/{}".format(str(self.edge_id)) - self.callback_start_deployment(topic_start_deployment, current_job.running_json) - except Exception as e: - logging.info("recover starting deployment message after upgrading: {}".format(traceback.format_exc())) - - def setup_agent_mqtt_connection(self, service_config): - # Setup MQTT connection - self.mqtt_mgr = MqttManager( - service_config["mqtt_config"]["BROKER_HOST"], - service_config["mqtt_config"]["BROKER_PORT"], - service_config["mqtt_config"]["MQTT_USER"], - service_config["mqtt_config"]["MQTT_PWD"], - service_config["mqtt_config"]["MQTT_KEEPALIVE"], - "FedML_ModelServerAgent_Daemon_@" + self.user_name + "@_" + self.args.current_device_id + str(uuid.uuid4()), - "flserver_agent/last_will_msg", - json.dumps({"ID": self.edge_id, "status": ServerConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE}) - ) - self.agent_config = service_config - - # Init local database - FedMLServerDataInterface.get_instance().create_job_table() - try: - FedMLModelDatabase.get_instance().set_database_base_dir(ServerConstants.get_database_dir()) - FedMLModelDatabase.get_instance().create_table() - except Exception as e: - pass - - server_api_cmd = "fedml.computing.scheduler.model_scheduler.device_server_api:api" - server_api_pids = RunProcessUtils.get_pid_from_cmd_line(server_api_cmd) - if server_api_pids is None or len(server_api_pids) <= 0: - # Start local API services - cur_dir = os.path.dirname(__file__) - fedml_base_dir = os.path.dirname(os.path.dirname(os.path.dirname(cur_dir))) - python_program = get_python_program() - self.local_api_process = ServerConstants.exec_console_with_script( - "{} -m uvicorn {} --host 0.0.0.0 --port {} --reload --reload-delay 3 --reload-dir {} " - "--log-level critical".format( - python_program, server_api_cmd, ServerConstants.LOCAL_SERVER_API_PORT, - fedml_base_dir - ), - should_capture_stdout=False, - should_capture_stderr=False - ) - # if self.local_api_process is not None and self.local_api_process.pid is not None: - # print(f"Model master local API process id {self.local_api_process.pid}") - - self.recover_inference_and_monitor() - - # MLOpsRuntimeLogDaemon.get_instance(self.args).stop_all_log_processor() - - # Setup MQTT connected listener - self.mqtt_mgr.add_connected_listener(self.on_agent_mqtt_connected) - self.mqtt_mgr.add_disconnected_listener(self.on_agent_mqtt_disconnected) - self.mqtt_mgr.connect() - - self.setup_client_mqtt_mgr() - self.mlops_metrics.report_server_training_status( - self.run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE, - is_from_model=True, edge_id=self.edge_id) - MLOpsStatus.get_instance().set_server_agent_status( - self.edge_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE - ) - - self.recover_start_deployment_msg_after_upgrading() - - def stop_agent(self): - if self.run_process_event is not None: - self.run_process_event.set() - - if self.mqtt_mgr is not None: - try: - for topic in self.subscribed_topics: - self.mqtt_mgr.unsubscribe_msg(topic) - except Exception as e: - pass - - self.mqtt_mgr.loop_stop() - self.mqtt_mgr.disconnect() - - self.release_client_mqtt_mgr() - - def start_agent_mqtt_loop(self, should_exit_sys=True): - # Start MQTT message loop - try: - self.mqtt_mgr.loop_forever() - except Exception as e: - if str(e) == "Restarting after upgraded...": - logging.info("Restarting after upgraded...") - else: - 
print("Server tracing: {}".format(traceback.format_exc())) - finally: - self.stop_agent() - if should_exit_sys: - pass - """ - # Deprecated, will kill the process by the parent process. - time.sleep(5) - sys_utils.cleanup_all_fedml_server_login_processes( - ServerConstants.SERVER_LOGIN_PROGRAM, clean_process_group=False) - sys.exit(1) - """ - diff --git a/python/fedml/computing/scheduler/model_scheduler/job_runner_msg_sender.py b/python/fedml/computing/scheduler/model_scheduler/job_runner_msg_sender.py new file mode 100755 index 0000000000..235c4deb74 --- /dev/null +++ b/python/fedml/computing/scheduler/model_scheduler/job_runner_msg_sender.py @@ -0,0 +1,137 @@ + +import json +import logging +import os +import time +from .device_model_cache import FedMLModelCache +from .device_server_constants import ServerConstants +from ..scheduler_core.general_constants import GeneralConstants + + +class FedMLDeployJobRunnerMsgSender(object): + def __init__(self): + self.infer_host = "127.0.0.1" + self.redis_addr = "local" + self.redis_port = "6379" + self.redis_password = "fedml_default" + self.message_center = None + self.request_json = None + self.edge_id = None + + def send_deployment_results_with_payload(self, end_point_id, end_point_name, payload, replica_id_list=None): + self.send_deployment_results(end_point_id, end_point_name, + payload["model_name"], payload["model_url"], + payload["model_version"], payload["port"], + payload["inference_engine"], + payload["model_metadata"], + payload["model_config"], + payload["input_json"], + payload["output_json"], + replica_id_list=replica_id_list) + + def send_deployment_results(self, end_point_id, end_point_name, + model_name, model_inference_url, + model_version, inference_port, inference_engine, + model_metadata, model_config, input_json, output_json, replica_id_list=None): + deployment_results_topic = "model_ops/model_device/return_deployment_result" + deployment_results_payload = {"end_point_id": end_point_id, "end_point_name": end_point_name, + "model_name": model_name, "model_url": model_inference_url, + "version": model_version, "port": inference_port, + "inference_engine": inference_engine, + "model_metadata": model_metadata, + "model_config": model_config, + "input_json": input_json, + "output_json": output_json, + "timestamp": int(format(time.time_ns() / 1000.0, '.0f')), + "replica_ids": replica_id_list} + logging.info(f"[Master] deployment_results_payload is sent to mlops: {deployment_results_payload}") + + self.message_center.send_message_json(deployment_results_topic, json.dumps(deployment_results_payload)) + + @staticmethod + def send_deployment_status( + end_point_id, end_point_name, model_name, model_inference_url, model_status, message_center=None): + if message_center is None: + return + deployment_status_topic = "model_ops/model_device/return_deployment_status" + deployment_status_payload = {"end_point_id": end_point_id, "end_point_name": end_point_name, + "model_name": model_name, + "model_url": model_inference_url, + "model_status": model_status, + "timestamp": int(format(time.time_ns() / 1000.0, '.0f'))} + logging.info(f"[Master] deployment_status_payload is sent to mlops: {deployment_status_payload}") + + message_center.send_message_json(deployment_status_topic, json.dumps(deployment_status_payload)) + + @staticmethod + def send_deployment_stages(end_point_id, model_name, model_id, model_inference_url, + model_stages_index, model_stages_title, model_stage_detail, + message_center=None): + if message_center is None: + return + 
deployment_stages_topic = "model_ops/model_device/return_deployment_stages" + deployment_stages_payload = {"model_name": model_name, + "model_id": model_id, + "model_url": model_inference_url, + "end_point_id": end_point_id, + "model_stage_index": model_stages_index, + "model_stage_title": model_stages_title, + "model_stage_detail": model_stage_detail, + "timestamp": int(format(time.time_ns() / 1000.0, '.0f'))} + + message_center.send_message_json(deployment_stages_topic, json.dumps(deployment_stages_payload)) + + logging.info(f"-------- Stages has been sent to mlops with stage {model_stages_index} and " + f"payload {deployment_stages_payload}") + + def send_deployment_start_request_to_edges(self, in_request_json=None): + if in_request_json is not None: + self.request_json = in_request_json + + # Iterate through replica_num_diff, both add and replace should be sent to the edge devices + if "replica_num_diff" not in self.request_json or self.request_json["replica_num_diff"] is None: + return [] + + edge_id_list = [] + for device_id in self.request_json["replica_num_diff"].keys(): + edge_id_list.append(device_id) + + self.request_json["master_node_ip"] = GeneralConstants.get_ip_address(self.request_json) + should_added_devices = [] + for edge_id in edge_id_list: + if edge_id == self.edge_id: + continue + should_added_devices.append(edge_id) + # send start deployment request to each device + self.send_deployment_start_request_to_edge(edge_id, self.request_json) + return should_added_devices + + def send_deployment_start_request_to_edge(self, edge_id, request_json): + topic_start_deployment = "model_ops/model_device/start_deployment/{}".format(str(edge_id)) + logging.info("start_deployment: send topic " + topic_start_deployment + " to client...") + self.message_center.send_message_json(topic_start_deployment, json.dumps(request_json)) + + def send_deployment_delete_request_to_edges(self, payload, model_msg_object, message_center=None): + edge_id_list_to_delete = model_msg_object.device_ids + + # Remove the model master node id from the list using index 0 + edge_id_list_to_delete = edge_id_list_to_delete[1:] + + logging.info("Device ids to be deleted: " + str(edge_id_list_to_delete)) + + for edge_id in edge_id_list_to_delete: + if edge_id == self.edge_id: + continue + # send delete deployment request to each model device + topic_delete_deployment = "model_ops/model_device/delete_deployment/{}".format(str(edge_id)) + logging.info("delete_deployment: send topic " + topic_delete_deployment + " to client...") + if message_center is not None: + message_center.send_message_json(topic_delete_deployment, payload) + else: + self.message_center.send_message_json(topic_delete_deployment, payload) + + def send_deployment_stop_request_to_edges(self, edge_id_list, payload): + for edge_id in edge_id_list: + topic_stop_deployment = "model_ops/model_device/stop_deployment/{}".format(str(self.edge_id)) + logging.info("stop_deployment: send topic " + topic_stop_deployment) + self.message_center.send_message_json(topic_stop_deployment, payload) diff --git a/python/fedml/computing/scheduler/model_scheduler/master_agent.py b/python/fedml/computing/scheduler/model_scheduler/master_agent.py new file mode 100755 index 0000000000..2f30ae8b8e --- /dev/null +++ b/python/fedml/computing/scheduler/model_scheduler/master_agent.py @@ -0,0 +1,27 @@ + +from .device_server_constants import ServerConstants +from .device_server_data_interface import FedMLServerDataInterface +from .master_protocol_manager import 
FedMLDeployMasterProtocolManager +from ..master.base_master_agent import FedMLBaseMasterAgent + + +class FedMLDeployMasterAgent(FedMLBaseMasterAgent): + + def __init__(self): + FedMLBaseMasterAgent.__init__(self) + + # Override + def _get_log_file_dir(self): + return ServerConstants.get_log_file_dir() + + # Override + def _save_agent_info(self, unique_device_id, edge_id): + ServerConstants.save_runner_infos(unique_device_id, edge_id) + + # Override + def _init_database(self): + FedMLServerDataInterface.get_instance().create_job_table() + + # Override + def _generate_protocol_manager_instance(self, args, agent_config=None): + return FedMLDeployMasterProtocolManager(args, agent_config=agent_config) \ No newline at end of file diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py new file mode 100755 index 0000000000..00b08acfb8 --- /dev/null +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -0,0 +1,772 @@ +import copy +import json +import logging +import os +import time +import queue +import traceback +from abc import ABC +from multiprocessing import Queue + +import fedml +from fedml.core.mlops import MLOpsRuntimeLog, MLOpsConfigs +from fedml.core.mlops.mlops_runtime_log import MLOpsFormatter +from .device_client_constants import ClientConstants +from .device_model_cache import FedMLModelCache +from .device_server_constants import ServerConstants +from .device_server_data_interface import FedMLServerDataInterface +from ..comm_utils import sys_utils +from ..comm_utils.run_process_utils import RunProcessUtils +from ..comm_utils.sys_utils import get_python_program +from ..scheduler_core.general_constants import GeneralConstants +from ..master.base_master_job_runner import FedMLBaseMasterJobRunner +from .device_replica_controller import FedMLDeviceReplicaController +from .job_runner_msg_sender import FedMLDeployJobRunnerMsgSender + + +class FedMLDeployMasterJobRunner(FedMLBaseMasterJobRunner, FedMLDeployJobRunnerMsgSender, ABC): + default_redis_addr = "local" + default_redis_port = "6379" + default_redis_password = "fedml_default" + + def __init__(self, args, run_id=0, request_json=None, agent_config=None, edge_id=0, + cuda_visible_gpu_ids_str=None): + FedMLDeployJobRunnerMsgSender.__init__(self) + FedMLBaseMasterJobRunner.__init__( + self, args, edge_id=edge_id, request_json=request_json, agent_config=agent_config, run_id=run_id, + cuda_visible_gpu_ids_str=cuda_visible_gpu_ids_str, agent_data_dir=ServerConstants.get_data_dir(), + agent_package_download_dir=ServerConstants.get_package_download_dir(), + agent_package_unzip_dir=GeneralConstants.get_package_unzip_dir(ServerConstants.get_package_download_dir()), + agent_log_file_dir=ServerConstants.get_log_file_dir() + ) + + self.is_deployment_runner = True + self.infer_host = "127.0.0.1" + self.redis_addr = "local" + self.redis_port = "6379" + self.redis_password = "fedml_default" + self.inference_gateway_process = None + self.monitor_process = None + self.replica_controller = None + self.deployed_replica_payload = None + self.slave_deployment_results_map = dict() + self.deployment_result_queue = Queue() + self.is_fresh_endpoint = True + + # Override + def _generate_job_runner_instance(self, args, run_id=None, request_json=None, agent_config=None, edge_id=None, ): + return FedMLDeployMasterJobRunner( + args, run_id=run_id, request_json=request_json, agent_config=self.agent_config, edge_id=edge_id + ) + + # Override + def 
_generate_extend_queue_list(self): + return [self.deployment_result_queue] + + # Override + def run_impl( + self, edge_id_status_queue, edge_device_info_queue, run_metrics_queue, + run_event_queue, run_artifacts_queue, run_logs_queue, edge_device_info_global_queue, + run_extend_queue_list=None, sender_message_queue=None, listener_message_queue=None, + status_center_queue=None + ): + # Parse the model parameters. + run_id, end_point_name, token, user_id, user_name, device_ids, device_objs, model_config, model_name, \ + model_id, model_storage_url, scale_min, scale_max, inference_engine, model_is_from_open, \ + inference_end_point_id, use_gpu, memory_size, model_version, inference_port = \ + FedMLDeployMasterJobRunner.parse_model_run_params(self.request_json) + self.run_id = run_id + self.is_fresh_endpoint = self.request_json.get("is_fresh_endpoint", True) + + # Print request parameters. + logging.info("model deployment request: {}".format(self.request_json)) + logging.info("send deployment stages...") + + # Generate the replica controller object + self.replica_controller = FedMLDeviceReplicaController(self.edge_id, self.request_json) + + # Start the process to report system performance(cpu,memory,etc.) to MLOps + # TODO(Raphael): This measurement is for the host machine. Change to container's metrics + self.mlops_metrics.report_sys_perf(self.args, self.agent_config["mqtt_config"], run_id=run_id) + + # Check if we should stop the runner + self.check_runner_stop_event() + + # Send stage: MODEL_DEPLOYMENT_STAGE4 = "ForwardRequest2Slave" + self.send_deployment_stages( + self.run_id, model_name, model_id, "", ServerConstants.MODEL_DEPLOYMENT_STAGE4["index"], + ServerConstants.MODEL_DEPLOYMENT_STAGE4["text"], ServerConstants.MODEL_DEPLOYMENT_STAGE4["text"], + message_center=self.message_center) + + # Init the runtime logs + self.args.run_id = self.run_id + MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) + + # Report server running status + logging.info("report deployment status...") + self.check_runner_stop_event() + self.status_reporter.report_server_id_status( + run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_STARTING, + is_from_model=True, running_json=json.dumps(self.request_json), + server_agent_id=self.edge_id, server_id=self.edge_id, edge_id=self.edge_id) + self.send_deployment_status( + self.run_id, end_point_name, model_name, "", + ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYING, + message_center=self.message_center) + + # start unified inference gateway process if not started + FedMLDeployMasterJobRunner.start_device_inference_gateway() + + # start inference monitor process + FedMLDeployMasterJobRunner.stop_device_inference_monitor( + run_id, end_point_name, model_id, model_name, model_version) + FedMLDeployMasterJobRunner.start_device_inference_monitor( + run_id, end_point_name, model_id, model_name, model_version) + + # Changed the status to "IDLE" + self.status_reporter.report_server_id_status( + run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED, + is_from_model=True, server_agent_id=self.edge_id, server_id=self.edge_id, edge_id=self.edge_id) + + # Check if we should stop the runner + logging.info("send the model inference request to slave devices...") + self.check_runner_stop_event() + + # Forward deployment request to slave devices + # Handle "op:add" && "op:remove" + devices_sent_add_or_remove_msg = self.send_deployment_start_request_to_edges() + + # Handle "op:update" + try: + devices_sent_update_remove_msg = 
self.send_first_scroll_update_msg() + + if len(devices_sent_add_or_remove_msg) == 0 and len(devices_sent_update_remove_msg) == 0: + # No device is added, updated or removed + logging.info("No device is added, updated or removed. No action needed for reconciliation.") + ip = GeneralConstants.get_ip_address(self.request_json) + master_port = ServerConstants.get_inference_master_gateway_port() + if master_port is not None: + inference_port = int(master_port) + model_inference_port = inference_port + if ip.startswith("http://") or ip.startswith("https://"): + model_inference_url = "{}/api/v1/predict".format(ip) + else: + model_inference_url = "http://{}:{}/api/v1/predict".format(ip, model_inference_port) + + self.send_deployment_status( + run_id, end_point_name, model_name, model_inference_url, + ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED, + message_center=self.message_center + ) + + # Set setting to "DEPLOYED" for autoscaling service reference + FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) + FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ + update_user_setting_replica_num(end_point_id=run_id, state="DEPLOYED") + + # Complete the job runner + self.trigger_completed_event() + + return + except Exception as e: + logging.error(f"Failed to send first scroll update message due to {e}.") + logging.error(f"Exception traceback {traceback.format_exc()}.") + + logging.info("Start waiting for result callback from workers ...") + + self.deployment_result_queue = run_extend_queue_list[0] + while True: + self.check_runner_stop_event() + + try: + deployment_result = self.deployment_result_queue.get(block=False, timeout=0.2) + result_topic = deployment_result.get("topic", None) + result_payload = deployment_result.get("payload", None) + self.process_deployment_result_message(topic=result_topic, payload=result_payload) + except queue.Empty as e: # If queue is empty, then continue + pass + + time.sleep(0.5) + + def save_deployment_result(self, topic=None, payload=None): + self.deployment_result_queue.put({"topic": topic, "payload": payload}) + + def process_deployment_result_message(self, topic=None, payload=None): + # Parse the parameters + topic_splits = str(topic).split('/') + device_id = topic_splits[-1] + payload_json = json.loads(payload) + end_point_id = payload_json["end_point_id"] + end_point_name = payload_json["end_point_name"] + model_id = payload_json["model_id"] + model_name = payload_json["model_name"] + model_version = payload_json["model_version"] + model_status = payload_json["model_status"] + replica_no = payload_json.get("replica_no", None) # "no" Idx start from 1 + run_id_str = str(end_point_id) + + # HotFix(Raphael): logging service cross talk + # Change the handler since each handler need to write to different log files + try: + # Remove the existing file handler + root_logger = logging.getLogger() + for handler in root_logger.handlers: + if isinstance(handler, logging.FileHandler): + root_logger.removeHandler(handler) + + # Correct log path: ~/.fedml/fedml-model-server/fedml/logs/fedml-run-$rid-edge-$eid.log + log_file = os.path.join(ServerConstants.get_log_file_dir(), + f"fedml-run-{run_id_str}-edge-{self.edge_id}.log") + + filehandler = logging.FileHandler(log_file, "a") + + program_prefix = "FedML-Server @device-id-{}".format(self.edge_id) + formatter = MLOpsFormatter(fmt="[" + program_prefix + "] [%(asctime)s] [%(levelname)s] " + "[%(filename)s:%(lineno)d:%(funcName)s] %(" + "message)s") + + 
filehandler.setFormatter(formatter)
+ root_logger.addHandler(filehandler)
+ except Exception as e:
+ logging.warning(f"Failed to change the logging handler due to {e}.")
+
+ logging.info("========== callback_deployment_result_message ==========\n")
+
+ # A rolling update and a scale out / in operation should not happen at the same time
+ assert not ("replica_num_diff" in self.request_json and
+ len(self.request_json["replica_num_diff"]) > 0 and
+ "replica_version_diff" in self.request_json)
+
+ if "replica_version_diff" in self.request_json:
+ run_operation = "UPDATE"
+ elif "replica_num_diff" in self.request_json and \
+ len(self.request_json["replica_num_diff"]) > 0:
+ run_operation = "ADD_OR_REMOVE"
+ else:
+ logging.error(f"Unsupported operation for run id {run_id_str} and request json "
+ f"{self.request_json}")
+ return
+
+ logging.info(f"Endpoint {end_point_id}; Device {device_id}; replica {replica_no}; "
+ f"run_operation {run_operation} model status {model_status}.")
+
+ # Set redis + sqlite deployment result
+ FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password)
+
+ # Deal with different model status
+ if model_status == ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DELETED:
+ # remove
+ FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \
+ delete_deployment_result_with_device_id_and_replica_no(
+ end_point_id, end_point_name, model_name, device_id, replica_no)
+ elif model_status == ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED:
+ # add or update or update-failed-rollback
+ FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \
+ set_deployment_result(end_point_id, end_point_name,
+ model_name, model_version,
+ device_id, payload, replica_no)
+
+ # Note: to display the result in the UI, we need to save the successful deployment result to the database
+ self.save_deployed_replica_payload(payload_json)
+ else:
+ if model_status != ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED:
+ logging.error(f"Unsupported model status {model_status}.")
+
+ # Avoid an endless loop: if the rollback also failed, report the failure to MLOps
+ if self.replica_controller.under_rollback or self.is_fresh_endpoint:
+ self.send_deployment_status(
+ end_point_id, end_point_name, payload_json["model_name"], "",
+ ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED,
+ message_center=self.message_center)
+ return
+
+ # Failure handler: send the rollback message to the worker devices only if a rollback is not already in progress
+ if run_operation == "ADD_OR_REMOVE":
+ # During scale out / in,
+ # workers that have already been scaled out / in should be sent the rollback message
+ rollback_dict = self.replica_controller.rollback_add_or_remove_replica(
+ device_id=device_id, replica_no=replica_no, op_type=run_operation
+ )
+ self.replica_controller.under_rollback = True
+
+ if rollback_dict is not None and len(rollback_dict) > 0:
+ self.send_deployment_status(
+ end_point_id, end_point_name, payload_json["model_name"], "",
+ ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_ABORTING,
+ message_center=self.message_center)
+ self.send_rollback_add_remove_op(run_id_str, rollback_dict)
+ return
+ else:
+ # This is the last failed worker, so we move the endpoint to the "ABORTED" status
+ model_inference_url = self.construct_final_gateway_url(end_point_id)
+
+ self.send_deployment_status(
+ end_point_id, end_point_name, payload_json["model_name"], model_inference_url,
+ ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_ABORTED,
message_center=self.message_center) + + # For auto-scaling, should update the state to "DEPLOYED" + FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ + update_user_setting_replica_num(end_point_id=end_point_id, state="DEPLOYED") + + self.replica_controller.under_rollback = False + + return + elif run_operation == "UPDATE": + # Overwrite the json with the rollback version diff + rollback_version_diff = self.replica_controller.rollback_get_replica_version_diff( + device_id_trigger=device_id, replica_no_trigger=replica_no) + + # Change the target version to the start version + self.replica_controller.rollback_setback_target_replica_version() + + self.request_json["replica_version_diff"] = copy.deepcopy(rollback_version_diff) + + # Send the rollback message to the worker devices + self.send_rollback_msg(run_id_str) + + # Set the deployment status to ABORTING + self.send_deployment_status( + end_point_id, end_point_name, payload_json["model_name"], "", + ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_ABORTING, + message_center=self.message_center) + + # TODO(Raphael): Check if resource left not cleaned up + return + + # Move to the next state (rolling update, finish the deployment, etc.) + # Notify the replica number controller + (self.replica_controller.callback_update_curr_replica_num_state(device_id, replica_no, model_status)) + + # Notify the replica version controller, which might trigger the next rolling update + self.send_next_scroll_update_msg(run_id_str, device_id, replica_no) + + # Update the global deployment result mapping + self.slave_deployment_results_map[str(device_id)] = model_status + + logging.info("callback_deployment_result_message: topic {}, payload {}.".format(topic, payload)) + + request_json = self.request_json + if request_json is None: + logging.error(f"The endpoint {end_point_id} is no longer running.") + self.send_deployment_status( + end_point_id, end_point_name, payload_json["model_name"], "", + ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED, + message_center=self.message_center) + return + + # Wait for all replica-level's result, not device-level + if (self.replica_controller.is_all_replica_num_reconciled() and + self.replica_controller.is_all_replica_version_reconciled()): + """ + When all the devices have finished the add / delete / update operation + """ + model_inference_url = self.construct_final_gateway_url(end_point_id) + + # Send stage: MODEL_DEPLOYMENT_STAGE5 = "StartInferenceIngress" + self.send_deployment_stages(end_point_id, model_name, model_id, + model_inference_url, + ServerConstants.MODEL_DEPLOYMENT_STAGE5["index"], + ServerConstants.MODEL_DEPLOYMENT_STAGE5["text"], + "inference url: {}".format(model_inference_url), + message_center=self.message_center) + + # Send the result to MLOps + if self.deployed_replica_payload is not None: + payload_json = self.deployed_replica_payload + model_slave_url = payload_json["model_url"] + payload_json["model_url"] = model_inference_url + payload_json["port"] = ServerConstants.get_inference_master_gateway_port() + token = FedMLModelCache.get_instance(self.redis_addr, self.redis_port).get_end_point_token( + end_point_id, end_point_name, model_name) + + model_metadata = payload_json["model_metadata"] + model_inputs = model_metadata["inputs"] + + if "type" in model_metadata and model_metadata["type"] == "default": + payload_json["input_json"] = {"end_point_name": end_point_name, + "model_name": model_name, + "token": str(token), + "inputs": model_inputs, + "outputs": []} + 
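+ # Illustrative shape of the input_json assembled above (all values here are hypothetical
+ # placeholders, not taken from a real deployment):
+ #   {"end_point_name": "my_endpoint", "model_name": "my_model", "token": "<endpoint-token>",
+ #    "inputs": <model_metadata["inputs"]>, "outputs": []}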
payload_json["output_json"] = model_metadata["outputs"] + else: + raise Exception(f"Unsupported model metadata type {model_metadata['type']}") + + self.send_deployment_results_with_payload( + end_point_id, end_point_name, payload_json, + self.replica_controller.target_replica_ids) + + payload_json_saved = payload_json + payload_json_saved["model_slave_url"] = model_slave_url + FedMLServerDataInterface.get_instance().save_job_result(end_point_id, self.edge_id, + json.dumps(payload_json_saved)) + else: + # Arrive here because only contains remove ops, so we do not need to update the model metadata + pass + + # For auto-scaling, should update the state to "DEPLOYED" + FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ + update_user_setting_replica_num(end_point_id=end_point_id, state="DEPLOYED") + + if self.replica_controller.under_rollback: + # If first time failed (Still might need rollback), then send failed message to the MLOps + if not (FedMLModelCache.get_instance(self.redis_addr, self.redis_port). + get_end_point_activation(end_point_id)): + self.send_deployment_status( + end_point_id, end_point_name, payload_json["model_name"], "", + ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED, message_center=self.message_center) + else: + self.send_deployment_status( + end_point_id, end_point_name, payload_json["model_name"], model_inference_url, + ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_ABORTED, message_center=self.message_center) + + self.replica_controller.under_rollback = False + else: + # Set the end point activation status to True, for scaling out / in and rolling update + FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ + set_end_point_activation(end_point_id, end_point_name, True) + + self.send_deployment_status( + end_point_id, end_point_name, payload_json["model_name"], model_inference_url, + ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED, message_center=self.message_center) + + time.sleep(3) + self.trigger_completed_event() + + def cleanup_runner_process(self, run_id): + ServerConstants.cleanup_run_process(run_id, not_kill_subprocess=True) + + @staticmethod + def start_device_inference_gateway(): + # start unified inference server + python_program = get_python_program() + inference_port = ServerConstants.get_inference_master_gateway_port() + if not ServerConstants.is_running_on_k8s(): + logging.info(f"start the model inference gateway...") + inference_gw_cmd = "fedml.computing.scheduler.model_scheduler.device_model_inference:api" + inference_gateway_pids = RunProcessUtils.get_pid_from_cmd_line(inference_gw_cmd) + if inference_gateway_pids is None or len(inference_gateway_pids) <= 0: + cur_dir = os.path.dirname(__file__) + fedml_base_dir = os.path.dirname(os.path.dirname(os.path.dirname(cur_dir))) + inference_gateway_process = ServerConstants.exec_console_with_script(f"{python_program} " + f"-m uvicorn {inference_gw_cmd} " + f"--host 0.0.0.0 " + f"--port {str(inference_port)} " + f"--reload --reload-delay 3 " + f"--reload-dir {fedml_base_dir} " + f"--log-level info", + should_capture_stdout=False, + should_capture_stderr=False) + return inference_gateway_process + else: + return inference_gateway_pids[0] + + return None + + @staticmethod + def start_device_inference_monitor( + run_id, end_point_name, model_id, model_name, model_version, check_stopped_event=True, + redis_addr="localhost", redis_port=6379, redis_password="fedml_default" + ): + # start inference monitor server + # Will report the qps related metrics to the MLOps + 
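+ # For reference, the subprocess launched below is roughly equivalent to running
+ # (illustrative values only; the real arguments are filled in from this method's parameters):
+ #   python device_model_monitor.py -v release -ep 1234 -epn my_endpoint -mi 56 \
+ #       -mn my_model -mv 1.0.0 -iu infer_url -ra localhost -rp 6379 -rpw fedml_default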
logging.info(f"start the model inference monitor, end point {run_id}, model name {model_name}...") + run_id_str = str(run_id) + pip_source_dir = os.path.dirname(__file__) + monitor_file = os.path.join(pip_source_dir, "device_model_monitor.py") + python_program = get_python_program() + running_model_name = ServerConstants.get_running_model_name(end_point_name, + model_name, model_version, run_id, model_id) + monitor_process = ServerConstants.exec_console_with_shell_script_list( + [python_program, monitor_file, "-v", fedml.get_env_version(), "-ep", run_id_str, + "-epn", str(end_point_name), "-mi", str(model_id), "-mn", model_name, + "-mv", model_version, "-iu", "infer_url", "-ra", redis_addr, + "-rp", str(redis_port), "-rpw", redis_password], + should_capture_stdout=False, should_capture_stderr=False + ) + return monitor_process + + @staticmethod + def stop_device_inference_monitor(run_id, end_point_name, model_id, model_name, model_version): + # stop inference monitor server + logging.info(f"stop the model inference monitor, end point {run_id}, model name {model_name}...") + sys_utils.cleanup_model_monitor_processes(run_id, end_point_name, + model_id, model_name, model_version) + + @staticmethod + def recover_inference_and_monitor(): + # noinspection PyBroadException + try: + agent_config = dict() + try: + agent_config["mqtt_config"], _, _, _ = MLOpsConfigs.fetch_all_configs() + except Exception as e: + pass + + history_jobs = FedMLServerDataInterface.get_instance().get_history_jobs() + for job in history_jobs.job_list: + if job.running_json is None: + continue + + if job.deployment_result == "": + continue + + run_id, end_point_name, token, user_id, user_name, device_ids, device_objs, model_config, model_name, \ + model_id, model_storage_url, scale_min, scale_max, inference_engine, model_is_from_open, \ + inference_end_point_id, use_gpu, memory_size, model_version, inference_port = \ + FedMLDeployMasterJobRunner.parse_model_run_params(json.loads(job.running_json)) + + FedMLModelCache.get_instance().set_redis_params() + is_activated = FedMLModelCache.get_instance().get_end_point_activation(run_id) + if not is_activated: + continue + + FedMLDeployMasterJobRunner.start_device_inference_gateway() + + FedMLDeployMasterJobRunner.stop_device_inference_monitor( + run_id, end_point_name, model_id, model_name, model_version) + FedMLDeployMasterJobRunner.start_device_inference_monitor( + run_id, end_point_name, model_id, model_name, model_version) + except Exception as e: + logging.info("recover inference and monitor: {}".format(traceback.format_exc())) + + def send_first_scroll_update_msg(self): + """ + Replica-level rolling update. 
+ Delete the record of the replaced device and send the deployment msg to the devices + """ + if "replica_version_diff" not in self.request_json or self.request_json["replica_version_diff"] is None: + return [] + + first_chunk_dict = self.request_json["replica_version_diff"] + + # Delete the record of the replaced device + try: + self.delete_device_replica_info_on_master( + self.request_json["end_point_id"], self.request_json["end_point_name"], + self.request_json["model_config"]["model_name"], first_chunk_dict) + except Exception as e: + logging.info(f"Exception at send_first_scroll_update_msg {traceback.format_exc()}") + + logging.info(f"Send the first scroll update msg to the device {first_chunk_dict} ") + + # Send the deployment msg to the devices, (we reuse the start_deployment msg) + for edge_id in first_chunk_dict.keys(): + if edge_id == self.edge_id: + continue + # send start deployment request to each device + self.send_deployment_start_request_to_edge(edge_id, self.request_json) + return list(first_chunk_dict.keys()) + + def send_next_scroll_update_msg(self, run_id_str, device_id, replica_no): + """ + Send the next scroll update msg to the devices if needed. + If there is no need for the next scroll update, directly return. + """ + if replica_no is None: + return + + replica_controller = self.replica_controller + + if replica_controller.total_replica_version_diff_num == 0: + return + + if replica_controller.under_rollback: + replica_controller.intermediate_replica_version[device_id][replica_no] = replica_controller.start_version + return + + logging.info(f"Curr updating window: {replica_controller.curr_replica_updating_window} " + f"Curr version diff num: {replica_controller.total_replica_version_diff_num}") + + replica_controller.callback_update_updating_window(device_id, replica_no) + + # Decide whether to send the next scroll update + next_chunk_dict = replica_controller.get_next_chunk_devices_replica() + + if next_chunk_dict: + logging.info(f"The next scroll update for end point {run_id_str} is {next_chunk_dict}") + # Update curr updating window + replica_controller.curr_replica_updating_window = copy.deepcopy(next_chunk_dict) + + # Use global deployment result mapping to decide whether to send the next scroll update + self.request_json["replica_version_diff"] = next_chunk_dict + + # Avoid using the old request_json + try: + self.delete_device_replica_info_on_master( + self.request_json["end_point_id"], + self.request_json["end_point_name"], + self.request_json["model_config"]["model_name"], + next_chunk_dict) + except Exception as e: + logging.info(f"Exception at send_next_scroll_update_msg {traceback.format_exc()}") + + # Send the deployment msg to the devices, (we reuse the start_deployment msg) + for edge_id in next_chunk_dict.keys(): + if edge_id == self.edge_id: + continue + # send start deployment request to each device + self.send_deployment_start_request_to_edge(edge_id, self.request_json) + return + + def send_rollback_msg(self, run_id_str): + # Avoid using the old request_json + try: + self.delete_device_replica_info_on_master( + self.request_json["end_point_id"], + self.request_json["end_point_name"], + self.request_json["model_config"]["model_name"], + self.request_json["replica_version_diff"]) + except Exception as e: + logging.info(f"Exception at send_rollback_msg {traceback.format_exc()}") + + # Send the deployment msg to the devices, (we reuse the start_deployment msg) + for edge_id in self.request_json["replica_version_diff"].keys(): + if edge_id == 
self.edge_id: + continue + # send start deployment request to each device + self.send_deployment_start_request_to_edge(edge_id, self.request_json) + + def send_rollback_add_remove_op(self, run_id, rollback_replica_dict): + """ + This method is used when the original add op failed, we need to rollback by delete the existed replicas + Input example: + rollback_replica_dict = {'96684': {'curr_num': 2, 'op': 'remove', 'target_num': 1}} + """ + existed_request_json = self.request_json + updated_request_json = copy.deepcopy(existed_request_json) + + # Reverse the replica_num_diff + updated_request_json["replica_num_diff"] = rollback_replica_dict + + self.send_deployment_start_request_to_edges(in_request_json=updated_request_json) + + def delete_device_replica_info_on_master(self, endpoint_id, endpoint_name, model_name, edge_id_replica_no_dict): + FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) + # Remove the record of the replaced device + # [Deprecated] deployment status & device info + # Delete the result in deployment result list in Redis / SQLite + device_result_list = FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ + get_deployment_result_list(endpoint_id, endpoint_name, model_name) + + delete_device_result_list = [] + for device_result in device_result_list: + device_result_dict = json.loads(device_result) + if (str(device_result_dict["cache_device_id"]) in edge_id_replica_no_dict.keys() and + str(device_result_dict["cache_replica_no"]) in + edge_id_replica_no_dict[str(device_result_dict["cache_device_id"])]): + delete_device_result_list.append(device_result) + + for delete_item in delete_device_result_list: + FedMLModelCache.get_instance(self.redis_addr, self.redis_port).delete_deployment_result( + delete_item, endpoint_id, endpoint_name, model_name + ) + + logging.info(f"Deleted the replica record on master: {edge_id_replica_no_dict}") + + def save_deployed_replica_payload(self, payload_json): + self.deployed_replica_payload = copy.deepcopy(payload_json) + + def get_deployed_replica_payload(self): + return self.deployed_replica_payload + + def callback_update_curr_replica_num_state(self, changed_device_id, replica_no, op_type): + if self.replica_controller is not None: + self.replica_controller.callback_update_curr_replica_num_state(changed_device_id, replica_no, op_type) + + def is_all_replica_num_reconciled(self): + if self.replica_controller is not None: + return self.replica_controller.is_all_replica_num_reconciled() + + return False + + def is_all_replica_version_reconciled(self): + if self.replica_controller is not None: + return self.replica_controller.is_all_replica_version_reconciled() + + return False + + @staticmethod + def generate_request_json_with_replica_num_diff(run_id, edge_id, request_json): + # Replica Controller is per deployment! + replica_controller = FedMLDeviceReplicaController(edge_id, request_json) + logging.info(f"Start Diff Replica controller for run {run_id} on edge {edge_id}") + + # Prepare num diff + run_id_str = str(run_id) + new_request_with_num_diff = replica_controller.generate_diff_to_request_json() + request_json = new_request_with_num_diff + + return request_json + + @staticmethod + def generate_request_json_with_replica_version_diff(run_id, edge_id, request_json): + # Replica Controller is per deployment! 
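+ # The resulting request_json carries a "replica_version_diff" field (edge id -> replicas
+ # that still need a rolling update); the job runner above consumes it chunk by chunk via
+ # send_first_scroll_update_msg / send_next_scroll_update_msg.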
+ replica_controller = FedMLDeviceReplicaController(edge_id, request_json) + logging.info(f"Start Diff Replica controller for run {run_id} on edge {edge_id}") + + # Prepare version diff + new_request_with_version_diff = replica_controller.init_first_update_device_replica_mapping() + request_json = new_request_with_version_diff + + return request_json + + @staticmethod + def parse_model_run_params(running_json): + run_id = running_json["end_point_id"] + end_point_name = running_json["end_point_name"] + token = running_json["token"] + user_id = running_json["user_id"] + user_name = running_json["user_name"] + device_ids = running_json["device_ids"] + device_objs = running_json["device_objs"] + + model_config = running_json["model_config"] + model_name = model_config["model_name"] + model_id = model_config["model_id"] + model_storage_url = model_config["model_storage_url"] + scale_min = model_config.get("instance_scale_min", 0) + scale_max = model_config.get("instance_scale_max", 0) + inference_engine = model_config.get("inference_engine", 0) + model_is_from_open = model_config["is_from_open"] + inference_end_point_id = run_id + use_gpu = "gpu" # TODO: Get GPU from device infos + memory_size = "256m" # TODO: Get Memory size for each instance + model_version = model_config["model_version"] + model_config_parameters = running_json.get("parameters", {}) + + inference_port = ServerConstants.get_inference_master_gateway_port() + + return run_id, end_point_name, token, user_id, user_name, device_ids, device_objs, model_config, model_name, \ + model_id, model_storage_url, scale_min, scale_max, inference_engine, model_is_from_open, \ + inference_end_point_id, use_gpu, memory_size, model_version, inference_port + + # Override + def get_download_package_info(self, packages_config=None): + model_name = packages_config["model_name"] + model_storage_url = packages_config["model_storage_url"] + return model_name, model_storage_url + + # Override + def build_dynamic_args(self, run_id, run_config, package_conf_object, base_dir): + pass + + # Override + def build_dynamic_constrain_variables(self, run_id, run_config): + pass + + def construct_final_gateway_url(self, end_point_id): + inference_port_external = ServerConstants.get_inference_master_gateway_port() + ip = GeneralConstants.get_ip_address(self.request_json) + + identifier = "inference" + if self.deployed_replica_payload is not None: + payload_json = self.deployed_replica_payload + enable_custom_path = payload_json["model_metadata"].get( + ServerConstants.EXPOSE_SUBDOMAINS_KEY, False) + if enable_custom_path: + identifier = "custom_inference" + + model_inference_url = "http://{}:{}/{}/{}".format(ip, inference_port_external, identifier, end_point_id) + return model_inference_url + diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner_manager.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner_manager.py new file mode 100755 index 0000000000..0c674cb5f0 --- /dev/null +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner_manager.py @@ -0,0 +1,74 @@ + +import json +from fedml.core.common.singleton import Singleton +from ..master.base_master_job_runner_manager import FedMLBaseMasterJobRunnerManager +from .master_job_runner import FedMLDeployMasterJobRunner +from ..scheduler_core.general_constants import GeneralConstants + + +class FedMLDeployJobRunnerManager(FedMLBaseMasterJobRunnerManager, Singleton): + def __init__(self): + FedMLBaseMasterJobRunnerManager.__init__(self) + + @staticmethod + 
def get_instance(): + return FedMLDeployJobRunnerManager() + + # Override + def _generate_job_runner_instance( + self, args, run_id=None, request_json=None, agent_config=None, edge_id=None + ): + job_runner = FedMLDeployMasterJobRunner( + args, run_id=run_id, request_json=request_json, agent_config=agent_config, edge_id=edge_id) + job_runner.infer_host = GeneralConstants.get_ip_address(request_json) + return job_runner + + def save_deployment_result(self, topic, payload): + payload_json = json.loads(payload) + endpoint_id = payload_json["end_point_id"] + run_id_str = str(endpoint_id) + if self.job_runners.get(run_id_str, None) is not None: + self.job_runners[run_id_str].save_deployment_result(topic=topic, payload=payload) + + def send_deployment_stages( + self, end_point_id, model_name, model_id, model_inference_url, + model_stages_index, model_stages_title, model_stage_detail, message_center=None + ): + run_id_str = str(end_point_id) + if self.job_runners.get(run_id_str, None) is not None: + self.job_runners[run_id_str].send_deployment_stages( + end_point_id, model_name, model_id, model_inference_url, + model_stages_index, model_stages_title, model_stage_detail, + message_center=message_center + ) + + def send_deployment_delete_request_to_edges(self, end_point_id, payload, model_msg_object, message_center=None, + args=None): + run_id_str = str(end_point_id) + if self.job_runners.get(run_id_str, None) is not None: + self.job_runners[run_id_str].send_deployment_delete_request_to_edges( + payload, model_msg_object, message_center=message_center) + else: + # Hotfix: re-instantiate the job runner + # TODO(Alay, Raphael): Try to dig into whether re-instantiate the job runner is necessary + self.job_runners[run_id_str] = self._generate_job_runner_instance(args) + self.job_runners[run_id_str].send_deployment_delete_request_to_edges( + payload, model_msg_object, message_center=message_center) + + def stop_device_inference_monitor(self, run_id, end_point_name, model_id, model_name, model_version): + run_id_str = str(run_id) + if self.job_runners.get(run_id_str, None) is not None: + self.job_runners[run_id_str].stop_device_inference_monitor( + run_id, end_point_name, model_id, model_name, model_version) + + @staticmethod + def recover_inference_and_monitor(): + FedMLDeployMasterJobRunner.recover_inference_and_monitor() + + @staticmethod + def generate_request_json_with_replica_num_diff(run_id, edge_id, request_json): + return FedMLDeployMasterJobRunner.generate_request_json_with_replica_num_diff(run_id, edge_id, request_json) + + @staticmethod + def generate_request_json_with_replica_version_diff(run_id, edge_id, request_json): + return FedMLDeployMasterJobRunner.generate_request_json_with_replica_version_diff(run_id, edge_id, request_json) diff --git a/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py b/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py new file mode 100755 index 0000000000..9e0d51b588 --- /dev/null +++ b/python/fedml/computing/scheduler/model_scheduler/master_protocol_manager.py @@ -0,0 +1,364 @@ + +import json +import logging +from fedml.core.mlops import MLOpsConfigs, MLOpsRuntimeLog, MLOpsRuntimeLogDaemon +from .device_model_cache import FedMLModelCache +from .device_model_db import FedMLModelDatabase +from .device_model_msg_object import FedMLModelMsgObject +from .device_server_constants import ServerConstants +from .device_server_data_interface import FedMLServerDataInterface +from ..master.base_master_protocol_manager import 
FedMLBaseMasterProtocolManager
+from .master_job_runner_manager import FedMLDeployJobRunnerManager
+from ..scheduler_core.general_constants import GeneralConstants
+from ..scheduler_core.endpoint_sync_protocol import FedMLEndpointSyncProtocol
+from ..scheduler_core.compute_cache_manager import ComputeCacheManager
+
+
+class FedMLDeployMasterProtocolManager(FedMLBaseMasterProtocolManager):
+ def __init__(self, args, agent_config=None):
+ FedMLBaseMasterProtocolManager.__init__(self, args, agent_config=agent_config)
+
+ self.message_center_name = "deploy_master_agent"
+ self.is_deployment_status_center = True
+
+ self.topic_start_deployment = None
+ self.topic_activate_endpoint = None
+ self.topic_deactivate_deployment = None
+ self.topic_delete_deployment = None
+
+ self.infer_host = "127.0.0.1"
+ self.redis_addr = "local"
+ self.redis_port = "6379"
+ self.redis_password = "fedml_default"
+ self.endpoint_sync_protocol = None
+
+ # Override
+ def _generate_protocol_manager_instance(self, args, agent_config=None):
+ return FedMLDeployMasterProtocolManager(args, agent_config=agent_config)
+
+ # Override
+ def generate_topics(self):
+ super().generate_topics()
+
+ # The topic for start deployment
+ self.topic_start_deployment = "model_ops/model_device/start_deployment/{}".format(str(self.edge_id))
+
+ # The topic for activating endpoint
+ self.topic_activate_endpoint = "model_ops/model_device/activate_deployment/{}".format(str(self.edge_id))
+
+ # The topic for deactivating endpoint
+ self.topic_deactivate_deployment = "model_ops/model_device/deactivate_deployment/{}".format(str(self.edge_id))
+
+ # The topic for deleting endpoint
+ self.topic_delete_deployment = "model_ops/model_device/delete_deployment/{}".format(str(self.edge_id))
+
+ # Subscribe topics for endpoints
+ self.add_subscribe_topic(self.topic_start_deployment)
+ self.add_subscribe_topic(self.topic_activate_endpoint)
+ self.add_subscribe_topic(self.topic_deactivate_deployment)
+ self.add_subscribe_topic(self.topic_delete_deployment)
+
+ # Override
+ def add_protocol_handler(self):
+ super().add_protocol_handler()
+
+ # Add the message listeners for endpoint related topics
+ self.add_message_listener(self.topic_start_deployment, self.callback_start_deployment)
+ self.add_message_listener(self.topic_activate_endpoint, self.callback_activate_deployment)
+ self.add_message_listener(self.topic_deactivate_deployment, self.callback_deactivate_deployment)
+ self.add_message_listener(self.topic_delete_deployment, self.callback_delete_deployment)
+
+ # Override
+ def _get_job_runner_manager(self):
+ return FedMLDeployJobRunnerManager.get_instance()
+
+ # Override
+ def _init_extra_items(self):
+ # Init local database
+ FedMLServerDataInterface.get_instance().create_job_table()
+ try:
+ FedMLModelDatabase.get_instance().set_database_base_dir(ServerConstants.get_database_dir())
+ FedMLModelDatabase.get_instance().create_table()
+ except Exception as e:
+ pass
+
+ FedMLDeployJobRunnerManager.recover_inference_and_monitor()
+
+ # Override
+ def _process_connection_ready(self):
+ self.endpoint_sync_protocol = FedMLEndpointSyncProtocol(
+ agent_config=self.agent_config, mqtt_mgr=self.message_center)
+ self.endpoint_sync_protocol.setup_listener_for_sync_device_info(self.edge_id)
+
+ MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO)
+
+ # Override
+ def _process_connection_lost(self):
+ pass
+
+ # Override
+ def print_connected_info(self):
+ pass
+
+ def callback_deployment_result_message(self, topic=None, payload=None):
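+ # Results reported by the workers are not processed inline here; they are pushed onto the
+ # job runner's deployment result queue (save_deployment_result) and consumed by
+ # FedMLDeployMasterJobRunner.run_impl, which calls process_deployment_result_message.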
logging.info(f"Received deployment result") + FedMLDeployJobRunnerManager.get_instance().save_deployment_result(topic, payload) + + def callback_delete_deployment(self, topic, payload): + logging.info("[Master] callback_delete_deployment") + # Parse payload as the model message object. + model_msg_object = FedMLModelMsgObject(topic, payload) + + # Get the launch job id + ComputeCacheManager.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) + launch_job_id = ComputeCacheManager.get_instance().get_gpu_cache().get_endpoint_run_id_map(model_msg_object.run_id) + + # Delete SQLite records + FedMLServerDataInterface.get_instance().delete_job_from_db(model_msg_object.run_id) + FedMLModelDatabase.get_instance().delete_deployment_result( + model_msg_object.run_id, model_msg_object.end_point_name, model_msg_object.model_name, + model_version=model_msg_object.model_version) + FedMLModelDatabase.get_instance().delete_deployment_run_info( + end_point_id=model_msg_object.inference_end_point_id) + + # Delete Redis Records + FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) + FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ + set_end_point_activation(model_msg_object.inference_end_point_id, + model_msg_object.end_point_name, False) + FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ + delete_end_point(model_msg_object.inference_end_point_id, model_msg_object.end_point_name, + model_msg_object.model_name, model_msg_object.model_version) + + # Send delete deployment request to the edge devices + FedMLDeployJobRunnerManager.get_instance().send_deployment_delete_request_to_edges( + model_msg_object.run_id, payload, model_msg_object, message_center=self.message_center, args=self.args) + + # Stop processes on master + FedMLDeployJobRunnerManager.get_instance().stop_job_runner(model_msg_object.run_id) + FedMLDeployJobRunnerManager.get_instance().stop_device_inference_monitor( + model_msg_object.run_id, model_msg_object.end_point_name, model_msg_object.model_id, + model_msg_object.model_name, model_msg_object.model_version) + + # Report the launch job status with killed status. 
+ if launch_job_id is not None: + self.generate_status_report(model_msg_object.run_id, self.edge_id, server_agent_id=self.edge_id).\ + report_server_id_status(launch_job_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_KILLED, + server_id=self.edge_id, server_agent_id=self.edge_id) + + def callback_start_deployment(self, topic, payload): + # noinspection PyBroadException + try: + MLOpsConfigs.fetch_all_configs() + except Exception as e: + pass + + # Get deployment params + request_json = json.loads(payload) + run_id = request_json["end_point_id"] + end_point_name = request_json["end_point_name"] + token = request_json["token"] + device_objs = request_json["device_objs"] + enable_auto_scaling = request_json.get("enable_auto_scaling", False) + desired_replica_num = request_json.get("desired_replica_num", 1) + target_queries_per_replica = request_json.get("target_queries_per_replica", 10) + aggregation_window_size_seconds = request_json.get("aggregation_window_size_seconds", 60) + scale_down_delay_seconds = request_json.get("scale_down_delay_seconds", 120) + user_encrypted_api_key = request_json.get(ServerConstants.USER_ENCRYPTED_API_KEY, "") + + model_config = request_json["model_config"] + model_name = model_config["model_name"] + model_version = model_config["model_version"] + model_id = model_config["model_id"] + scale_min = model_config.get("instance_scale_min", 0) + scale_max = model_config.get("instance_scale_max", 0) + + model_config_parameters = request_json.get("parameters", {}) + timeout_s = model_config_parameters.get("request_timeout_sec", 30) + + inference_end_point_id = run_id + + logging.info("[Master] received start deployment request for end point {}.".format(run_id)) + + # Set redis config + FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) + + # Query if the endpoint exists + endpoint_device_info = FedMLModelCache.get_instance(self.redis_addr, self.redis_port).get_end_point_device_info( + request_json["end_point_id"]) + request_json["is_fresh_endpoint"] = True if endpoint_device_info is None else False + + if user_encrypted_api_key == "": + user_encrypted_api_key = (FedMLModelCache.get_instance(self.redis_addr, self.redis_port). 
+ get_user_encrypted_api_key(run_id))
+ if user_encrypted_api_key != "":  # Pass the cached key to the workers
+ request_json[ServerConstants.USER_ENCRYPTED_API_KEY] = user_encrypted_api_key
+
+ # Save the user setting (replica number) of this run to Redis; if it already exists, update it
+ FedMLModelCache.get_instance(self.redis_addr, self.redis_port).set_user_setting_replica_num(
+ end_point_id=run_id, end_point_name=end_point_name, model_name=model_name, model_version=model_version,
+ replica_num=desired_replica_num, enable_auto_scaling=enable_auto_scaling,
+ scale_min=scale_min, scale_max=scale_max, state="DEPLOYING",
+ aggregation_window_size_seconds=aggregation_window_size_seconds,
+ target_queries_per_replica=target_queries_per_replica,
+ scale_down_delay_seconds=int(scale_down_delay_seconds),
+ timeout_s=timeout_s, user_encrypted_api_key=user_encrypted_api_key
+ )
+
+ # Start log processor for current run
+ self.args.run_id = run_id
+ self.args.edge_id = self.edge_id
+ MLOpsRuntimeLog(args=self.args).init_logs()
+ MLOpsRuntimeLogDaemon.get_instance(self.args).set_log_source(
+ ServerConstants.FEDML_LOG_SOURCE_TYPE_MODEL_END_POINT)
+ MLOpsRuntimeLogDaemon.get_instance(self.args).start_log_processor(run_id, self.edge_id)
+
+ # Add additional parameters to the request_json
+ run_id = inference_end_point_id
+ self.args.run_id = run_id
+ self.run_id = run_id
+ request_json["run_id"] = run_id
+ self.request_json = request_json
+ run_id_str = str(run_id)
+ self.running_request_json[run_id_str] = request_json
+ self.request_json["master_node_ip"] = GeneralConstants.get_ip_address(request_json)
+
+ # Set the target status of the devices to redis
+ FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \
+ set_end_point_device_info(request_json["end_point_id"], end_point_name, json.dumps(device_objs))
+
+ # Setup Token
+ usr_indicated_token = self.get_usr_indicated_token(request_json)
+ if usr_indicated_token != "":
+ logging.info(f"Change Token from {token} to {usr_indicated_token}")
+ token = usr_indicated_token
+ FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \
+ set_end_point_token(run_id, end_point_name, model_name, token)
+
+ self.subscribe_deployment_messages_from_slave_devices(request_json)
+
+ ServerConstants.save_runner_infos(self.args.device_id + "." + self.args.os_name, self.edge_id, run_id=run_id)
+
+ # Num diff
+ request_json = FedMLDeployJobRunnerManager.generate_request_json_with_replica_num_diff(
+ run_id, self.edge_id, request_json
+ )
+
+ # Listen to extra worker topics, especially when a worker's replica count is reduced to zero.
+ # In this case, the Java backend currently will NOT send those worker ids to the master, but we still need to listen to them.
+ if "replica_num_diff" in request_json and len(request_json["replica_num_diff"]) > 0: + for device_id in request_json["replica_num_diff"].keys(): + # {"op": "remove", "curr_num": 1, "target_num": 0} + if request_json["replica_num_diff"][device_id]["op"] == "remove" and \ + request_json["replica_num_diff"][device_id]["target_num"] == 0: + self.subscribe_spec_device_message(run_id, device_id) + + # Version diff + request_json = FedMLDeployJobRunnerManager.generate_request_json_with_replica_version_diff( + run_id, self.edge_id, request_json + ) + self.running_request_json[run_id_str] = request_json + + # Start the job runner to deploy models + self._get_job_runner_manager().start_job_runner( + run_id, request_json, args=self.args, edge_id=self.edge_id, + sender_message_queue=self.message_center.get_sender_message_queue(), + listener_message_queue=self.get_listener_message_queue(), + status_center_queue=self.get_status_queue() + ) + process = self._get_job_runner_manager().get_runner_process(run_id) + if process is not None: + ServerConstants.save_run_process(run_id, process.pid) + + # Report stage to mlops: MODEL_DEPLOYMENT_STAGE1 = "Received" + FedMLDeployJobRunnerManager.get_instance().send_deployment_stages( + run_id, model_name, model_id, "", ServerConstants.MODEL_DEPLOYMENT_STAGE1["index"], + ServerConstants.MODEL_DEPLOYMENT_STAGE1["text"], "Received request for endpoint {}".format(run_id), + message_center=self.message_center) + + # Report stage to mlops: MODEL_DEPLOYMENT_STAGE2 = "Initializing" + FedMLDeployJobRunnerManager.get_instance().send_deployment_stages( + run_id, model_name, model_id, "", ServerConstants.MODEL_DEPLOYMENT_STAGE2["index"], + ServerConstants.MODEL_DEPLOYMENT_STAGE2["text"], ServerConstants.MODEL_DEPLOYMENT_STAGE2["text"], + message_center=self.message_center) + + # Send stage: MODEL_DEPLOYMENT_STAGE3 = "StartRunner" + FedMLDeployJobRunnerManager.get_instance().send_deployment_stages( + run_id, model_name, model_id, "", ServerConstants.MODEL_DEPLOYMENT_STAGE3["index"], + ServerConstants.MODEL_DEPLOYMENT_STAGE3["text"], ServerConstants.MODEL_DEPLOYMENT_STAGE3["text"], + message_center=self.message_center) + + def callback_activate_deployment(self, topic, payload): + logging.info("callback_activate_deployment: topic = %s, payload = %s" % (topic, payload)) + + # Parse payload as the model message object. + model_msg_object = FedMLModelMsgObject(topic, payload) + + # Get the previous deployment status. + FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) + endpoint_status = FedMLModelCache.get_instance(self.redis_addr, self.redis_port). \ + get_end_point_status(model_msg_object.inference_end_point_id) + if endpoint_status != ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED: + return + + # Set end point as activated status + FedMLModelCache.get_instance(self.redis_addr, self.redis_port).set_end_point_activation( + model_msg_object.inference_end_point_id, model_msg_object.end_point_name, True) + + def callback_deactivate_deployment(self, topic, payload): + logging.info("callback_deactivate_deployment: topic = %s, payload = %s" % (topic, payload)) + + # Parse payload as the model message object. + model_msg_object = FedMLModelMsgObject(topic, payload) + + # Get the endpoint status + FedMLModelCache.get_instance().set_redis_params(self.redis_addr, self.redis_port, self.redis_password) + endpoint_status = FedMLModelCache.get_instance(self.redis_addr, self.redis_port). 
\ + get_end_point_status(model_msg_object.inference_end_point_id) + if endpoint_status != ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED: + return + + # Set end point as deactivated status + FedMLModelCache.get_instance(self.redis_addr, self.redis_port).set_end_point_activation( + model_msg_object.inference_end_point_id, model_msg_object.model_name, False) + + @staticmethod + def get_usr_indicated_token(request_json) -> str: + usr_indicated_token = "" + if "parameters" in request_json and "authentication_token" in request_json["parameters"]: + usr_indicated_token = request_json["parameters"]["authentication_token"] + return usr_indicated_token + + def init_device_update_map(self): + # [Deprecated] Use the replica controller to manage the device update + pass + + def subscribe_deployment_messages_from_slave_devices(self, request_json): + if request_json is None: + return + run_id = request_json["run_id"] + edge_id_list = request_json["device_ids"] + logging.info("Edge ids: " + str(edge_id_list)) + for edge_id in edge_id_list: + if str(edge_id) == str(self.edge_id): + continue + # subscribe deployment result message for each model device + deployment_results_topic = "model_device/model_device/return_deployment_result/{}/{}".format( + run_id, edge_id) + self.add_message_listener(deployment_results_topic, self.callback_deployment_result_message) + self.subscribe_msg(deployment_results_topic) + + logging.info("subscribe device messages {}".format(deployment_results_topic)) + + self.setup_listeners_for_edge_status(run_id, edge_id_list, self.edge_id) + + def subscribe_spec_device_message(self, run_id, device_id): + if device_id == self.edge_id: + return + + # subscribe deployment result message for each model device + deployment_results_topic = "model_device/model_device/return_deployment_result/{}/{}".format( + run_id, device_id) + + self.add_message_listener(deployment_results_topic, self.callback_deployment_result_message) + self.subscribe_msg(deployment_results_topic) diff --git a/python/fedml/computing/scheduler/model_scheduler/model_device_client.py b/python/fedml/computing/scheduler/model_scheduler/model_device_client.py index f397c5421f..05f43afc5f 100755 --- a/python/fedml/computing/scheduler/model_scheduler/model_device_client.py +++ b/python/fedml/computing/scheduler/model_scheduler/model_device_client.py @@ -1,16 +1,12 @@ -import json + +import copy import logging import multiprocessing -import os import time import traceback from multiprocessing import Process - -import click -from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants - -from fedml.computing.scheduler.model_scheduler import device_client_runner -from fedml.computing.scheduler.model_scheduler import device_client_constants +from ..scheduler_core.account_manager import FedMLAccountManager +from .worker_agent import FedMLDeployWorkerAgent class FedMLModelDeviceClientRunner: @@ -18,8 +14,7 @@ def __init__(self, args, current_device_id, os_name, is_from_docker, service_con self.agent_process = None self.agent_runner = None self.agent_process_event = None - self.real_client_runner = None - self.args = args + self.args = copy.deepcopy(args) self.service_config = service_config self.unique_device_id = None self.current_device_id = current_device_id @@ -31,8 +26,6 @@ def __init__(self, args, current_device_id, os_name, is_from_docker, service_con self.redis_port = "6379" self.redis_password = "fedml_default" - self.agent_runner = None - def get_edge_id(self): return self.edge_id @@ -45,33 +38,34 @@ 
def start(self): self.agent_runner.redis_password = self.redis_password if self.agent_process_event is None: self.agent_process_event = multiprocessing.Event() - self.agent_process = Process(target=self.agent_runner.run_entry, args=(self.agent_process_event,)) - self.edge_id = self.bind_device(init_params=False) + self.agent_process = Process(target=self.agent_runner.run_entry, args=(self.agent_process_event, self.args,)) + self.edge_id = self.bind_device() self.agent_process.start() - def run_entry(self, process_event): + def run_entry(self, process_event, in_args): # print(f"Model worker process id {os.getpid()}") self.agent_process_event = process_event + worker_agent = FedMLDeployWorkerAgent() + while not self.agent_process_event.is_set(): try: try: - if self.real_client_runner is not None: - self.real_client_runner.stop_agent() + worker_agent.logout() except Exception as e: pass - self.bind_device() - - self.start_agent() + worker_agent.login( + in_args.account_id, api_key=in_args.api_key, device_id=in_args.device_id, + os_name=in_args.os_name, role=FedMLAccountManager.ROLE_DEPLOY_WORKER_ON_PREM + ) except Exception as e: logging.info("Restart model device client: {}".format(traceback.format_exc())) pass finally: try: - if self.real_client_runner is not None: - self.real_client_runner.stop_agent() + worker_agent.logout() except Exception as e: pass time.sleep(15) @@ -87,100 +81,18 @@ def check_runner_stop_event(self): raise Exception("Runner stopped") def stop(self): - if self.real_client_runner is not None: - self.real_client_runner.stop_agent() + FedMLDeployWorkerAgent.logout() if self.agent_process_event is not None: self.agent_process_event.set() - def get_binding_unique_device_id(self, current_device_id, os_name, is_from_docker=False): - role_str = "OnPremise" - - # Judge whether running from fedml docker hub - is_from_fedml_docker_hub = False - dock_loc_file = device_client_constants.ClientConstants.get_docker_location_file() - if os.path.exists(dock_loc_file): - is_from_fedml_docker_hub = True - - # Build unique device id - is_from_k8s = device_client_constants.ClientConstants.is_running_on_k8s() - if is_from_k8s: - unique_device_id = current_device_id + "@" + os_name + ".MDA.K8S." + role_str + ".Device" - elif is_from_docker: - unique_device_id = current_device_id + "@" + os_name + ".MDA.Docker." + role_str + ".Device" + def bind_device(self): + # Login account + login_result = FedMLAccountManager.get_instance().login( + self.args.account_id, api_key=self.args.api_key, device_id=self.args.device_id, + os_name=self.args.os_name, role=FedMLAccountManager.ROLE_DEPLOY_WORKER_ON_PREM + ) + if login_result is not None: + return login_result.edge_id else: - unique_device_id = current_device_id + "@" + os_name + ".MDA." + role_str + ".Device" - if is_from_fedml_docker_hub: - unique_device_id = current_device_id + "@" + os_name + ".MDA.DockerHub." + role_str + ".Device" - - return unique_device_id - - def init_logs_param(self, edge_id): - # Init runtime logs - self.args.log_file_dir = device_client_constants.ClientConstants.get_log_file_dir() - self.args.run_id = 0 - self.args.role = "client" - client_ids = list() - client_ids.append(edge_id) - self.args.client_id_list = json.dumps(client_ids) - setattr(self.args, "using_mlops", True) - - def bind_device(self, init_params=True): - self.unique_device_id = self.get_binding_unique_device_id(self.current_device_id, self.os_name, - self.is_from_docker) - - # Create client runner for communication with the FedML server. 
- if self.real_client_runner is None: - self.real_client_runner = device_client_runner.FedMLClientRunner(self.args) - - # Bind account id to the ModelOps platform. - register_try_count = 0 - edge_id = -1 - user_name = None - extra_url = None - while register_try_count < 5: - try: - edge_id, user_name, extra_url = self.real_client_runner.bind_account_and_device_id( - self.service_config["ml_ops_config"]["EDGE_BINDING_URL"], self.args.account_id, - self.unique_device_id, self.os_name - ) - if edge_id > 0: - self.real_client_runner.edge_id = edge_id - break - except Exception as e: - click.echo("{}\n{}".format(SchedulerConstants.ERR_MSG_BINDING_EXCEPTION_2, traceback.format_exc())) - click.echo(SchedulerConstants.ERR_MSG_BINDING_EXIT_RETRYING) - register_try_count += 1 - time.sleep(3) - continue - - if edge_id <= 0: - click.echo("") - click.echo("Oops, you failed to login the FedML ModelOps platform.") - click.echo("Please check whether your network is normal!") - return - self.edge_id = edge_id - - # Init runtime logs - if init_params: - setattr(self.args, "client_id", edge_id) - self.init_logs_param(edge_id) - self.real_client_runner.args = self.args - self.real_client_runner.user_name = user_name - - return edge_id - - def start_agent(self): - self.real_client_runner.unique_device_id = self.unique_device_id - device_client_constants.ClientConstants.save_runner_infos(self.current_device_id + "." + self.os_name, - self.edge_id, run_id=0) - - # Setup MQTT connection for communication with the FedML server. - self.real_client_runner.infer_host = self.infer_host - self.real_client_runner.redis_addr = self.redis_addr - self.real_client_runner.redis_port = self.redis_port - self.real_client_runner.redis_password = self.redis_password - self.real_client_runner.setup_agent_mqtt_connection(self.service_config) - - # Start mqtt looper - self.real_client_runner.start_agent_mqtt_loop(should_exit_sys=False) + return None diff --git a/python/fedml/computing/scheduler/model_scheduler/model_device_server.py b/python/fedml/computing/scheduler/model_scheduler/model_device_server.py index b654d2fdfd..b2ecd144b1 100755 --- a/python/fedml/computing/scheduler/model_scheduler/model_device_server.py +++ b/python/fedml/computing/scheduler/model_scheduler/model_device_server.py @@ -1,16 +1,12 @@ -import json + +import copy import logging import multiprocessing -import os import time import traceback from multiprocessing import Process - -import click -from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants - -from fedml.computing.scheduler.model_scheduler import device_server_runner -from fedml.computing.scheduler.model_scheduler import device_server_constants +from ..scheduler_core.account_manager import FedMLAccountManager +from .master_agent import FedMLDeployMasterAgent class FedMLModelDeviceServerRunner: @@ -18,8 +14,7 @@ def __init__(self, args, current_device_id, os_name, is_from_docker, service_con self.agent_process = None self.agent_runner = None self.agent_process_event = None - self.real_server_runner = None - self.args = args + self.args = copy.deepcopy(args) self.service_config = service_config self.unique_device_id = None self.current_device_id = current_device_id @@ -30,7 +25,6 @@ def __init__(self, args, current_device_id, os_name, is_from_docker, service_con self.redis_addr = "local" self.redis_port = "6379" self.redis_password = "fedml_default" - self.agent_runner = None def get_edge_id(self): return self.edge_id @@ -44,35 +38,33 @@ def start(self): 
self.agent_runner.redis_password = self.redis_password if self.agent_process_event is None: self.agent_process_event = multiprocessing.Event() - self.agent_process = Process(target=self.agent_runner.run_entry, args=(self.agent_process_event,)) - self.edge_id = self.bind_device(init_params=False) + self.agent_process = Process(target=self.agent_runner.run_entry, args=(self.agent_process_event, self.args)) + self.edge_id = self.bind_device() self.agent_process.start() - def run_entry(self, process_event): + def run_entry(self, process_event, in_args): # print(f"Model master process id {os.getpid()}") self.agent_process_event = process_event + master_agent = FedMLDeployMasterAgent() while not self.agent_process_event.is_set(): try: try: - if self.real_server_runner is not None: - self.real_server_runner.stop_agent() + master_agent.logout() except Exception as e: pass - # Get identity of the device from MLOps platform. - self.bind_device() - - # Start the agent for the device. - self.start_agent() + master_agent.login( + in_args.account_id, api_key=in_args.api_key, device_id=in_args.device_id, + os_name=in_args.os_name, role=FedMLAccountManager.ROLE_DEPLOY_MASTER_ON_PREM + ) except Exception as e: logging.info("Restart model device server: {}".format(traceback.format_exc())) pass finally: try: - if self.real_server_runner is not None: - self.real_server_runner.stop_agent() + master_agent.logout() except Exception as e: pass time.sleep(15) @@ -88,104 +80,18 @@ def check_runner_stop_event(self): raise Exception("Runner stopped") def stop(self): - if self.real_server_runner is not None: - self.real_server_runner.stop_agent() + FedMLDeployMasterAgent.logout() if self.agent_process_event is not None: self.agent_process_event.set() - def get_binding_unique_device_id(self, current_device_id, os_name, is_from_docker=False): - role_str = "OnPremise" - - # Judge whether running from fedml docker hub - is_from_fedml_docker_hub = False - dock_loc_file = device_server_constants.ServerConstants.get_docker_location_file() - if os.path.exists(dock_loc_file): - is_from_fedml_docker_hub = True - - # Build unique device id - is_from_k8s = device_server_constants.ServerConstants.is_running_on_k8s() - if is_from_k8s: - unique_device_id = current_device_id + "@" + os_name + ".MDA.K8S." + role_str + ".Master.Device" - elif is_from_docker: - unique_device_id = current_device_id + "@" + os_name + ".MDA.Docker." + role_str + ".Master.Device" + def bind_device(self): + # Login account + login_result = FedMLAccountManager.get_instance().login( + self.args.account_id, api_key=self.args.api_key, device_id=self.args.device_id, + os_name=self.args.os_name, role=FedMLAccountManager.ROLE_DEPLOY_MASTER_ON_PREM + ) + if login_result is not None: + return login_result.edge_id else: - unique_device_id = current_device_id + "@" + os_name + ".MDA." + role_str + ".Master.Device" - - if is_from_fedml_docker_hub: - unique_device_id = current_device_id + "@" + os_name + ".MDA.DockerHub." 
+ role_str + ".Master.Device" - - return unique_device_id - - def init_logs_param(self, edge_id): - self.args.log_file_dir = device_server_constants.ServerConstants.get_log_file_dir() - self.args.run_id = 0 - self.args.role = "server" - self.args.edge_id = edge_id - setattr(self.args, "using_mlops", True) - setattr(self.args, "server_agent_id", edge_id) - - def bind_device(self, init_params=True): - self.unique_device_id = self.get_binding_unique_device_id(self.current_device_id, self.os_name, - self.is_from_docker) - - # Create client runner for communication with the FedML server. - if self.real_server_runner is None: - self.real_server_runner = device_server_runner.FedMLServerRunner(self.args) - - # Bind account id to the ModelOps platform. - register_try_count = 0 - edge_id = -1 - user_name = None - extra_url = None - while register_try_count < 5: - try: - edge_id, user_name, extra_url = self.real_server_runner.bind_account_and_device_id( - self.service_config["ml_ops_config"]["EDGE_BINDING_URL"], self.args.account_id, - self.unique_device_id, self.os_name - ) - if edge_id > 0: - self.real_server_runner.edge_id = edge_id - break - except Exception as e: - click.echo("{}\n{}".format(SchedulerConstants.ERR_MSG_BINDING_EXCEPTION_2, traceback.format_exc())) - click.echo(SchedulerConstants.ERR_MSG_BINDING_EXIT_RETRYING) - register_try_count += 1 - time.sleep(3) - continue - - if edge_id <= 0: - click.echo("") - click.echo("Oops, you failed to login the FedML ModelOps platform.") - click.echo("Please check whether your network is normal!") - return - self.edge_id = edge_id - - # Init runtime logs - if init_params: - setattr(self.args, "client_id", edge_id) - self.real_server_runner.infer_host = self.infer_host - self.real_server_runner.redis_addr = self.redis_addr - self.real_server_runner.redis_port = self.redis_port - self.real_server_runner.redis_password = self.redis_password - self.init_logs_param(edge_id) - self.real_server_runner.args = self.args - self.real_server_runner.run_as_edge_server_and_agent = True - self.real_server_runner.user_name = user_name - - return edge_id - - def start_agent(self): - # Log arguments and binding results. - # logging.info("login: unique_device_id = %s" % str(unique_device_id)) - # logging.info("login: edge_id = %s" % str(edge_id)) - self.real_server_runner.unique_device_id = self.unique_device_id - device_server_constants.ServerConstants.save_runner_infos(self.current_device_id + "." + self.os_name, - self.edge_id, run_id=0) - - # Setup MQTT connection for communication with the FedML server. 
- self.real_server_runner.infer_host = self.infer_host - self.real_server_runner.setup_agent_mqtt_connection(self.service_config) - - # Start mqtt looper - self.real_server_runner.start_agent_mqtt_loop(should_exit_sys=False) + return None diff --git a/python/fedml/computing/scheduler/model_scheduler/modelops_configs.py b/python/fedml/computing/scheduler/model_scheduler/modelops_configs.py index e988c29a8a..719f3825c4 100644 --- a/python/fedml/computing/scheduler/model_scheduler/modelops_configs.py +++ b/python/fedml/computing/scheduler/model_scheduler/modelops_configs.py @@ -4,6 +4,7 @@ import certifi import requests +import cachetools.func import fedml from fedml.core.mlops.mlops_utils import MLOpsUtils @@ -32,6 +33,7 @@ def get_instance(args): return ModelOpsConfigs._config_instance @staticmethod + @cachetools.func.ttl_cache(ttl=600) def get_request_params(): url = fedml._get_backend_service() url = "{}/fedmlOpsServer/configs/fetch".format(url) diff --git a/python/fedml/computing/scheduler/model_scheduler/sample_model/README.md b/python/fedml/computing/scheduler/model_scheduler/sample_model/README.md deleted file mode 100644 index fcb51bd792..0000000000 --- a/python/fedml/computing/scheduler/model_scheduler/sample_model/README.md +++ /dev/null @@ -1,57 +0,0 @@ -## 1 Device Login: -Login as fedml cloud device: -```fedml model device login $user_id_or_api_key -c``` - -Login as on premise device: -```fedml model device login $user_id_or_api_key -p``` - - -## 2. Model Card: -Create local model repository: -```fedml model create -n $model_name``` - -Delete local model repository: -```fedml model delete -n $model_name -f $model_file_name``` - -Add file to local model repository: -```fedml model add -n $model_name -p $model_file_path``` - -Remove file from local model repository: -```fedml model remove -n $model_name -f $model_file_name``` - -List model in the local model repository: -```fedml model list -n $model_name``` - -Build local model repository as zip model package: -```fedml model package -n $model_name``` - -Push local model repository to ModelOps(open.fedml.ai): -```fedml model push -n $model_name -u $user_id_or_api_key``` - -Pull remote model(ModelOps) to local model repository: -```fedml model pull -n $model_name -u $user_id_or_api_key``` - - -## 3. Model Package: -Create local model repository: -```fedml model create -n $model_name``` - -Delete local model repository: -```fedml model delete -n $model_name -f $model_file_name``` - -Add file to local model repository: -```fedml model add -n $model_name -p $model_file_path``` - -Remove file from local model repository: -```fedml model remove -n $model_name -f $model_file_name``` - -List model in the local model repository: -```fedml model list -n $model_name``` - -Build local model repository as zip model package: -```fedml model package -n $model_name``` - -## 4. 
Model Deploy: -``` -fedml model deploy -n $model_name -dt $device_type(md.on_premise_device/md.fedml_cloud_device) -d $master_device_id -u $user_id_or_api_key -p $deployment_extra_params -``` diff --git a/python/fedml/computing/scheduler/model_scheduler/sample_model/fedml_model.bin b/python/fedml/computing/scheduler/model_scheduler/sample_model/fedml_model.bin deleted file mode 100644 index d98296eb61..0000000000 Binary files a/python/fedml/computing/scheduler/model_scheduler/sample_model/fedml_model.bin and /dev/null differ diff --git a/python/fedml/computing/scheduler/model_scheduler/sample_model/fedml_model_config.yaml b/python/fedml/computing/scheduler/model_scheduler/sample_model/fedml_model_config.yaml deleted file mode 100644 index 491ed507d9..0000000000 --- a/python/fedml/computing/scheduler/model_scheduler/sample_model/fedml_model_config.yaml +++ /dev/null @@ -1,20 +0,0 @@ -{ - "platform": "onnxruntime", - "max_batch_size": 1, - "input_size": [[1,24], [1,2]], - "input_types": ["int", "float"], - "input": [ - { - "name": "input", - "data_type": "TYPE_FP32", - "dims": [] - } - ], - "output": [ - { - "name": "output", - "data_type": "TYPE_FP32", - "dims": [] - } - ] -} \ No newline at end of file diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_agent.py b/python/fedml/computing/scheduler/model_scheduler/worker_agent.py new file mode 100755 index 0000000000..bdbe5fc143 --- /dev/null +++ b/python/fedml/computing/scheduler/model_scheduler/worker_agent.py @@ -0,0 +1,27 @@ + +from .device_client_constants import ClientConstants +from .device_client_data_interface import FedMLClientDataInterface +from .worker_protocol_manager import FedMLDeployWorkerProtocolManager +from ..slave.base_slave_agent import FedMLBaseSlaveAgent + + +class FedMLDeployWorkerAgent(FedMLBaseSlaveAgent): + + def __init__(self): + FedMLBaseSlaveAgent.__init__(self) + + # Override + def _get_log_file_dir(self): + return ClientConstants.get_log_file_dir() + + # Override + def _save_agent_info(self, unique_device_id, edge_id): + ClientConstants.save_runner_infos(unique_device_id, edge_id) + + # Override + def _init_database(self): + FedMLClientDataInterface.get_instance().create_job_table() + + # Override + def _generate_protocol_manager_instance(self, args, agent_config=None): + return FedMLDeployWorkerProtocolManager(args, agent_config=agent_config) diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py new file mode 100755 index 0000000000..c73630fb65 --- /dev/null +++ b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner.py @@ -0,0 +1,512 @@ + +import json +import logging +import os +import shutil +import time +import traceback +import urllib +from abc import ABC +import yaml +from fedml.computing.scheduler.comm_utils.job_utils import JobRunnerUtils +from fedml.computing.scheduler.comm_utils.network_util import return_this_device_connectivity_type + +from fedml.core.mlops import MLOpsRuntimeLog +from fedml.computing.scheduler.comm_utils import file_utils +from .device_client_constants import ClientConstants +from .device_model_cache import FedMLModelCache +from ..scheduler_core.general_constants import GeneralConstants +from ..slave.base_slave_job_runner import FedMLBaseSlaveJobRunner +from .device_model_deployment import start_deployment +from .device_model_db import FedMLModelDatabase +from .device_replica_handler import FedMLDeviceReplicaHandler + + +class 
FedMLDeployWorkerJobRunner(FedMLBaseSlaveJobRunner, ABC): + + def __init__(self, args, run_id=0, request_json=None, agent_config=None, edge_id=0, + cuda_visible_gpu_ids_str=None): + FedMLBaseSlaveJobRunner.__init__( + self, args, edge_id=edge_id, request_json=request_json, agent_config=agent_config, run_id=run_id, + cuda_visible_gpu_ids_str=cuda_visible_gpu_ids_str, agent_data_dir=ClientConstants.get_data_dir(), + agent_package_download_dir=ClientConstants.get_model_package_dir(), + agent_package_unzip_dir=GeneralConstants.get_package_unzip_dir(ClientConstants.get_package_download_dir()), + agent_log_file_dir=ClientConstants.get_log_file_dir() + ) + + self.is_deployment_runner = True + self.infer_host = "127.0.0.1" + self.redis_addr = "local" + self.redis_port = "6379" + self.redis_password = "fedml_default" + self.model_is_from_open = False + self.replica_handler = None + + # Override + def _generate_job_runner_instance(self, args, run_id=None, request_json=None, agent_config=None, edge_id=None): + return FedMLDeployWorkerJobRunner( + args, run_id=run_id, request_json=request_json, agent_config=self.agent_config, edge_id=edge_id + ) + + # Override + def _generate_extend_queue_list(self): + return None + + def retrieve_binary_model_file(self, package_name, package_url): + local_package_path = ClientConstants.get_model_package_dir() + if not os.path.exists(local_package_path): + os.makedirs(local_package_path, exist_ok=True) + unzip_package_path = ClientConstants.get_model_dir() + local_package_file = "{}".format(os.path.join(local_package_path, package_name)) + if os.path.exists(local_package_file): + os.remove(local_package_file) + urllib.request.urlretrieve(package_url, local_package_file, + reporthook=self.package_download_progress) + + unzip_package_path = os.path.join(unzip_package_path, package_name) + if not os.path.exists(unzip_package_path): + os.makedirs(unzip_package_path, exist_ok=True) + dst_model_file = os.path.join(unzip_package_path, package_name) + if os.path.exists(local_package_file): + shutil.copy(local_package_file, dst_model_file) + + return unzip_package_path, dst_model_file + + @staticmethod + def get_model_bin_file(unzip_package_full_path): + unzip_package_path = os.path.dirname(unzip_package_full_path) + model_bin_file = os.path.join(unzip_package_path, "fedml_model.bin") + return model_bin_file + + def update_local_fedml_config(self, run_id, model_config, model_config_parameters=None): + model_name = model_config["model_name"] + model_storage_url = model_config["model_storage_url"] + end_point_name = self.request_json["end_point_name"] + model_version = model_config["model_version"] + + # Generate the model package dir for downloading. + model_version = model_version.replace(" ", "-") # Avoid using space for folder name + model_version = model_version.replace(":", "-") # Since docker mount will conflict with ":" + local_package_path = ClientConstants.get_model_package_dir() + os.makedirs(local_package_path, exist_ok=True) + this_run_model_dir = f"{run_id}_{end_point_name}_{model_name}_{model_version}" + this_run_model_full_path = os.path.join(local_package_path, this_run_model_dir) + self.agent_package_download_dir = this_run_model_full_path + self.agent_package_unzip_dir = this_run_model_full_path + + # Retrieve model package or model binary file. 
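+        # When the model comes from FedML Open, only the raw model binary is downloaded via
+        # retrieve_binary_model_file(); otherwise the packaged workspace is downloaded and unzipped,
+        # and fedml_model.bin is looked up in the parent directory of the unzipped package.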
+ if self.model_is_from_open: + unzip_package_path, model_bin_file = self.retrieve_binary_model_file(model_name, model_storage_url) + else: + unzip_package_path = self.retrieve_and_unzip_package(model_name, model_storage_url) + model_bin_file = FedMLDeployWorkerJobRunner.get_model_bin_file(unzip_package_path) + + # Load the config to memory + fedml_local_config_file = os.path.join(unzip_package_path, "fedml_model_config.yaml") + + # Inject the config from UI to pkg yaml + package_conf_object = model_config_parameters + + # Save the config to local + with open(fedml_local_config_file, "w") as f: + yaml.dump(package_conf_object, f) + + logging.info("The package_conf_object is {}".format(package_conf_object)) + + return unzip_package_path, model_bin_file, package_conf_object + + def download_model_package(self, package_name, package_url): + # Copy config file from the client + unzip_package_path = self.retrieve_and_unzip_package( + package_name, package_url + ) + + return unzip_package_path + + # Override + def run_impl(self, run_extend_queue_list, sender_message_center, + listener_message_queue, status_center_queue): + # Get deployment params + run_id = self.request_json["end_point_id"] + end_point_name = self.request_json["end_point_name"] + device_ids = self.request_json["device_ids"] + master_ip = self.request_json["master_node_ip"] + model_config = self.request_json["model_config"] + model_name = model_config["model_name"] + model_id = model_config["model_id"] + model_version = model_config["model_version"] + model_config_parameters = self.request_json["parameters"] + inference_port = model_config_parameters.get("worker_internal_port", + ClientConstants.MODEL_INFERENCE_DEFAULT_PORT) + inference_port_external = model_config_parameters.get("worker_external_port", inference_port) + inference_engine = model_config_parameters.get("inference_engine", + ClientConstants.INFERENCE_ENGINE_TYPE_INT_DEFAULT) + inference_end_point_id = run_id + self.run_id = run_id + + MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) + + logging.info(f"[Worker] Received model deployment request from master for endpoint {run_id}.") + self.replica_handler = FedMLDeviceReplicaHandler(self.edge_id, self.request_json) + if self.replica_handler is not None: + logging.info("\n================= Worker replica Handler ======================\n" + f"Reconcile with num diff {self.replica_handler.replica_num_diff}\n" + f"and version diff {self.replica_handler.replica_version_diff}\n" + "===============================================================\n") + else: + logging.error(f"[Worker] Replica handler is None.") + return False + + self.check_runner_stop_event() + + # Report the deployment status to mlops + self.status_reporter.report_client_id_status( + self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_INITIALIZING, + is_from_model=True, running_json=json.dumps(self.request_json), run_id=run_id) + self.status_reporter.report_client_id_status( + self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_RUNNING, + is_from_model=True, run_id=run_id) + + self.check_runner_stop_event() + + # Reconcile the replica number (op: add, remove) + prev_rank, op, op_num = self.replica_handler.reconcile_num_replica() + + # Reconcile the replica version (op: update) + replica_rank_to_update = [] + if not op: + replica_rank_to_update, op = self.replica_handler.reconcile_replica_version() + + if not op: + logging.info("[Worker] No need to reconcile.") + return True + + logging.info("\n================ Worker Reconcile 
Operations ======================\n" + f" op: {op}; op num: {op_num}.\n" + "===================================================================\n") + + if op == "rollback": + # Find the version (notified by master) to rollback + logging.info("Try to use backup package to rollback...") + version_diff_dict = self.request_json["replica_version_diff"][str(self.edge_id)] + version_rollback_to = None + for replica_no, rollback_ops in version_diff_dict.items(): + version_rollback_to = rollback_ops["new_version"] # Note that new_version is the version to rollback + break + if version_rollback_to is None: + logging.error(f"No old version found for run_id: {self.run_id} " + f"edge_id: {self.edge_id}, rollback failed. No old version found in request_json.") + return False + model_version = version_rollback_to + + # Construct the parent folder name for the package + model_version_formatted = model_version.replace(" ", "-") + model_version_formatted = model_version_formatted.replace(":", "-") + models_root_dir = ClientConstants.get_model_package_dir() + parent_fd = f"{run_id}_{end_point_name}_{model_name}_{model_version_formatted}" + + # Check if the package is already downloaded + unzip_package_path = "" + if os.path.exists(os.path.join(models_root_dir, parent_fd)): + unzip_package_path = self.find_previous_downloaded_pkg(os.path.join(models_root_dir, parent_fd)) + + # Download the package if not found + if unzip_package_path == "": + logging.info("Download and unzip model to local...") + unzip_package_path, _, _ = \ + self.update_local_fedml_config(run_id, model_config, model_config_parameters) + if unzip_package_path is None: + logging.info("Failed to update local fedml config.") + self.check_runner_stop_event() + self.status_reporter.report_client_id_status( + self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED, + is_from_model=True, run_id=run_id) + return False + + if not os.path.exists(unzip_package_path): + logging.info("Failed to unzip file.") + self.check_runner_stop_event() + self.status_reporter.report_client_id_status( + self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED, + is_from_model=True, run_id=run_id) + return False + + self.check_runner_stop_event() + + running_model_name, inference_output_url, inference_model_version, model_metadata, model_config = \ + "", "", model_version, {}, {} + + # ip and connectivity + worker_ip = GeneralConstants.get_ip_address(self.request_json) + connectivity = return_this_device_connectivity_type() + + if op == "add": + for rank in range(prev_rank + 1, prev_rank + 1 + op_num): + try: + running_model_name, inference_output_url, inference_model_version, model_metadata, model_config = \ + start_deployment( + end_point_id=inference_end_point_id, end_point_name=end_point_name, model_id=model_id, + model_version=model_version, model_storage_local_path=unzip_package_path, + inference_model_name=model_name, inference_engine=inference_engine, + infer_host=worker_ip, master_ip=master_ip, edge_id=self.edge_id, + master_device_id=device_ids[0], replica_rank=rank, + gpu_per_replica=int(self.replica_handler.gpu_per_replica), request_json=self.request_json + ) + except Exception as e: + inference_output_url = "" + logging.error(f"[Worker] Exception at deployment: {traceback.format_exc()}") + + if inference_output_url == "": + logging.error("[Worker] Failed to deploy the model.") + + # Send failed result back to master + _ = self.send_deployment_results( + end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED, + 
model_id, model_name, inference_output_url, inference_model_version, inference_port, + inference_engine, model_metadata, model_config) + + self.status_reporter.run_id = self.run_id + + raise Exception("[Worker] Failed to deploy the model.") + else: + # Send failed successful result back to master + logging.info("Finished deployment, continue to send results to master...") + result_payload = self.send_deployment_results( + end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED, + model_id, model_name, inference_output_url, model_version, inference_port_external, + inference_engine, model_metadata, model_config, replica_no=rank + 1, + connectivity=connectivity + ) + + if inference_port_external != inference_port: + # Save internal port to local db + logging.info("inference_port_external {} != inference_port {}".format( + inference_port_external, inference_port)) + result_payload = self.construct_deployment_results( + end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED, + model_id, model_name, inference_output_url, model_version, inference_port, + inference_engine, model_metadata, model_config, replica_no=rank + 1, + connectivity=connectivity + ) + + FedMLModelDatabase.get_instance().set_deployment_result( + run_id, end_point_name, model_name, model_version, self.edge_id, + json.dumps(result_payload), replica_no=rank + 1) + + logging.info(f"Deploy replica {rank + 1} / {prev_rank + 1 + op_num} successfully.") + + self.status_reporter.run_id = self.run_id + self.status_reporter.report_client_id_status( + self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED, + is_from_model=True, run_id=self.run_id) + return True + elif op == "remove": + for rank_to_delete in range(prev_rank, prev_rank - op_num, -1): + self.replica_handler.remove_replica(rank_to_delete) + + FedMLModelCache.get_instance().set_redis_params() + replica_occupied_gpu_ids_str = FedMLModelCache.get_instance().get_replica_gpu_ids( + run_id, end_point_name, model_name, self.edge_id, rank_to_delete + 1) + + replica_occupied_gpu_ids = json.loads(replica_occupied_gpu_ids_str) + + JobRunnerUtils.get_instance().release_partial_job_gpu(run_id, self.edge_id, replica_occupied_gpu_ids) + + FedMLModelDatabase.get_instance().delete_deployment_result_with_device_id_and_rank( + run_id, end_point_name, model_name, self.edge_id, rank_to_delete) + + # Report the deletion msg to master + result_payload = self.send_deployment_results( + end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DELETED, + model_id, model_name, inference_output_url, model_version, inference_port_external, + inference_engine, model_metadata, model_config, replica_no=rank_to_delete + 1) + + time.sleep(1) + self.status_reporter.run_id = self.run_id + self.status_reporter.report_client_id_status( + self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED, + is_from_model=True, run_id=self.run_id) + + # TODO: If delete all replica, then delete the job and related resources + if rank_to_delete == 0: + pass + return True + elif op == "update" or op == "rollback": + # Update is combine of delete and add + for rank in replica_rank_to_update: + # Delete a replica (container) if exists + self.replica_handler.remove_replica(rank) + + FedMLModelCache.get_instance().set_redis_params() + replica_occupied_gpu_ids_str = FedMLModelCache.get_instance().get_replica_gpu_ids( + run_id, end_point_name, model_name, self.edge_id, rank + 1) + + replica_occupied_gpu_ids = 
json.loads(replica_occupied_gpu_ids_str) + logging.info(f"Release gpu ids {replica_occupied_gpu_ids} for update / rollback.") + + # TODO (Raphael) check if this will allow another job to seize the gpu during high concurrency: + try: + JobRunnerUtils.get_instance().release_partial_job_gpu( + run_id, self.edge_id, replica_occupied_gpu_ids) + except Exception as e: + if op == "rollback": + pass + else: + logging.error(f"Failed to release gpu ids {replica_occupied_gpu_ids} for update.") + return False + + # Delete the deployment result from local db + FedMLModelDatabase.get_instance().delete_deployment_result_with_device_id_and_rank( + run_id, end_point_name, model_name, self.edge_id, rank) + + logging.info(f"Delete replica with no {rank + 1} successfully.") + time.sleep(1) + + # Add a replica (container) + # TODO: Reduce the duplicated code + logging.info(f"Start to deploy the model with replica no {rank + 1} ...") + try: + running_model_name, inference_output_url, inference_model_version, model_metadata, model_config = \ + start_deployment( + end_point_id=inference_end_point_id, end_point_name=end_point_name, model_id=model_id, + model_version=model_version, model_storage_local_path=unzip_package_path, + inference_model_name=model_name, inference_engine=inference_engine, + infer_host=worker_ip, master_ip=master_ip, edge_id=self.edge_id, + master_device_id=device_ids[0], replica_rank=rank, + gpu_per_replica=int(self.replica_handler.gpu_per_replica), request_json=self.request_json + ) + except Exception as e: + inference_output_url = "" + logging.error(f"Exception at deployment: {traceback.format_exc()}") + + if inference_output_url == "": + logging.error("Failed to deploy the model...") + + # Release the gpu occupancy + FedMLModelCache.get_instance().set_redis_params() + replica_occupied_gpu_ids_str = FedMLModelCache.get_instance().get_replica_gpu_ids( + run_id, end_point_name, model_name, self.edge_id, rank + 1) + logging.info(f"Release gpu ids {replica_occupied_gpu_ids_str} for " + f"failed deployment of replica no {rank + 1}.") + + if replica_occupied_gpu_ids_str is not None: + replica_occupied_gpu_ids = json.loads(replica_occupied_gpu_ids_str) + JobRunnerUtils.get_instance().release_partial_job_gpu( + run_id, self.edge_id, replica_occupied_gpu_ids) + + self.send_deployment_results( + end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED, + model_id, model_name, inference_output_url, inference_model_version, inference_port, + inference_engine, model_metadata, model_config) + + self.status_reporter.run_id = self.run_id + self.status_reporter.report_client_id_status( + self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED, + is_from_model=True, run_id=self.run_id) + return False + else: + logging.info("Finished deployment, continue to send results to master...") + result_payload = self.send_deployment_results( + end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED, + model_id, model_name, inference_output_url, model_version, inference_port_external, + inference_engine, model_metadata, model_config, replica_no=rank + 1, + connectivity=connectivity + ) + + if inference_port_external != inference_port: # Save internal port to local db + logging.info("inference_port_external {} != inference_port {}".format( + inference_port_external, inference_port)) + result_payload = self.construct_deployment_results( + end_point_name, self.edge_id, ClientConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED, + model_id, model_name, 
inference_output_url, model_version, inference_port, + inference_engine, model_metadata, model_config, replica_no=rank + 1, + connectivity=connectivity + ) + + FedMLModelDatabase.get_instance().set_deployment_result( + run_id, end_point_name, model_name, model_version, self.edge_id, + json.dumps(result_payload), replica_no=rank + 1) + + logging.info(f"Update replica with no {rank + 1} successfully. Op num {op_num}") + time.sleep(5) + time.sleep(1) + self.status_reporter.run_id = self.run_id + self.status_reporter.report_client_id_status( + self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED, + is_from_model=True, run_id=self.run_id) + return True + + else: + # The delete op will be handled by callback_delete_deployment + logging.error(f"Unsupported op {op} with op num {op_num}") + return False + + def construct_deployment_results(self, end_point_name, device_id, model_status, + model_id, model_name, model_inference_url, + model_version, inference_port, inference_engine, + model_metadata, model_config, replica_no=1, + connectivity=ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT): + deployment_results_payload = {"end_point_id": self.run_id, "end_point_name": end_point_name, + "model_id": model_id, "model_name": model_name, + "model_url": model_inference_url, "model_version": model_version, + "port": inference_port, + "inference_engine": inference_engine, + "model_metadata": model_metadata, + "model_config": model_config, + "model_status": model_status, + "inference_port": inference_port, + "replica_no": replica_no, + "connectivity_type": connectivity, + } + return deployment_results_payload + + def send_deployment_results(self, end_point_name, device_id, model_status, + model_id, model_name, model_inference_url, + model_version, inference_port, inference_engine, + model_metadata, model_config, replica_no=1, + connectivity=ClientConstants.WORKER_CONNECTIVITY_TYPE_DEFAULT): + deployment_results_topic = "model_device/model_device/return_deployment_result/{}/{}".format( + self.run_id, device_id) + + deployment_results_payload = self.construct_deployment_results( + end_point_name, device_id, model_status, + model_id, model_name, model_inference_url, + model_version, inference_port, inference_engine, + model_metadata, model_config, replica_no=replica_no, connectivity=connectivity) + + logging.info("[client] send_deployment_results: topic {}, payload {}.".format(deployment_results_topic, + deployment_results_payload)) + self.message_center.send_message_json(deployment_results_topic, json.dumps(deployment_results_payload)) + return deployment_results_payload + + def reset_devices_status(self, edge_id, status): + self.status_reporter.run_id = self.run_id + self.status_reporter.edge_id = edge_id + self.status_reporter.report_client_id_status( + edge_id, status, is_from_model=True, run_id=self.run_id) + + # Override + def get_download_package_info(self, packages_config=None): + model_name = packages_config["model_name"] + model_storage_url = packages_config["model_storage_url"] + return model_name, model_storage_url + + # Override + def build_dynamic_args(self, run_id, run_config, package_conf_object, base_dir): + pass + + # Override + def build_dynamic_constrain_variables(self, run_id, run_config): + pass + + @staticmethod + def find_previous_downloaded_pkg(parent_dir: str) -> str: + """ + Find a folder inside parent_dir that contains the fedml_model_config.yaml file. 
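+        Returns the parent folder of the located config file, or an empty string if none is found.
+        """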
+ """ + res = file_utils.find_file_inside_folder(parent_dir, ClientConstants.MODEL_REQUIRED_MODEL_CONFIG_FILE) + if res is not None: + # return the parent folder of res + return os.path.dirname(res) + else: + return "" diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_job_runner_manager.py b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner_manager.py new file mode 100755 index 0000000000..4fe35d5a8a --- /dev/null +++ b/python/fedml/computing/scheduler/model_scheduler/worker_job_runner_manager.py @@ -0,0 +1,23 @@ + +from fedml.core.common.singleton import Singleton +from .worker_job_runner import FedMLDeployWorkerJobRunner +from ..scheduler_core.general_constants import GeneralConstants +from ..slave.base_slave_job_runner_manager import FedMLBaseSlaveJobRunnerManager + + +class FedMLDeployJobRunnerManager(FedMLBaseSlaveJobRunnerManager, Singleton): + def __init__(self): + FedMLBaseSlaveJobRunnerManager.__init__(self) + + @staticmethod + def get_instance(): + return FedMLDeployJobRunnerManager() + + # Override + def _generate_job_runner_instance( + self, args, run_id=None, request_json=None, agent_config=None, edge_id=None + ): + job_runner = FedMLDeployWorkerJobRunner( + args, run_id=run_id, request_json=request_json, agent_config=agent_config, edge_id=edge_id) + job_runner.infer_host = GeneralConstants.get_ip_address(request_json) + return job_runner diff --git a/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py b/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py new file mode 100755 index 0000000000..b1d0bebc47 --- /dev/null +++ b/python/fedml/computing/scheduler/model_scheduler/worker_protocol_manager.py @@ -0,0 +1,223 @@ + +import json +import logging +import os +import traceback + +from fedml.computing.scheduler.comm_utils.job_utils import JobRunnerUtils +from fedml.computing.scheduler.comm_utils.run_process_utils import RunProcessUtils +from fedml.computing.scheduler.comm_utils.sys_utils import get_python_program +from fedml.core.mlops import MLOpsConfigs, MLOpsRuntimeLog, MLOpsRuntimeLogDaemon +from .device_model_db import FedMLModelDatabase +from .device_model_msg_object import FedMLModelMsgObject +from .device_client_constants import ClientConstants +from .device_client_data_interface import FedMLClientDataInterface +from ..slave.base_slave_protocol_manager import FedMLBaseSlaveProtocolManager +from .worker_job_runner_manager import FedMLDeployJobRunnerManager +from .device_mqtt_inference_protocol import FedMLMqttInference +from ..scheduler_core.compute_cache_manager import ComputeCacheManager +from .device_model_cache import FedMLModelCache + + +class FedMLDeployWorkerProtocolManager(FedMLBaseSlaveProtocolManager): + def __init__(self, args, agent_config=None): + FedMLBaseSlaveProtocolManager.__init__(self, args, agent_config=agent_config) + + self.message_center_name = "deploy_slave_agent" + self.is_deployment_status_center = True + + self.topic_start_deployment = None + self.topic_delete_deployment = None + + self.infer_host = "127.0.0.1" + self.redis_addr = "local" + self.redis_port = "6379" + self.redis_password = "fedml_default" + self.endpoint_sync_protocol = None + self.local_api_process = None + self.mqtt_inference_obj = None + + # Override + def _generate_protocol_manager_instance(self, args, agent_config=None): + return FedMLDeployWorkerProtocolManager(args, agent_config=agent_config) + + # Override + def generate_topics(self): + super().generate_topics() + + # The topic for start 
deployment + self.topic_start_deployment = "model_ops/model_device/start_deployment/{}".format(str(self.edge_id)) + + # The topic for deleting endpoint + self.topic_delete_deployment = "model_ops/model_device/delete_deployment/{}".format(str(self.edge_id)) + + # Subscribe topics for endpoints + self.add_subscribe_topic(self.topic_start_deployment) + self.add_subscribe_topic(self.topic_delete_deployment) + + # Override + def add_protocol_handler(self): + super().add_protocol_handler() + + # Add the message listeners for endpoint related topics + self.add_message_listener(self.topic_start_deployment, self.callback_start_deployment) + self.add_message_listener(self.topic_delete_deployment, self.callback_delete_deployment) + + # Override + def _get_job_runner_manager(self): + return FedMLDeployJobRunnerManager.get_instance() + + # Override + def _init_extra_items(self): + # Init local database + FedMLClientDataInterface.get_instance().create_job_table() + try: + FedMLModelDatabase.get_instance().set_database_base_dir(ClientConstants.get_database_dir()) + FedMLModelDatabase.get_instance().create_table() + except Exception as e: + pass + + client_api_cmd = "fedml.computing.scheduler.model_scheduler.device_client_api:api" + client_api_pids = RunProcessUtils.get_pid_from_cmd_line(client_api_cmd) + + worker_proxy_port = ClientConstants.get_inference_worker_proxy_port() + + if client_api_pids is None or len(client_api_pids) <= 0: + # Start local API services + cur_dir = os.path.dirname(__file__) + fedml_base_dir = os.path.dirname(os.path.dirname(os.path.dirname(cur_dir))) + python_program = get_python_program() + self.local_api_process = ClientConstants.exec_console_with_script( + "{} -m uvicorn {} --host 0.0.0.0 --port {} --reload --reload-delay 3 --reload-dir {} " + "--log-level critical".format( + python_program, client_api_cmd, + worker_proxy_port, fedml_base_dir + ), + should_capture_stdout=False, + should_capture_stderr=False + ) + + # Override + def _process_connection_ready(self): + MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) + + if self.mqtt_inference_obj is None: + self.mqtt_inference_obj = FedMLMqttInference( + agent_config=self.agent_config, mqtt_mgr=self.communication_mgr) + self.mqtt_inference_obj.setup_listener_for_endpoint_inference_request(self.edge_id) + + # Override + def _process_connection_lost(self): + try: + if self.mqtt_inference_obj is not None: + self.mqtt_inference_obj.remove_listener_for_endpoint_inference_request(self.edge_id) + except Exception as e: + pass + + # Override + def print_connected_info(self): + pass + + def callback_start_deployment(self, topic, payload): + """ + topic: model_ops/model_device/start_deployment/model-agent-device-id + payload: {"model_name": "image-model", "model_storage_url":"s3-url", + "instance_scale_min":1, "instance_scale_max":3, "inference_engine":"onnx (or tensorrt)"} + """ + # Parse deployment parameters + request_json = json.loads(payload) + run_id = request_json["end_point_id"] + token = request_json["token"] + user_id = request_json["user_id"] + user_name = request_json["user_name"] + device_ids = request_json["device_ids"] + device_objs = request_json["device_objs"] + model_config = request_json["model_config"] + model_name = model_config["model_name"] + model_storage_url = model_config["model_storage_url"] + scale_min = model_config.get("instance_scale_min", 0) + scale_max = model_config.get("instance_scale_max", 0) + inference_engine = model_config.get("inference_engine", 0) + inference_end_point_id 
= run_id + + try: + MLOpsConfigs.fetch_all_configs() + except Exception as e: + pass + + # Start log processor for current run + run_id = inference_end_point_id + self.args.run_id = run_id + self.args.edge_id = self.edge_id + MLOpsRuntimeLog(args=self.args).init_logs() + MLOpsRuntimeLogDaemon.get_instance(self.args).set_log_source( + ClientConstants.FEDML_LOG_SOURCE_TYPE_MODEL_END_POINT) + MLOpsRuntimeLogDaemon.get_instance(self.args).start_log_processor(run_id, self.edge_id) + + # Start the job runner + request_json["run_id"] = run_id + run_id_str = str(run_id) + self.request_json = request_json + self.running_request_json[run_id_str] = request_json + self._get_job_runner_manager().start_job_runner( + run_id, request_json, args=self.args, edge_id=self.edge_id, + sender_message_queue=self.message_center.get_sender_message_queue(), + listener_message_queue=self.get_listener_message_queue(), + status_center_queue=self.get_status_queue() + ) + process = self._get_job_runner_manager().get_runner_process(run_id) + if process is not None: + ClientConstants.save_run_process(run_id, process.pid) + + def callback_delete_deployment(self, topic, payload): + logging.info("[Worker] callback_delete_deployment") + + # Parse payload as the model message object. + model_msg_object = FedMLModelMsgObject(topic, payload) + + # Delete all replicas on this device + try: + ClientConstants.remove_deployment( + model_msg_object.end_point_name, model_msg_object.model_name, model_msg_object.model_version, + model_msg_object.run_id, model_msg_object.model_id, edge_id=self.edge_id) + except Exception as e: + logging.info(f"Exception when removing deployment {traceback.format_exc()}") + pass + + self._get_job_runner_manager().stop_job_runner(model_msg_object.run_id) + + logging.info(f"[endpoint/device][{model_msg_object.run_id}/{self.edge_id}] " + f"Release gpu resource when the worker deployment deleted.") + JobRunnerUtils.get_instance().release_gpu_ids(model_msg_object.run_id, self.edge_id) + + if self.running_request_json.get(str(model_msg_object.run_id)) is not None: + try: + self.running_request_json.pop(str(model_msg_object.run_id)) + except Exception as e: + logging.error(f"Error when removing running_request_json: {traceback.format_exc()}") + pass + + FedMLClientDataInterface.get_instance().delete_job_from_db(model_msg_object.run_id) + FedMLModelDatabase.get_instance().delete_deployment_result_with_device_id( + model_msg_object.run_id, model_msg_object.end_point_name, model_msg_object.model_name, + self.edge_id) + + # Delete FEDML_GLOBAL_ENDPOINT_RUN_ID_MAP_TAG-${run_id} both in redis and local db + ComputeCacheManager.get_instance().gpu_cache.delete_endpoint_run_id_map(str(model_msg_object.run_id)) + + # Delete FEDML_EDGE_ID_MODEL_DEVICE_ID_MAP_TAG-${run_id} both in redis and local db + ComputeCacheManager.get_instance().gpu_cache.delete_edge_model_id_map(str(model_msg_object.run_id)) + + # Delete FEDML_GLOBAL_DEVICE_RUN_GPU_IDS_TAG-${run_id}-${device_id} both in redis and local db + ComputeCacheManager.get_instance().gpu_cache.delete_device_run_gpu_ids(str(self.edge_id), + str(model_msg_object.run_id)) + + # Delete FEDML_GLOBAL_DEVICE_RUN_NUM_GPUS_TAG-${run_id}-${device_id} both in redis and local db + ComputeCacheManager.get_instance().gpu_cache.delete_device_run_num_gpus(str(self.edge_id), + str(model_msg_object.run_id)) + + # Delete FEDML_MODEL_REPLICA_GPU_IDS_TAG-${run_id}-${end_point_name}-${model_name}-${device_id}-* + FedMLModelCache.get_instance().set_redis_params() + 
FedMLModelCache.get_instance().delete_all_replica_gpu_ids(model_msg_object.run_id, + model_msg_object.end_point_name, + model_msg_object.model_name, self.edge_id) diff --git a/python/fedml/computing/scheduler/scheduler_core/account_manager.py b/python/fedml/computing/scheduler/scheduler_core/account_manager.py new file mode 100755 index 0000000000..4b6a628b43 --- /dev/null +++ b/python/fedml/computing/scheduler/scheduler_core/account_manager.py @@ -0,0 +1,473 @@ +import logging +import os +import platform +import subprocess +import time +import traceback +import uuid + +import requests + +import fedml +from fedml.computing.scheduler.comm_utils import sys_utils, security_utils +from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants +from fedml.computing.scheduler.comm_utils.sys_utils import get_sys_runner_info +from fedml.computing.scheduler.scheduler_core.general_constants import GeneralConstants, MarketplaceType +from fedml.core.common.singleton import Singleton +from fedml.core.mlops import MLOpsConfigs + + +class FedMLAccountManager(Singleton): + LOCAL_RUNNER_INFO_DIR_NAME = 'runner_infos' + STATUS_IDLE = "IDLE" + ROLE_EDGE_SERVER = "edge_server" + ROLE_CLOUD_AGENT = "cloud_agent" + ROLE_CLOUD_SERVER = "cloud_server" + ROLE_EDGE_DEVICE = "client" + ROLE_GPU_PROVIDER = "gpu_supplier" + ROLE_DEPLOY_MASTER_ON_PREM = "md.on_premise_device.master" + ROLE_DEPLOY_WORKER_ON_PREM = "md.on_premise_device" + + DEVICE_ID_SUFFIX_EDGE_SERVER = ".Edge.Server" + DEVICE_ID_SUFFIX_CLOUD_AGENT = ".Public.Cloud" + DEVICE_ID_SUFFIX_CLOUD_SERVER = ".Public.Server" + DEVICE_ID_SUFFIX_EDGE_DEVICE = ".Edge.Device" + DEVICE_ID_SUFFIX_GPU_PROVIDER = ".Edge.GPU.Supplier" + DEVICE_ID_SUFFIX_DEPLOY = "MDA" + DEVICE_ID_SUFFIX_DEPLOY_MASTER_ON_PREM = ".OnPremise.Master.Device" + DEVICE_ID_SUFFIX_DEPLOY_WORKER_ON_PREM = ".OnPremise.Device" + + DEVICE_ID_DOCKER_TAG = ".Docker" + DEVICE_ID_DOCKER_HUB_TAG = ".DockerHub" + + def __init__(self): + if not hasattr(self, "agent_args"): + self.agent_args = None + + @staticmethod + def get_instance(): + return FedMLAccountManager() + + def login(self, user_id, api_key="", device_id=None, os_name=None, role=None, runner_cmd=None, marketplace_type=None, + price_per_hour=None, name=""): + # Build the agent args + self.build_agent_args( + user_id, api_key=api_key, device_id=device_id, os_name=os_name, role=role, runner_cmd=runner_cmd + ) + + # Fetch configs from the MLOps config server. + service_config = dict() + log_server_url = None + config_try_count = 0 + edge_id = 0 + while config_try_count < 5: + # noinspection PyBroadException + try: + mqtt_config, s3_config, mlops_config, docker_config = FedMLAccountManager.fetch_configs() + service_config["mqtt_config"] = mqtt_config + service_config["s3_config"] = s3_config + service_config["ml_ops_config"] = mlops_config + service_config["docker_config"] = docker_config + log_server_url = mlops_config.get("LOG_SERVER_URL", None) + break + except Exception as e: + print("{}\n{}".format(SchedulerConstants.ERR_MSG_BINDING_EXCEPTION_1, traceback.format_exc())) + print(SchedulerConstants.ERR_MSG_BINDING_EXIT_RETRYING) + config_try_count += 1 + time.sleep(3) + continue + + # Failed to fetch the config after retrying many times. 
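+        # config_try_count only reaches 5 when every fetch_configs() attempt above raised;
+        # in that case login() prints a hint and returns None instead of raising.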
+ if config_try_count >= 5: + print("") + print("[5] Oops, you failed to login the FedML MLOps platform.") + print("Please check whether your network is normal!") + return None + + # Bind account id to TensorOpera® Nexus AI Platform + register_try_count = 0 + edge_id = -1 + user_name = None + extra_url = None + general_edge_id = None + while register_try_count < 5: + # noinspection PyBroadException + try: + edge_id, user_name, extra_url, general_edge_id = FedMLAccountManager.bind_account_and_device_id( + url=service_config["ml_ops_config"]["EDGE_BINDING_URL"], account_id=self.agent_args.account_id, + device_id=self.agent_args.unique_device_id, os_name=self.agent_args.os_name, + api_key=api_key, role=role, marketplace_type=marketplace_type, price_per_hour=price_per_hour, + name=name + ) + if edge_id > 0: + break + except SystemExit as e: + print("Your account does not exist. Please make sure your account correct.") + os.system("fedml logout -s") + return + except Exception as e: + print("{}\n{}".format(SchedulerConstants.ERR_MSG_BINDING_EXCEPTION_2, traceback.format_exc())) + print(SchedulerConstants.ERR_MSG_BINDING_EXIT_RETRYING) + register_try_count += 1 + time.sleep(3) + continue + + # Failed to bind your account after retrying many times. + if edge_id <= 0: + print("") + print("[6] Oops, you failed to login the FedML MLOps platform.") + print("Please check whether your network is normal!") + return None + + # Fill the bound result to agent args. + self.fill_argent_args( + log_server_url=log_server_url, server_id=edge_id, + edge_id=edge_id, general_edge_id=general_edge_id, + user_name=user_name, extra_url=extra_url, + agent_config=service_config) + + return self.agent_args + + def build_agent_args(self, user_id, api_key=None, device_id=None, os_name=None, role=None, runner_cmd=None): + # Generate the suffix for device based on the role + device_id_suffix = None + is_master = False + is_deploy = False + if role == FedMLAccountManager.ROLE_EDGE_SERVER: + device_id_suffix = FedMLAccountManager.DEVICE_ID_SUFFIX_EDGE_SERVER + is_master = True + elif role == FedMLAccountManager.ROLE_CLOUD_AGENT: + device_id_suffix = FedMLAccountManager.DEVICE_ID_SUFFIX_CLOUD_AGENT + is_master = True + elif role == FedMLAccountManager.ROLE_CLOUD_SERVER: + device_id_suffix = "" + is_master = True + elif role == FedMLAccountManager.ROLE_EDGE_DEVICE: + device_id_suffix = FedMLAccountManager.DEVICE_ID_SUFFIX_EDGE_DEVICE + elif role == FedMLAccountManager.ROLE_GPU_PROVIDER: + device_id_suffix = FedMLAccountManager.DEVICE_ID_SUFFIX_GPU_PROVIDER + elif role == FedMLAccountManager.ROLE_DEPLOY_MASTER_ON_PREM: + device_id_suffix = FedMLAccountManager.DEVICE_ID_SUFFIX_DEPLOY_MASTER_ON_PREM + is_master = True + is_deploy = True + elif role == FedMLAccountManager.ROLE_DEPLOY_WORKER_ON_PREM: + device_id_suffix = FedMLAccountManager.DEVICE_ID_SUFFIX_DEPLOY_WORKER_ON_PREM + is_deploy = True + + # Build the agent args + version = fedml.get_env_version() + if self.agent_args is None: + self.agent_args = AgentArgs() + self.agent_args.role = role + self.agent_args.account_id = user_id + self.agent_args.api_key = api_key + self.agent_args.current_running_dir = GeneralConstants.get_deploy_fedml_home_dir(is_master=is_master) \ + if is_deploy else GeneralConstants.get_launch_fedml_home_dir(is_master=is_master) + sys_name = platform.system() + if sys_name == "Darwin": + sys_name = "MacOS" + self.agent_args.os_name = sys_name if os_name is None or os_name == "" else os_name + self.agent_args.version = version + 
self.agent_args.log_file_dir = GeneralConstants.get_deploy_log_file_dir(is_master=is_master) \ + if is_deploy else GeneralConstants.get_launch_log_file_dir(is_master=is_master) + is_from_docker = False + if device_id is not None and device_id != "0": + self.agent_args.current_device_id = device_id + else: + data_dir = GeneralConstants.get_deploy_data_dir(is_master=is_master) \ + if is_deploy else GeneralConstants.get_launch_data_dir(is_master=is_master) + is_gpu_provider = True if role == FedMLAccountManager.ROLE_GPU_PROVIDER else False + self.agent_args.current_device_id = FedMLAccountManager.get_device_id( + data_dir=data_dir, use_machine_id=is_gpu_provider) + self.agent_args.device_id = self.agent_args.current_device_id + self.agent_args.config_version = version + self.agent_args.cloud_region = "" + + # Check if it is running in the fedml docker hub + is_from_fedml_docker_hub = False + dock_loc_file = GeneralConstants.get_deploy_docker_location_file(is_master=is_master) \ + if is_deploy else GeneralConstants.get_launch_docker_location_file(is_master=is_master) + if os.path.exists(dock_loc_file): + is_from_fedml_docker_hub = True + + # Build unique device id + docker_tag = FedMLAccountManager.DEVICE_ID_DOCKER_TAG if is_from_docker else "" + docker_tag = FedMLAccountManager.DEVICE_ID_DOCKER_HUB_TAG if is_from_fedml_docker_hub else docker_tag + unique_device_id = f"{self.agent_args.current_device_id}@{self.agent_args.os_name}" \ + f"{docker_tag}{device_id_suffix}" + if role == FedMLAccountManager.ROLE_CLOUD_SERVER: + unique_device_id = self.agent_args.current_device_id + + # Set the unique device id + self.agent_args.is_from_docker = is_from_docker or is_from_fedml_docker_hub + self.agent_args.unique_device_id = unique_device_id + self.agent_args.runner_cmd = runner_cmd + + def fill_argent_args( + self, log_server_url=None, server_id=None, edge_id=None, + user_name=None, extra_url=None, general_edge_id=None, agent_config=None): + self.agent_args.log_server_url = log_server_url + self.agent_args.server_id = server_id + self.agent_args.edge_id = edge_id + self.agent_args.user_name = user_name + self.agent_args.extra_url = extra_url + self.agent_args.general_edge_id = general_edge_id + self.agent_args.agent_config = agent_config + + @staticmethod + def write_login_failed_file(is_client=True): + login_exit_file = os.path.join( + GeneralConstants.get_launch_log_file_dir(is_master=not is_client), "exited.log") + with open(login_exit_file, "w") as f: + f.writelines(f"{os.getpid()}.") + + @staticmethod + def get_device_id(data_dir, use_machine_id=False): + device_file_path = os.path.join(data_dir, FedMLAccountManager.LOCAL_RUNNER_INFO_DIR_NAME) + file_for_device_id = os.path.join(device_file_path, "devices.id") + if not os.path.exists(device_file_path): + os.makedirs(device_file_path, exist_ok=True) + elif os.path.exists(file_for_device_id): + with open(file_for_device_id, 'r', encoding='utf-8') as f: + device_id_from_file = f.readline() + if device_id_from_file is not None and device_id_from_file != "": + return device_id_from_file + + if platform.system() == "Darwin": + cmd_get_serial_num = "system_profiler SPHardwareDataType | grep Serial | awk '{gsub(/ /,\"\")}{print}' " \ + "|awk -F':' '{print $2}' " + device_id = os.popen(cmd_get_serial_num).read() + device_id = device_id.replace('\n', '').replace(' ', '') + if device_id is None or device_id == "": + if not use_machine_id: + device_id = hex(uuid.getnode()) + else: + device_id = FedMLAccountManager.get_gpu_machine_id() + else: + device_id = 
"0x" + device_id + else: + if "nt" in os.name: + + def get_uuid(): + guid = "" + try: + cmd = "wmic csproduct get uuid" + guid = str(subprocess.check_output(cmd)) + pos1 = guid.find("\\n") + 2 + guid = guid[pos1:-15] + except Exception as ex: + logging.error(f"Failed to get uuid with Exception {ex}. Traceback: {traceback.format_exc()}") + pass + return str(guid) + + device_id = str(get_uuid()) + logging.info(device_id) + elif "posix" in os.name: + device_id = sys_utils.get_device_id_in_docker() + if device_id is None: + if not use_machine_id: + device_id = hex(uuid.getnode()) + else: + device_id = FedMLAccountManager.get_gpu_machine_id() + else: + device_id = sys_utils.run_subprocess_open( + "hal-get-property --udi /org/freedesktop/Hal/devices/computer --key system.hardware.uuid".split() + ) + device_id = hex(device_id) + + if device_id is not None and device_id != "": + with open(file_for_device_id, 'w', encoding='utf-8') as f: + f.write(device_id) + else: + device_id = hex(uuid.uuid4()) + with open(file_for_device_id, 'w', encoding='utf-8') as f: + f.write(device_id) + + return device_id + + @staticmethod + def get_gpu_machine_id(): + gpu_list = sys_utils.get_gpu_list() + gpu_uuids = "" + if len(gpu_list) > 0: + for gpu in gpu_list: + gpu_uuids += gpu.get("uuid", "") + else: + gpu_uuids = str(uuid.uuid4()) + device_id_combination = \ + f"{FedMLAccountManager.get_machine_id()}-{hex(uuid.getnode())}-{gpu_uuids}" + device_id = security_utils.get_content_hash(device_id_combination) + return device_id + + @staticmethod + def get_machine_id(): + try: + import machineid + return machineid.id().replace('\n', '').replace('\r\n', '').strip() + except Exception as e: + logging.error(f"Failed to get machine id with Exception {e}. Traceback: {traceback.format_exc()}") + return hex(uuid.getnode()) + + @staticmethod + def bind_account_and_device_id( + url, account_id, device_id, marketplace_type, price_per_hour, os_name,api_key="", + role=ROLE_EDGE_SERVER, name=""): + ip = requests.get('https://checkip.amazonaws.com').text.strip() + fedml_ver, exec_path, os_ver, cpu_info, python_ver, torch_ver, mpi_installed, \ + cpu_usage, available_mem, total_mem, gpu_info, gpu_available_mem, gpu_total_mem, \ + gpu_count, gpu_vendor, cpu_count, gpu_device_name = get_sys_runner_info() + host_name = sys_utils.get_host_name() + json_params = { + "accountid": account_id, + "deviceid": device_id, + "type": os_name, + "state": FedMLAccountManager.STATUS_IDLE, + "status": FedMLAccountManager.STATUS_IDLE, + "processor": cpu_info, + "core_type": cpu_info, + "network": "", + "role": role, + "os_ver": os_ver, + "memory": total_mem, + "ip": ip, + "api_key": api_key, + "extra_infos": {"fedml_ver": fedml_ver, "exec_path": exec_path, "os_ver": os_ver, + "cpu_info": cpu_info, "python_ver": python_ver, "torch_ver": torch_ver, + "mpi_installed": mpi_installed, "cpu_usage": cpu_usage, + "available_mem": available_mem, "total_mem": total_mem, + "cpu_count": cpu_count, "gpu_count": 0, "host_name": host_name} + } + + if role == FedMLAccountManager.ROLE_GPU_PROVIDER: + json_params["marketplaceType"] = MarketplaceType.from_str(marketplace_type).value + json_params["providerPricePerHour"] = float(price_per_hour) + json_params["name"] = name + logging.info(f"[DEBUG] marketplaceType: {marketplace_type}, price_per_hour: {price_per_hour}, name: {name}") + + if gpu_count > 0: + if gpu_total_mem is not None: + json_params["gpu"] = gpu_info if gpu_info is not None else "" + ", Total GPU Memory: " + gpu_total_mem + else: + json_params["gpu"] = 
gpu_info if gpu_info is not None else "" + json_params["extra_infos"]["gpu_info"] = gpu_info if gpu_info is not None else "" + if gpu_available_mem is not None: + json_params["extra_infos"]["gpu_available_mem"] = gpu_available_mem + if gpu_total_mem is not None: + json_params["extra_infos"]["gpu_total_mem"] = gpu_total_mem + + json_params["extra_infos"]["gpu_count"] = gpu_count + json_params["extra_infos"]["gpu_vendor"] = gpu_vendor + json_params["extra_infos"]["gpu_device_name"] = gpu_device_name + + gpu_available_id_list = sys_utils.get_available_gpu_id_list(limit=gpu_count) + gpu_available_count = len(gpu_available_id_list) if gpu_available_id_list is not None else 0 + gpu_list = sys_utils.get_gpu_list() + json_params["extra_infos"]["gpu_available_count"] = gpu_available_count + json_params["extra_infos"]["gpu_available_id_list"] = gpu_available_id_list + json_params["extra_infos"]["gpu_list"] = gpu_list + else: + json_params["gpu"] = "None" + json_params["extra_infos"]["gpu_available_count"] = 0 + json_params["extra_infos"]["gpu_available_id_list"] = [] + json_params["extra_infos"]["gpu_list"] = [] + + _, cert_path = MLOpsConfigs.get_request_params() + if cert_path is not None: + try: + requests.session().verify = cert_path + response = requests.post( + url, json=json_params, verify=True, + headers={"content-type": "application/json", "Connection": "close"} + ) + except requests.exceptions.SSLError as err: + logging.error( + f"Failed to bind account and device id with error: {err}, traceback: {traceback.format_exc()}") + MLOpsConfigs.install_root_ca_file() + response = requests.post( + url, json=json_params, verify=True, + headers={"content-type": "application/json", "Connection": "close"} + ) + else: + response = requests.post(url, json=json_params, headers={"Connection": "close"}) + edge_id, user_name, extra_url, general_edge_id = -1, None, None, None + if response.status_code != 200: + print(f"Binding to MLOps with response.status_code = {response.status_code}, " + f"response.content: {response.content}") + pass + else: + # print("url = {}, response = {}".format(url, response)) + status_code = response.json().get("code") + if status_code == "SUCCESS": + edge_id = response.json().get("data").get("id") + user_name = response.json().get("data").get("userName", None) + extra_url = response.json().get("data").get("url", None) + general_edge_id = response.json().get("data").get("general_edge_id", None) + if edge_id is None or edge_id <= 0: + print(f"Binding to MLOps with response.status_code = {response.status_code}, " + f"response.content: {response.content}") + else: + if status_code == SchedulerConstants.BINDING_ACCOUNT_NOT_EXIST_ERROR: + raise SystemExit(SchedulerConstants.BINDING_ACCOUNT_NOT_EXIST_ERROR) + print(f"Binding to MLOps with response.status_code = {response.status_code}, " + f"response.content: {response.content}") + return -1, None, None, None + return edge_id, user_name, extra_url, general_edge_id + + @staticmethod + def fetch_configs(): + return MLOpsConfigs.fetch_all_configs() + + @staticmethod + def _role_is_slave_agent(role): + return True if role == FedMLAccountManager.ROLE_EDGE_DEVICE or \ + role == FedMLAccountManager.ROLE_GPU_PROVIDER else False + + +class AgentArgs: + def __init__(self, role=None, account_id=None, api_key=None, server_id=None, current_running_dir=None, + os_name=None, version=None, log_file_dir=None, log_server_url=None, device_id=None, + current_device_id=None, config_version=None, cloud_region=None, is_from_docker=False, + edge_id=None, 
agent_config=None, user_name=None, extra_url=None, unique_device_id=None): + self.role = role + self.account_id = account_id + self.api_key = api_key + self.current_running_dir = current_running_dir + self.server_id = server_id + self.os_name = os_name + self.version = version + self.log_file_dir = log_file_dir + self.log_server_url = log_server_url + self.device_id = device_id + self.current_device_id = current_device_id + self.config_version = config_version + self.cloud_region = cloud_region + self.is_from_docker = is_from_docker + self.edge_id = edge_id + self.client_id = edge_id + self.agent_config = agent_config + self.user_name = user_name + self.extra_url = extra_url + self.unique_device_id = unique_device_id + self.client_id_list = None + self.using_mlops = True + self.server_agent_id = None + self.general_edge_id = None + self.runner_cmd = None + + def is_cloud_server(self): + return self.role == FedMLAccountManager.ROLE_CLOUD_SERVER + + def is_cloud_agent(self): + return self.role == FedMLAccountManager.ROLE_CLOUD_AGENT + + def is_edge_server(self): + return self.role == FedMLAccountManager.ROLE_EDGE_SERVER + + def is_edge_device(self): + return self.role == FedMLAccountManager.ROLE_EDGE_DEVICE + + def is_gpu_provider(self): + return self.role == FedMLAccountManager.ROLE_GPU_PROVIDER + + def is_slave_agent(self): + return self.is_edge_device() or self.is_gpu_provider() diff --git a/python/fedml/computing/scheduler/scheduler_core/base_db.py b/python/fedml/computing/scheduler/scheduler_core/base_db.py index b827efacf7..dbb322cfae 100755 --- a/python/fedml/computing/scheduler/scheduler_core/base_db.py +++ b/python/fedml/computing/scheduler/scheduler_core/base_db.py @@ -1,5 +1,6 @@ import json import os +import platform import time from sqlalchemy import Column, String, TEXT, Integer, Float, create_engine, and_ @@ -25,7 +26,10 @@ def open_job_db(self): if self.db_connection is not None: return - self.db_engine = create_engine('sqlite:////{}'.format(self.db_path), echo=False) + if platform.system() == "Windows": + self.db_engine = create_engine('sqlite:///{}'.format(self.db_path), echo=False) + else: + self.db_engine = create_engine('sqlite:////{}'.format(self.db_path), echo=False) db_session_class = sessionmaker(bind=self.db_engine) self.db_connection = db_session_class() diff --git a/python/fedml/computing/scheduler/scheduler_core/compute_cache_manager.py b/python/fedml/computing/scheduler/scheduler_core/compute_cache_manager.py index f918c785e2..6247cebe4f 100755 --- a/python/fedml/computing/scheduler/scheduler_core/compute_cache_manager.py +++ b/python/fedml/computing/scheduler/scheduler_core/compute_cache_manager.py @@ -1,10 +1,11 @@ -import threading +import threading import redis from .compute_gpu_cache import ComputeGpuCache from .compute_logs_cache import ComputeLogsCache from .business_models import LogsUploadModel, MetricsModel from ..comm_utils.constants import SchedulerConstants +from .compute_status_cache import ComputeStatusCache class ComputeCacheManager(object): @@ -23,6 +24,7 @@ def init(self): self.redis_connection = None self.gpu_cache = ComputeGpuCache(self.redis_connection) self.logs_cache = ComputeLogsCache(self.redis_connection) + self.status_cache = ComputeStatusCache(self.redis_connection) self.local_lock = threading.Lock() def setup_redis_connection(self, redis_addr, redis_port, redis_password="fedml_default"): @@ -48,6 +50,7 @@ def setup_redis_connection(self, redis_addr, redis_port, redis_password="fedml_d self.redis_connection.set("FEDML_TEST_KEYS", 
"TEST") self.gpu_cache.redis_connection = self.redis_connection self.logs_cache.redis_connection = self.redis_connection + self.status_cache.redis_connection = self.redis_connection is_connected = True except Exception as e: is_connected = False @@ -69,6 +72,7 @@ def setup_public_redis_connection(self): self.redis_connection.set("FEDML_TEST_KEYS", "TEST") self.gpu_cache.redis_connection = self.redis_connection self.logs_cache.redis_connection = self.redis_connection + self.status_cache.redis_connection = self.redis_connection is_connected = True except Exception as e: pass @@ -134,6 +138,9 @@ def get_artifact_logs(self): def get_artifacts(self): pass + def get_status_cache(self): + return self.status_cache + diff --git a/python/fedml/computing/scheduler/scheduler_core/compute_gpu_cache.py b/python/fedml/computing/scheduler/scheduler_core/compute_gpu_cache.py index 7bab71212e..6b3addc320 100755 --- a/python/fedml/computing/scheduler/scheduler_core/compute_gpu_cache.py +++ b/python/fedml/computing/scheduler/scheduler_core/compute_gpu_cache.py @@ -10,6 +10,7 @@ class ComputeGpuCache(object): FEDML_GLOBAL_DEVICE_RUN_NUM_GPUS_TAG = "FEDML_GLOBAL_DEVICE_RUN_NUM_GPUS_TAG-" FEDML_GLOBAL_DEVICE_RUN_GPU_IDS_TAG = "FEDML_GLOBAL_DEVICE_RUN_GPU_IDS_TAG-" FEDML_GLOBAL_DEVICE_AVAILABLE_GPU_IDS_TAG = "FEDML_GLOBAL_DEVICE_AVAILABLE_GPU_IDS_TAG-" + FEDML_GLOBAL_DEVICE_INITIAL_AVAILABLE_GPU_IDS_TAG = "FEDML_GLOBAL_DEVICE_INITIAL_AVAILABLE_GPU_IDS_TAG-" FEDML_GLOBAL_DEVICE_TOTAL_NUM_GPUS_TAG = "FEDML_GLOBAL_DEVICE_TOTAL_NUM_GPUS_TAG-" FEDML_GLOBAL_RUN_TOTAL_NUM_GPUS_TAG = "FEDML_GLOBAL_RUN_TOTAL_NUM_GPUS_TAG-" FEDML_GLOBAL_RUN_DEVICE_IDS_TAG = "FEDML_GLOBAL_RUN_DEVICE_IDS_TAG-" @@ -107,6 +108,25 @@ def get_device_available_gpu_ids(self, device_id): return [] return device_available_gpu_ids + + def get_device_initial_available_gpu_ids(self, device_id): + # Get the initial available GPU ids from the cache, for checking if the device all available GPU ids is changed + device_initial_available_gpu_ids = None + try: + if self.redis_connection.exists(self.get_device_initial_available_gpu_ids_key(device_id)): + device_initial_available_gpu_ids = self.redis_connection.get(self.get_device_initial_available_gpu_ids_key(device_id)) + if str(device_initial_available_gpu_ids).strip() == "": + return [] + except Exception as e: + pass + + if device_initial_available_gpu_ids is not None and str(device_initial_available_gpu_ids).strip() != "": + device_initial_available_gpu_ids = device_initial_available_gpu_ids.split(',') + device_initial_available_gpu_ids = self.map_str_list_to_int_list(device_initial_available_gpu_ids) + else: + return [] + + return device_initial_available_gpu_ids def get_device_total_num_gpus(self, device_id): device_total_num_gpus = None @@ -241,6 +261,14 @@ def set_device_available_gpu_ids(self, device_id, gpu_ids): pass ComputeGpuDatabase.get_instance().set_device_available_gpu_ids(device_id, gpu_ids) + + def set_device_initial_available_gpu_ids(self, device_id, gpu_ids): + # Set the initial available GPU ids to the cache, use to check if the device all available GPU ids is changed + try: + str_gpu_ids = self.map_list_to_str(gpu_ids) + self.redis_connection.set(self.get_device_initial_available_gpu_ids_key(device_id), str_gpu_ids) + except Exception as e: + pass def set_device_total_num_gpus(self, device_id, num_gpus): try: @@ -311,6 +339,9 @@ def get_device_run_gpu_ids_key(device_id, run_id): def get_device_available_gpu_ids_key(self, device_id): return 
f"{ComputeGpuCache.FEDML_GLOBAL_DEVICE_AVAILABLE_GPU_IDS_TAG}{device_id}" + + def get_device_initial_available_gpu_ids_key(self, device_id): + return f"{ComputeGpuCache.FEDML_GLOBAL_DEVICE_INITIAL_AVAILABLE_GPU_IDS_TAG}{device_id}" def get_device_total_num_gpus_key(self, device_id): return f"{ComputeGpuCache.FEDML_GLOBAL_DEVICE_TOTAL_NUM_GPUS_TAG}{device_id}" diff --git a/python/fedml/computing/scheduler/scheduler_core/compute_gpu_db.py b/python/fedml/computing/scheduler/scheduler_core/compute_gpu_db.py index d50555d3c9..eb80c1424e 100755 --- a/python/fedml/computing/scheduler/scheduler_core/compute_gpu_db.py +++ b/python/fedml/computing/scheduler/scheduler_core/compute_gpu_db.py @@ -8,6 +8,7 @@ from fedml.core.common.singleton import Singleton from .base_db import FedMLBaseDb from .compute_utils import ComputeUtils +from ..master.server_constants import ServerConstants Base = declarative_base() diff --git a/python/fedml/computing/scheduler/scheduler_core/compute_status_cache.py b/python/fedml/computing/scheduler/scheduler_core/compute_status_cache.py new file mode 100755 index 0000000000..f224806b8c --- /dev/null +++ b/python/fedml/computing/scheduler/scheduler_core/compute_status_cache.py @@ -0,0 +1,78 @@ +import logging +import traceback +from .compute_status_db import ComputeStatusDatabase +from ..master.server_constants import ServerConstants + + +class ComputeStatusCache(object): + FEDML_JOB_STATUS_TAG = "FEDML_JOB_STATUS_TAG-" + FEDML_DEVICE_STATUS_IN_JOB_TAG = "FEDML_DEVICE_STATUS_IN_JOB_TAG-" + + def __init__(self, redis_connection): + self.redis_connection = redis_connection + ComputeStatusDatabase.get_instance().set_database_base_dir(ServerConstants.get_database_dir()) + ComputeStatusDatabase.get_instance().create_table() + + def save_job_status(self, run_id, status): + try: + self.redis_connection.set(self._get_job_status_key(run_id), status) + except Exception as e: + logging.error(f"Error setting job status: {e}, Traceback: {traceback.format_exc()}") + pass + + ComputeStatusDatabase.get_instance().set_job_status(run_id, status) + + def get_job_status(self, run_id): + status = None + try: + if self.redis_connection.exists(self._get_job_status_key(run_id)): + status = self.redis_connection.get(self._get_job_status_key(run_id)) + except Exception as e: + logging.error(f"Error getting job status: {e}, Traceback: {traceback.format_exc()}") + pass + + if status is None: + status = ComputeStatusDatabase.get_instance().get_job_status(run_id) + try: + if status is not None: + self.redis_connection.set(self._get_job_status_key(run_id), status) + except Exception as e: + pass + + return status + + def save_device_status_in_job(self, run_id, device_id, status): + if status is None: + return + try: + self.redis_connection.set(self._get_device_status_in_job_key(run_id, device_id), status) + except Exception as e: + logging.error(f"Error setting device status in job: {e}, Traceback: {traceback.format_exc()}") + pass + + ComputeStatusDatabase.get_instance().set_device_status_in_job(run_id, device_id, status) + + def get_device_status_in_job(self, run_id, device_id): + status = None + try: + if self.redis_connection.exists(self._get_device_status_in_job_key(run_id, device_id)): + status = self.redis_connection.get(self._get_device_status_in_job_key(run_id, device_id)) + except Exception as e: + logging.error(f"Error getting device status in job: {e}, Traceback: {traceback.format_exc()}") + pass + + if status is None: + status = 
ComputeStatusDatabase.get_instance().get_device_status_in_job(run_id, device_id) + try: + if status is not None: + self.redis_connection.set(self._get_device_status_in_job_key(run_id, device_id), status) + except Exception as e: + pass + + return status + + def _get_job_status_key(self, run_id): + return f"{ComputeStatusCache.FEDML_JOB_STATUS_TAG}{run_id}" + + def _get_device_status_in_job_key(self, run_id, device_id): + return f"{ComputeStatusCache.FEDML_DEVICE_STATUS_IN_JOB_TAG}{run_id}-{device_id}" diff --git a/python/fedml/computing/scheduler/scheduler_core/compute_status_db.py b/python/fedml/computing/scheduler/scheduler_core/compute_status_db.py new file mode 100755 index 0000000000..14219eeb6a --- /dev/null +++ b/python/fedml/computing/scheduler/scheduler_core/compute_status_db.py @@ -0,0 +1,123 @@ +import json +import os +import time + +from sqlalchemy import Column, String, TEXT, Integer, Float, create_engine, and_ +from sqlalchemy.orm import sessionmaker +from sqlalchemy.ext.declarative import declarative_base +from fedml.core.common.singleton import Singleton +from .base_db import FedMLBaseDb +from .compute_utils import ComputeUtils +from ..master.server_constants import ServerConstants + +Base = declarative_base() + + +class ComputeStatusDatabase(Singleton, FedMLBaseDb): + COMPUTE_STATUS_DB = "compute-status.db" + + def __init__(self): + super().__init__() + + @staticmethod + def get_instance(): + return ComputeStatusDatabase() + + def get_job_status(self, run_id): + self.open_job_db() + job = self.db_connection.query(FedMLJobStatus). \ + filter(FedMLJobStatus.job_id == f'{run_id}').first() + if job is None: + return + + return job.job_status + + def get_device_status_in_job(self, device_id, run_id): + self.open_job_db() + device = self.db_connection.query(FedMLDeviceStatusInJob). \ + filter(and_(FedMLDeviceStatusInJob.device_id == f'{device_id}', + FedMLDeviceStatusInJob.job_id == f'{run_id}')).first() + + return device.device_status + + def set_job_status(self, run_id, job_status): + self.open_job_db() + job = self.db_connection.query(FedMLJobStatus). \ + filter(FedMLJobStatus.job_id == f'{run_id}').first() + if job is None: + job = FedMLJobStatus(job_id=run_id, job_status=job_status) + self.db_connection.add(job) + self.db_connection.commit() + return + + if run_id is not None: + job.job_id = run_id + if job_status is not None: + job.job_status = job_status + + self.db_connection.commit() + + def set_device_status_in_job(self, run_id, device_id, status): + self.open_job_db() + device = self.db_connection.query(FedMLDeviceStatusInJob). 
\ + filter(and_(FedMLDeviceStatusInJob.device_id == f'{device_id}', + FedMLDeviceStatusInJob.job_id == f'{run_id}')).first() + if device is None: + job = FedMLDeviceStatusInJob(job_id=run_id, device_id=device_id, device_status=status) + self.db_connection.add(job) + self.db_connection.commit() + return + + if run_id is not None: + device.job_id = run_id + if device_id is not None: + device.device_id = device_id + if status is not None: + device.device_status = status + + self.db_connection.commit() + + def set_database_base_dir(self, database_base_dir): + self.db_base_dir = database_base_dir + self.init_db_path() + + def init_db_path(self): + if self.db_base_dir is None: + if not os.path.exists(ServerConstants.get_database_dir()): + os.makedirs(ServerConstants.get_database_dir(), exist_ok=True) + self.db_base_dir = ServerConstants.get_database_dir() + + self.db_path = os.path.join(self.db_base_dir, ComputeStatusDatabase.COMPUTE_STATUS_DB) + + def create_table(self): + self.open_job_db() + try: + Base.metadata.create_all(self.db_engine, checkfirst=True) + except Exception as e: + pass + + def drop_table(self): + self.open_job_db() + try: + Base.metadata.drop_all(self.db_engine, checkfirst=True) + except Exception as e: + pass + + +class FedMLJobStatus(Base): + __tablename__ = 'job_status' + + id = Column(Integer, primary_key=True) + job_id = Column(TEXT) + job_status = Column(TEXT) + timestamp = Column(Integer) + + +class FedMLDeviceStatusInJob(Base): + __tablename__ = 'device_status_in_job' + + id = Column(Integer, primary_key=True) + job_id = Column(TEXT) + device_id = Column(TEXT) + device_status = Column(TEXT) + timestamp = Column(Integer) diff --git a/python/fedml/computing/scheduler/scheduler_core/general_constants.py b/python/fedml/computing/scheduler/scheduler_core/general_constants.py new file mode 100755 index 0000000000..0fbd4881d9 --- /dev/null +++ b/python/fedml/computing/scheduler/scheduler_core/general_constants.py @@ -0,0 +1,235 @@ +import logging +import os +from enum import Enum + +from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants +from fedml.computing.scheduler.comm_utils.run_process_utils import RunProcessUtils +from fedml.computing.scheduler.slave.client_constants import ClientConstants +from fedml.computing.scheduler.master.server_constants import ServerConstants +from fedml.computing.scheduler.model_scheduler import device_client_constants +from fedml.computing.scheduler.model_scheduler import device_server_constants + + +class MarketplaceType(Enum): + SECURE = 1 + COMMUNITY = 2 + + def __str__(self): + return self.name + + @classmethod + def from_str(cls, name: str): + """Get the enum member from a string.""" + if name.upper() in cls.__members__: + return cls[name.upper()] + else: + raise ValueError(f"Invalid marketplace type: {name}") + + +class GeneralConstants: + MSG_TOPIC_REQUEST_JOB_STATUS_PREFIX = f"anywhere/master_agent/request_job_status/" + MSG_TOPIC_REPORT_DEVICE_STATUS_IN_JOB = f"slave_job/slave_agent/report_device_status_in_job" + MSG_TOPIC_SEND_TRAINING_REQUEST_TO_EDGES = "job_runner/master_protocol_manager/send_training_request_to_edges" + + CLIENT_SHELL_BASH = SchedulerConstants.CLIENT_SHELL_BASH + CLIENT_SHELL_PS = SchedulerConstants.CLIENT_SHELL_PS + PLATFORM_WINDOWS = "Windows" + + MSG_MLOPS_CLIENT_STATUS_OFFLINE = "OFFLINE" + MSG_MLOPS_CLIENT_STATUS_PROVISIONING = "PROVISIONING" + MSG_MLOPS_CLIENT_STATUS_IDLE = "IDLE" + MSG_MLOPS_CLIENT_STATUS_UPGRADING = "UPGRADING" + MSG_MLOPS_CLIENT_STATUS_QUEUED = "QUEUED" + 
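The status cache introduced above layers Redis in front of the new `compute-status.db` SQLite tables: writes go to both, reads try Redis first and backfill it from SQLite on a miss. The sketch below models that read-through/write-through flow with plain dictionaries standing in for the Redis and SQLAlchemy layers; the class and variable names are illustrative, not part of the patch.

```python
class StatusStore:
    """Toy model of the ComputeStatusCache pattern: fast cache in front of a durable store."""

    def __init__(self):
        self.cache = {}     # stands in for Redis
        self.durable = {}   # stands in for the compute-status.db SQLite tables

    def save_job_status(self, run_id, status):
        # Write-through: best-effort cache update, then the durable record.
        try:
            self.cache[run_id] = status
        except Exception:
            pass
        self.durable[run_id] = status

    def get_job_status(self, run_id):
        status = self.cache.get(run_id)
        if status is None:
            # Cache miss: fall back to the durable store and backfill the cache.
            status = self.durable.get(run_id)
            if status is not None:
                self.cache[run_id] = status
        return status


store = StatusStore()
store.save_job_status("run-42", "RUNNING")
store.cache.clear()                      # simulate a Redis restart
print(store.get_job_status("run-42"))    # "RUNNING", recovered from the durable store
```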
MSG_MLOPS_CLIENT_STATUS_INITIALIZING = "INITIALIZING" + MSG_MLOPS_CLIENT_STATUS_TRAINING = "TRAINING" + MSG_MLOPS_CLIENT_STATUS_RUNNING = "RUNNING" + MSG_MLOPS_CLIENT_STATUS_STOPPING = "STOPPING" + MSG_MLOPS_CLIENT_STATUS_KILLED = "KILLED" + MSG_MLOPS_CLIENT_STATUS_FAILED = "FAILED" + MSG_MLOPS_CLIENT_STATUS_EXCEPTION = "EXCEPTION" + MSG_MLOPS_CLIENT_STATUS_FINISHED = "FINISHED" + + MSG_MLOPS_SERVER_STATUS_OFFLINE = "OFFLINE" + MSG_MLOPS_SERVER_STATUS_PROVISIONING = "PROVISIONING" + MSG_MLOPS_SERVER_STATUS_IDLE = "IDLE" + MSG_MLOPS_SERVER_STATUS_UPGRADING = "UPGRADING" + MSG_MLOPS_SERVER_STATUS_STARTING = "STARTING" + MSG_MLOPS_SERVER_STATUS_RUNNING = "RUNNING" + MSG_MLOPS_SERVER_STATUS_STOPPING = "STOPPING" + MSG_MLOPS_SERVER_STATUS_KILLED = "KILLED" + MSG_MLOPS_SERVER_STATUS_FAILED = "FAILED" + MSG_MLOPS_SERVER_STATUS_FINISHED = "FINISHED" + MSG_MLOPS_SERVER_STATUS_EXCEPTION = "EXCEPTION" + + MSG_MODELOPS_DEPLOYMENT_STATUS_INITIALIZING = "INITIALIZING" + MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYING = "DEPLOYING" + MSG_MODELOPS_DEPLOYMENT_STATUS_INFERRING = "INFERRING" + MSG_MODELOPS_DEPLOYMENT_STATUS_OVERLOAD = "OVERLOAD" + MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED = "FAILED" + MSG_MODELOPS_DEPLOYMENT_STATUS_RESCALING = "RESCALING" + MSG_MODELOPS_DEPLOYMENT_STATUS_UPDATING = "UPDATING" + MSG_MODELOPS_DEPLOYMENT_STATUS_UPDATING_FAILED = "UPDATING_FAILED" + MSG_MODELOPS_DEPLOYMENT_STATUS_ABORTING = "ABORTING" + MSG_MODELOPS_DEPLOYMENT_STATUS_ABORTED = "ABORTED" + MSG_MODELOPS_DEPLOYMENT_STATUS_DEPLOYED = "DEPLOYED" + MSG_MODELOPS_DEPLOYMENT_STATUS_KILLED = "KILLED" + + MASTER_LOGIN_PROGRAM = "server_login.py" + SLAVE_LOGIN_PROGRAM = "client_login.py" + + CONFIG_KEY_AUTO_DETECT_PUBLIC_IP = "auto_detect_public_ip" + FEDML_OTA_CMD_UPGRADE = "upgrade" + FEDML_OTA_CMD_RESTART = "restart" + + FEDML_LOG_SOURCE_TYPE_MODEL_END_POINT = "MODEL_END_POINT" + + @staticmethod + def get_package_unzip_dir(package_download_dir): + package_unzip_dir = package_download_dir + if not os.path.exists(package_unzip_dir): + os.makedirs(package_unzip_dir, exist_ok=True) + return package_unzip_dir + + @staticmethod + def get_filename_and_extension(url): + return ClientConstants.get_filename_and_extension(url) + + @staticmethod + def generate_yaml_doc(run_config_object, yaml_file): + ClientConstants.generate_yaml_doc(run_config_object, yaml_file) + + @staticmethod + def execute_commands_with_live_logs(cmds, join='&&', should_write_log_file=True, + callback=None, error_processor=None): + return ClientConstants.execute_commands_with_live_logs( + cmds, join=join, should_write_log_file=should_write_log_file, + callback=callback, error_processor=error_processor + ) + + @staticmethod + def cleanup_run_process(run_id, is_master=False): + if is_master: + ServerConstants.cleanup_run_process(run_id) + else: + ClientConstants.cleanup_run_process(run_id) + + @staticmethod + def cleanup_learning_process(run_id, data_dir=None): + RunProcessUtils.cleanup_run_process( + run_id, data_dir, ClientConstants.LOCAL_RUNNER_INFO_DIR_NAME, + info_file_prefix=SchedulerConstants.RUN_PROCESS_TYPE_USER_PROCESS) + + @staticmethod + def cleanup_bootstrap_process(run_id, data_dir=None): + RunProcessUtils.cleanup_run_process( + run_id, data_dir, ClientConstants.LOCAL_RUNNER_INFO_DIR_NAME, + info_file_prefix=SchedulerConstants.RUN_PROCESS_TYPE_BOOTSTRAP_PROCESS) + + @staticmethod + def save_learning_process(run_id, learning_id, data_dir=None): + RunProcessUtils.save_run_process( + run_id, learning_id, data_dir, ClientConstants.LOCAL_RUNNER_INFO_DIR_NAME, + 
info_file_prefix=SchedulerConstants.RUN_PROCESS_TYPE_USER_PROCESS) + + @staticmethod + def save_bootstrap_process(run_id, process_id, data_dir=None): + RunProcessUtils.save_run_process( + run_id, process_id, data_dir, ClientConstants.LOCAL_RUNNER_INFO_DIR_NAME, + info_file_prefix=SchedulerConstants.RUN_PROCESS_TYPE_BOOTSTRAP_PROCESS) + + @staticmethod + def save_run_process(run_id, process_id, is_master=False): + RunProcessUtils.save_run_process( + run_id, process_id, ServerConstants.get_data_dir() if is_master else ClientConstants.get_data_dir(), + ClientConstants.LOCAL_RUNNER_INFO_DIR_NAME) + + @staticmethod + def get_learning_process_list(run_id, is_master=False): + return RunProcessUtils.get_run_process_list( + run_id, ServerConstants.get_data_dir() if is_master else ClientConstants.get_data_dir(), + ClientConstants.LOCAL_RUNNER_INFO_DIR_NAME, + info_file_prefix=SchedulerConstants.RUN_PROCESS_TYPE_USER_PROCESS) + + @staticmethod + def get_launch_fedml_home_dir(is_master=False): + return ServerConstants.get_fedml_home_dir() if is_master else ClientConstants.get_fedml_home_dir() + + @staticmethod + def get_deploy_fedml_home_dir(is_master=False): + return device_server_constants.ServerConstants.get_fedml_home_dir() if is_master \ + else device_client_constants.ClientConstants.get_fedml_home_dir() + + @staticmethod + def get_launch_log_file_dir(is_master=False): + return ServerConstants.get_log_file_dir() if is_master else ClientConstants.get_log_file_dir() + + @staticmethod + def get_deploy_log_file_dir(is_master=False): + return device_server_constants.ServerConstants.get_log_file_dir() if is_master \ + else device_client_constants.ClientConstants.get_log_file_dir() + + @staticmethod + def get_launch_data_dir(is_master=False): + return ServerConstants.get_data_dir() if is_master else ClientConstants.get_data_dir() + + @staticmethod + def get_deploy_data_dir(is_master=False): + return device_server_constants.ServerConstants.get_data_dir() if is_master \ + else device_client_constants.ClientConstants.get_data_dir() + + @staticmethod + def get_deploy_docker_location_file(is_master=False): + return device_server_constants.ServerConstants.get_docker_location_file() if is_master \ + else device_client_constants.ClientConstants.get_docker_location_file() + + @staticmethod + def get_launch_docker_location_file(is_master=False): + return ServerConstants.get_docker_location_file() if is_master \ + else ClientConstants.get_docker_location_file() + + @staticmethod + def get_local_ip(): + import socket + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + conn = s.connect(('8.8.8.8', 53)) + ip = s.getsockname()[0] + s.close() + return ip + + @staticmethod + def get_public_ip(): + import requests + ip = None + try: + ip = requests.get('https://checkip.amazonaws.com').text.strip() + except Exception as e: + logging.info("Failed to get public ip: {}".format(e)) + return ip + + @staticmethod + def get_ip_address(request_json, infer_host=None): + # OPTION 1: Use local ip + # ip = GeneralConstants.get_local_ip() + # + # # OPTION 2: Auto detect public ip + # if "parameters" in request_json and \ + # GeneralConstants.CONFIG_KEY_AUTO_DETECT_PUBLIC_IP in request_json["parameters"] and \ + # request_json["parameters"][GeneralConstants.CONFIG_KEY_AUTO_DETECT_PUBLIC_IP]: + ip = GeneralConstants.get_public_ip() + logging.info("Auto detect public ip for master: " + ip) + + # OPTION 3: Use user indicated ip + if infer_host is not None and infer_host != "127.0.0.1" and infer_host != "localhost": + ip = infer_host 
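`get_public_ip()` above returns `None` when the checkip request fails, so code that concatenates the result straight into a log line can raise a `TypeError`. The sketch below is a defensive variant of the same resolution order (auto-detected public IP, then an explicit `infer_host` override, with a local-IP fallback); it is illustrative only and not the patched implementation.

```python
import logging
import socket
import requests


def detect_public_ip(timeout=5):
    """Best-effort public IP lookup; returns None on any failure (mirrors get_public_ip)."""
    try:
        return requests.get("https://checkip.amazonaws.com", timeout=timeout).text.strip()
    except Exception as e:
        logging.info("Failed to get public ip: %s", e)
        return None


def detect_local_ip():
    """Local interface IP via a UDP 'connection' to a well-known address (mirrors get_local_ip)."""
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        s.connect(("8.8.8.8", 53))
        return s.getsockname()[0]
    finally:
        s.close()


def resolve_ip(infer_host=None):
    # Same precedence as get_ip_address: auto-detected public IP, overridden by an
    # explicit infer_host, with a local-IP fallback so the result is never None.
    ip = detect_public_ip() or detect_local_ip()
    if infer_host and infer_host not in ("127.0.0.1", "localhost"):
        ip = infer_host
    logging.info("Resolved master ip: %s", ip)
    return ip
```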
+ + return ip + + @staticmethod + def get_topic_complete_job(server_id): + topic_complete_job = f"status_center/master_agent_{server_id}/complete_job" + return topic_complete_job + + @staticmethod + def get_payload_complete_job(run_id, server_id): + payload_complete_job = {"runId": run_id, "serverId": server_id} + return payload_complete_job diff --git a/python/fedml/computing/scheduler/scheduler_core/master_api_daemon.py b/python/fedml/computing/scheduler/scheduler_core/master_api_daemon.py index 5cebf757d6..5876a787ce 100755 --- a/python/fedml/computing/scheduler/scheduler_core/master_api_daemon.py +++ b/python/fedml/computing/scheduler/scheduler_core/master_api_daemon.py @@ -1,7 +1,8 @@ from fastapi import FastAPI, Request -from .log_manager import LogsManager -from .metrics_manager import MetricsManager -from ..comm_utils import sys_utils +from fedml.computing.scheduler.scheduler_core.log_manager import LogsManager +from fedml.computing.scheduler.scheduler_core.metrics_manager import MetricsManager +from fedml.computing.scheduler.comm_utils import sys_utils +from fedml.computing.scheduler.scheduler_core.compute_cache_manager import ComputeCacheManager import os @@ -52,6 +53,19 @@ async def update_log(request: Request): async def ready(): return {"status": "Success"} + @api.get("/get_job_status") + async def get_job_status(job_id): + ComputeCacheManager.get_instance().set_redis_params() + job_status = ComputeCacheManager.get_instance().get_status_cache().get_job_status(job_id) + return {"job_status": job_status} + + @api.get("/get_device_status_in_job") + async def get_device_status_in_job(job_id, device_id): + ComputeCacheManager.get_instance().set_redis_params() + device_status_in_job = ComputeCacheManager.get_instance().get_status_cache().get_device_status_in_job( + job_id, device_id) + return {"device_status_in_job": device_status_in_job} + import uvicorn port = 30800 if sys_utils.check_port("localhost", port): @@ -59,7 +73,6 @@ async def ready(): cur_dir = os.path.dirname(__file__) fedml_base_dir = os.path.dirname(os.path.dirname(os.path.dirname(cur_dir))) - uvicorn.run(api, host="0.0.0.0", port=port, reload=True, reload_delay=3, reload_dirs=fedml_base_dir) - + uvicorn.run(api, host="0.0.0.0", port=port) diff --git a/python/fedml/computing/scheduler/scheduler_core/message_center.py b/python/fedml/computing/scheduler/scheduler_core/message_center.py index 2148b0c5ef..dbe11700a0 100755 --- a/python/fedml/computing/scheduler/scheduler_core/message_center.py +++ b/python/fedml/computing/scheduler/scheduler_core/message_center.py @@ -1,3 +1,4 @@ +import json import logging import os import threading @@ -7,20 +8,29 @@ import multiprocessing from multiprocessing import Process, Queue import queue +from os.path import expanduser from fedml.core.distributed.communication.mqtt.mqtt_manager import MqttManager +from ..slave.client_constants import ClientConstants from ....core.mlops.mlops_metrics import MLOpsMetrics from operator import methodcaller +from .message_common import FedMLMessageEntity, FedMLMessageRecord -class FedMLMessageCenter: +class FedMLMessageCenter(object): FUNC_SETUP_MESSAGE_CENTER = "setup_message_center" FUNC_REBUILD_MESSAGE_CENTER = "rebuild_message_center" - - def __init__(self, agent_config=None, message_queue=None, listener_message_queue=None): + ENABLE_SAVE_MESSAGE_TO_FILE = True + PUBLISH_MESSAGE_RETRY_TIMEOUT = 60 * 1000.0 + PUBLISH_MESSAGE_RETRY_COUNT = 3 + MESSAGE_SENT_RECORDS_FILE = "message-sent-records.log" + MESSAGE_SENT_SUCCESS_RECORDS_FILE = 
"message-sent-success-records.log" + MESSAGE_RECEIVED_RECORDS_FILE = "message-received-records.log" + + def __init__(self, agent_config=None, sender_message_queue=None, listener_message_queue=None): self.sender_agent_config = agent_config self.listener_agent_config = agent_config - self.message_queue = message_queue + self.sender_message_queue = sender_message_queue self.message_event = None self.message_center_process = None self.sender_mqtt_mgr = None @@ -32,9 +42,15 @@ def __init__(self, agent_config=None, message_queue=None, listener_message_queue self.listener_payloads = dict() self.listener_handler_funcs = dict() self.listener_handler_object = None - self.listener_message_queue = None + self.listener_message_queue = listener_message_queue self.listener_message_event = None self.listener_message_center_process = None + self.sender_message_list = list() + self.receiver_message_list = list() + self.published_message_ids = list() + self.retry_sending_count_map = dict() + self.constants = FedMLMessageCenterConstants() + self.message_center_name = None def __repr__(self): return "<{klass} @{id:x} {attrs}>".format( @@ -64,6 +80,10 @@ def on_sender_mqtt_connected(self, mqtt_client_object): self.sender_mqtt_is_connected = True self.sender_mqtt_lock.release() + def on_sender_mqtt_published(self, mqtt_client_object, obj, mid): + self.published_message_ids.append({"message_id": mid, "timestamp": time.time_ns()/100.0/1000.0}) + self.save_published_message_record(mid) + def setup_sender_mqtt_mgr(self): if self.sender_mqtt_mgr is not None: return @@ -82,6 +102,7 @@ def setup_sender_mqtt_mgr(self): self.sender_mqtt_mgr.add_connected_listener(self.on_sender_mqtt_connected) self.sender_mqtt_mgr.add_disconnected_listener(self.on_sender_mqtt_disconnected) + self.sender_mqtt_mgr.add_published_listener(self.on_sender_mqtt_published) self.sender_mqtt_mgr.connect() self.sender_mqtt_mgr.loop_start() @@ -90,6 +111,7 @@ def setup_sender_mqtt_mgr(self): self.sender_mlops_metrics.set_messenger(self) def release_sender_mqtt_mgr(self): + # noinspection PyBroadException try: if self.sender_mqtt_mgr is not None: self.sender_mqtt_mgr.loop_stop() @@ -105,17 +127,19 @@ def release_sender_mqtt_mgr(self): f"Failed to release sender mqtt manager with Exception {e}. 
Traceback: {traceback.format_exc()}") pass - def get_message_queue(self): - return self.message_queue + def get_sender_message_queue(self): + return self.sender_message_queue - def start_sender(self): - self.message_queue = Queue() + def start_sender(self, message_center_name=None): + self.sender_message_queue = Queue() self.message_event = multiprocessing.Event() self.message_event.clear() - message_center = FedMLMessageCenter(agent_config=self.sender_agent_config, message_queue=self.message_queue) + message_center = FedMLMessageCenter(agent_config=self.sender_agent_config, + sender_message_queue=self.sender_message_queue) self.message_center_process = Process( target=message_center.run_sender, args=( - self.message_event, self.message_queue, + self.message_event, self.sender_message_queue, + message_center_name ) ) self.message_center_process.start() @@ -134,39 +158,93 @@ def check_message_stop_event(self): def send_message(self, topic, payload, run_id=None): message_entity = FedMLMessageEntity(topic=topic, payload=payload, run_id=run_id) - self.message_queue.put(message_entity.get_message_body()) + self.sender_message_queue.put(message_entity.get_message_body()) def send_message_json(self, topic, payload): self.send_message(topic, payload) - def run_sender(self, message_event, message_queue): + def retry_sending_undelivered_message(self): + for sender_message in self.sender_message_list: + # Check if the message is published. + message_record = FedMLMessageRecord(json_record=sender_message) + is_published = False + for published_message in self.published_message_ids: + published_message_record = FedMLMessageRecord(json_record=published_message) + if published_message_record.message_id == message_record.message_id: + is_published = True + break + if is_published: + continue + + # Retry to send the unpublished message based on the timeout value + timeout_ms = time.time() * 1000.0 - message_record.timestamp + if timeout_ms >= FedMLMessageCenter.PUBLISH_MESSAGE_RETRY_TIMEOUT and \ + self.retry_sending_count_map.get(message_record.message_id, 0) < \ + FedMLMessageCenter.PUBLISH_MESSAGE_RETRY_COUNT: + # Send the message + message_entity = FedMLMessageEntity(message_body=message_record.message_body) + message_id = self.sender_mqtt_mgr.send_message_json(message_entity.topic, message_entity.payload) + self.retry_sending_count_map[message_record.message_id] += 1 + + # Generate the new message record. 
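The retry path above republishes a queued message only when two conditions hold: the broker has not acknowledged it within `PUBLISH_MESSAGE_RETRY_TIMEOUT`, and it has been retried fewer than `PUBLISH_MESSAGE_RETRY_COUNT` times. A simplified, standalone model of that decision is shown below; the helper and variable names are illustrative, and the counter update uses `dict.get` so the first retry does not assume the key already exists.

```python
import time

PUBLISH_MESSAGE_RETRY_TIMEOUT_MS = 60 * 1000.0   # mirrors FedMLMessageCenter.PUBLISH_MESSAGE_RETRY_TIMEOUT
PUBLISH_MESSAGE_RETRY_COUNT = 3                  # mirrors FedMLMessageCenter.PUBLISH_MESSAGE_RETRY_COUNT


def should_retry(sent_timestamp_ms, message_id, published_ids, retry_counts):
    """Return True if an unacknowledged message should be republished."""
    if message_id in published_ids:
        return False  # broker already confirmed delivery (on_sender_mqtt_published fired)
    age_ms = time.time() * 1000.0 - sent_timestamp_ms
    return (age_ms >= PUBLISH_MESSAGE_RETRY_TIMEOUT_MS
            and retry_counts.get(message_id, 0) < PUBLISH_MESSAGE_RETRY_COUNT)


# Illustrative bookkeeping only; real records come from FedMLMessageRecord.
retry_counts = {}
published_ids = set()
msg_id, sent_at = "mid-1", time.time() * 1000.0 - 61_000

if should_retry(sent_at, msg_id, published_ids, retry_counts):
    # ... republish via the MQTT manager here ...
    retry_counts[msg_id] = retry_counts.get(msg_id, 0) + 1
```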
+ sent_message_record = FedMLMessageRecord(message_id=message_id, + message_body=message_record.message_body) + + # Save the message + self.save_message_record(message_entity.run_id, message_entity.device_id, sent_message_record) + + def run_sender(self, message_event, message_queue, message_center_name): self.message_event = message_event - self.message_queue = message_queue + self.sender_message_queue = message_queue + self.message_center_name = message_center_name self.setup_sender_mqtt_mgr() - time.sleep(5) while True: + message_entity = None + message_body = None try: self.check_message_stop_event() except MessageCenterStoppedException as e: break + # noinspection PyBroadException try: + # Setup the mqtt connection self.setup_sender_mqtt_mgr() + # Get the message from the queue try: - message_body = self.message_queue.get(block=False, timeout=0.1) + message_body = message_queue.get(block=False, timeout=0.1) except queue.Empty as e: # If queue is empty, then break loop message_body = None if message_body is None: time.sleep(0.1) + # self.retry_sending_undelivered_message() continue + # Generate the message entity object message_entity = FedMLMessageEntity(message_body=message_body) - self.sender_mqtt_mgr.send_message_json(message_entity.topic, message_entity.payload) + + # Send the message to mqtt server + message_id = self.sender_mqtt_mgr.send_message_json(message_entity.topic, message_entity.payload) + + # Generate the message record. + message_record = FedMLMessageRecord(message_id=message_id, message_body=message_body) + + # Cache the message + self.cache_message_record(message_record, is_sender=True) + + # Save the message + self.save_message_record(message_entity.run_id, message_entity.device_id, message_record) + except Exception as e: - logging.info( - f"Failed to send the message with topic {message_entity.topic}, payload {message_entity.payload}, {traceback.format_exc()}") + if message_entity is not None: + logging.info( + f"Failed to send the message with topic {message_entity.topic}, " + f"payload {message_entity.payload}, {traceback.format_exc()}" + ) + else: + logging.info(f"Failed to send the message with body {message_body}, {traceback.format_exc()}") self.release_sender_mqtt_mgr() @@ -194,7 +272,9 @@ def release_listener_mqtt_mgr(self): self.listener_mqtt_mgr = None except Exception as e: logging.error( - f"Failed to release listener mqtt manager with Exception {e}. Traceback: {traceback.format_exc()}") + f"Failed to release listener mqtt manager with Exception {e}. 
" + f"Traceback: {traceback.format_exc()}" + ) pass def add_message_listener(self, topic, listener_func): @@ -207,23 +287,34 @@ def remove_message_listener(self, topic): self.listener_topics.remove(topic) self.listener_handler_funcs.pop(topic) - def get_runner(self): + def get_message_runner(self): return None - def start_listener(self, sender_message_queue=None, agent_config=None): + def get_listener_message_queue(self): + return self.listener_message_queue + + def setup_listener_message_queue(self): + self.listener_message_queue = Queue() + + def start_listener(self, sender_message_queue=None, listener_message_queue=None, agent_config=None, message_center_name=None): if self.listener_message_center_process is not None: return - self.listener_message_queue = Queue() + if listener_message_queue is None: + if self.listener_message_queue is None: + self.listener_message_queue = Queue() + else: + self.listener_message_queue = listener_message_queue self.listener_message_event = multiprocessing.Event() self.listener_message_event.clear() self.listener_agent_config = agent_config - message_runner = self.get_runner() + message_runner = self.get_message_runner() message_runner.listener_agent_config = agent_config self.listener_message_center_process = Process( target=message_runner.run_listener_dispatcher, args=( self.listener_message_event, self.listener_message_queue, - self.listener_handler_funcs, sender_message_queue + self.listener_handler_funcs, sender_message_queue, + message_center_name ) ) self.listener_message_center_process.start() @@ -236,6 +327,12 @@ def check_listener_message_stop_event(self): def listener_message_dispatch_center(self, topic, payload): self.receive_message_json(topic, payload) + def listener_message_passthrough_dispatch_center(self, message): + payload_obj = json.loads(message.payload) + payload_obj["is_retain"] = message.retain + payload = json.dumps(payload_obj) + self.receive_message_json(message.topic, payload) + def receive_message(self, topic, payload, run_id=None): message_entity = FedMLMessageEntity(topic=topic, payload=payload, run_id=run_id) self.listener_message_queue.put(message_entity.get_message_body()) @@ -252,10 +349,13 @@ def unsubscribe_msg(self, topic): self.listener_mqtt_mgr.unsubscribe_msg(topic) def run_listener_dispatcher( - self, message_event, message_queue, listener_funcs, sender_message_queue): + self, message_event, message_queue, listener_funcs, sender_message_queue, + message_center_name + ): self.listener_message_event = message_event self.listener_message_queue = message_queue self.listener_handler_funcs = listener_funcs + self.message_center_name = message_center_name self.setup_listener_mqtt_mgr() @@ -265,51 +365,110 @@ def run_listener_dispatcher( methodcaller(FedMLMessageCenter.FUNC_REBUILD_MESSAGE_CENTER, sender_message_queue)(self) while True: + message_entity = None try: self.check_listener_message_stop_event() except MessageCenterStoppedException as e: break + # noinspection PyBroadException try: + # Setup the mqtt connection self.setup_listener_mqtt_mgr() + # Get the message from the queue try: - message_body = self.listener_message_queue.get(block=False, timeout=0.1) + message_body = message_queue.get(block=False, timeout=0.1) except queue.Empty as e: # If queue is empty, then break loop message_body = None if message_body is None: time.sleep(0.1) continue + # Generate the message entity object message_entity = FedMLMessageEntity(message_body=message_body) + # Generate the message record + message_record = 
FedMLMessageRecord(message_id=str(uuid.uuid4()), message_body=message_body) + + # Cache the message + self.cache_message_record(message_record, is_sender=False) + + # Save the message + self.save_message_record(message_entity.run_id, message_entity.device_id, + message_record, is_sender=False) + + # Dispatch the message to corresponding handler message_handler_func_name = self.listener_handler_funcs.get(message_entity.topic, None) if message_handler_func_name is not None: methodcaller(message_handler_func_name, message_entity.topic, message_entity.payload)(self) except Exception as e: - logging.info( - f"Failed to dispatch messages with topic {message_entity.topic}, payload {message_entity.payload}, {traceback.format_exc()}") - + if message_entity is not None: + logging.info( + f"Failed to dispatch messages with topic {message_entity.topic}, " + f"payload {message_entity.payload}, {traceback.format_exc()}") + else: + logging.info(f"Failed to dispatch messages: {traceback.format_exc()}") self.release_listener_mqtt_mgr() -class FedMLMessageEntity(object): - def __init__(self, topic=None, payload=None, run_id=None, message_body: dict = None): - self.topic = topic - self.payload = payload - self.run_id = run_id - if message_body is not None: - self.from_message_body(message_body=message_body) + def cache_message_record(self, message_record, is_sender=True): + # Save the message to the cached list. + if is_sender: + self.sender_message_list.append(message_record.get_json_record()) + else: + self.receiver_message_list.append(message_record.get_json_record()) + + def save_message_record(self, run_id, device_id, message_record, is_sender=True): + # Check if we enable to log messages to file + if not FedMLMessageCenter.ENABLE_SAVE_MESSAGE_TO_FILE: + return + + # Log messages to file + if is_sender: + print(f"save sent message record: {message_record.get_json_record()}") + else: + print(f"save received message record: {message_record.get_json_record()}") + saved_message_file = os.path.join( + self.constants.message_log_dir, + self.message_center_name, + FedMLMessageCenter.MESSAGE_SENT_RECORDS_FILE if is_sender else + FedMLMessageCenter.MESSAGE_RECEIVED_RECORDS_FILE + ) + os.makedirs(os.path.dirname(saved_message_file), exist_ok=True) + with open(saved_message_file, "a+") as f: + f.writelines([json.dumps(message_record.get_json_record()) + "\n"]) - def from_message_body(self, message_body: dict = None): - self.topic = message_body.get("topic", None) - self.payload = message_body.get("payload", None) - self.run_id = message_body.get("run_id", None) + def save_published_message_record(self, message_id): + # Check if we enable to log messages to file + if not FedMLMessageCenter.ENABLE_SAVE_MESSAGE_TO_FILE: + return - def get_message_body(self): - message_body = {"topic": self.topic, "payload": self.payload, "run_id": self.run_id} - return message_body + # Log published message ids to file + message_record = {"message_id": message_id, "timestamp": time.time_ns()/1000.0/1000.0} + published_msg_record_file = os.path.join( + self.constants.message_log_dir, self.message_center_name, + FedMLMessageCenter.MESSAGE_SENT_SUCCESS_RECORDS_FILE) + os.makedirs(os.path.dirname(published_msg_record_file), exist_ok=True) + print(f"save sent success message record: {message_record}") + with open(published_msg_record_file, "a+") as f: + f.writelines([json.dumps(message_record) + "\n"]) + + @staticmethod + def rebuild_message_center_from_queue(sender_message_queue, listener_message_queue=None): + message_center = 
FedMLMessageCenter(sender_message_queue=sender_message_queue, + listener_message_queue=listener_message_queue) + return message_center class MessageCenterStoppedException(Exception): """ Message center stopped. """ pass + + +class FedMLMessageCenterConstants: + def __init__(self): + global_services_dir = ClientConstants.get_global_services_dir() + self.home_dir = expanduser("~") + self.message_center_dir = os.path.join(global_services_dir, "message_center") + self.message_log_dir = os.path.join(self.message_center_dir, "logs") + os.makedirs(self.message_log_dir, exist_ok=True) diff --git a/python/fedml/computing/scheduler/scheduler_core/message_common.py b/python/fedml/computing/scheduler/scheduler_core/message_common.py new file mode 100755 index 0000000000..13b99ff39d --- /dev/null +++ b/python/fedml/computing/scheduler/scheduler_core/message_common.py @@ -0,0 +1,79 @@ +import json +import time + + +class FedMLMessageEntity(object): + def __init__(self, topic=None, payload=None, run_id=None, device_id=None, message_body: dict = None): + self.topic = topic + self.payload = payload + self.run_id = run_id + self.device_id = device_id + if message_body is not None: + self.from_message_body(message_body=message_body) + + def from_message_body(self, message_body: dict = None): + self.topic = message_body.get("topic", None) + self.payload = message_body.get("payload", None) + if self.payload is not None: + payload_json = json.loads(self.payload) + self.run_id = payload_json.get("run_id", None) + self.run_id = payload_json.get("runId", None) if self.run_id is None else self.run_id + self.device_id = payload_json.get("edge_id", None) + self.device_id = payload_json.get("ID", None) if self.device_id is None else self.device_id + + def get_message_body(self): + message_body = {"topic": self.topic, "payload": self.payload, "run_id": self.run_id} + return message_body + + +class FedMLMessageRecord(object): + def __init__(self, message_id=None, message_body=None, json_record=None): + self.message_id = message_id + self.message_body = message_body + self.timestamp = time.time_ns() / 1000.0 / 1000.0 + if json_record is not None: + self.from_message_record(json_record=json_record) + + def get_json_record(self): + return {"message_id": self.message_id, "message_body": self.message_body, "timestamp": self.timestamp} + + def from_message_record(self, json_record: dict = None): + self.message_id = json_record.get("message_id", None) + self.message_body = json_record.get("message_body", None) + self.timestamp = json_record.get("timestamp", None) + + +class FedMLStatusEntity(object): + def __init__(self, topic=None, payload=None, status_msg_body: dict = None): + self.topic = topic + self.payload = payload + self.run_id = None + self.edge_id = None + self.server_id = None + self.status = None + if status_msg_body is not None: + self.from_message_body(status_msg_body=status_msg_body) + + def from_message_body(self, status_msg_body: dict = None): + self.topic = status_msg_body.get("topic", None) + self.payload = status_msg_body.get("payload", None) + if self.payload is not None: + payload_json = json.loads(self.payload) + self.run_id = payload_json.get("run_id", None) + self.run_id = payload_json.get("runId", None) if self.run_id is None else self.run_id + self.edge_id = payload_json.get("edge_id", None) + self.server_id = payload_json.get("server_id", None) + self.status = payload_json.get("status", None) + + def get_message_body(self): + status_msg_body = {"topic": self.topic, "payload": self.payload, "run_id": 
self.run_id} + return status_msg_body + + +class LogArgs: + def __init__(self, role=None, edge_id=None, server_id=None, log_server_url=None, log_file_dir=None): + self.role = role + self.edge_id = edge_id + self.server_id = server_id + self.log_server_url = log_server_url + self.log_file_dir = log_file_dir diff --git a/python/fedml/computing/scheduler/scheduler_core/ota_upgrade.py b/python/fedml/computing/scheduler/scheduler_core/ota_upgrade.py new file mode 100755 index 0000000000..e32f1df806 --- /dev/null +++ b/python/fedml/computing/scheduler/scheduler_core/ota_upgrade.py @@ -0,0 +1,99 @@ +import logging +import os +import time +import traceback +import fedml +from fedml.computing.scheduler.comm_utils import sys_utils +from .general_constants import GeneralConstants + + +class FedMLOtaUpgrade: + LOCAL_RUNNER_INFO_DIR_NAME = 'runner_infos' + STATUS_IDLE = "IDLE" + + def __init__(self, edge_id=None): + self.edge_id = edge_id + self.version = fedml.get_env_version() + + def ota_upgrade(self, payload, request_json, status_reporter=None, + is_master=False, is_deploy=False): + run_id = request_json["runId"] + force_ota = False + ota_version = None + + try: + run_config = request_json.get("run_config", None) + parameters = run_config.get("parameters", None) + common_args = parameters.get("common_args", None) + force_ota = common_args.get("force_ota", False) if common_args is not None else False + ota_version = common_args.get("ota_version", None) if common_args is not None else None + except Exception as e: + logging.error( + f"Failed to get ota upgrade parameters with Exception {e}. Traceback: {traceback.format_exc()}") + pass + + if force_ota and ota_version is not None: + should_upgrade = True if ota_version != fedml.__version__ else False + upgrade_version = ota_version + else: + try: + fedml_is_latest_version, local_ver, remote_ver = sys_utils.check_fedml_is_latest_version(self.version) + except Exception as e: + logging.error(f"Failed to check fedml version with Exception {e}. Traceback: {traceback.format_exc()}") + return + + should_upgrade = False if fedml_is_latest_version else True + upgrade_version = remote_ver + + if should_upgrade: + FedMLOtaUpgrade._save_upgrading_job( + run_id, self.edge_id, payload, is_master=is_master, is_deploy=is_deploy + ) + if status_reporter is not None: + if is_master: + status_reporter.report_server_id_status( + run_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_UPGRADING, edge_id=self.edge_id, + server_id=self.edge_id, server_agent_id=self.edge_id) + else: + status_reporter.report_client_id_status( + self.edge_id, GeneralConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING, run_id=run_id) + + logging.info(f"Upgrade to version {upgrade_version} ...") + + sys_utils.do_upgrade(self.version, upgrade_version) + raise Exception("Restarting after upgraded...") + + @staticmethod + def process_ota_upgrade_msg(): + os.system("pip install -U fedml") + + @staticmethod + def _save_upgrading_job(run_id, edge_id, payload, is_master=False, is_deploy=False): + if is_master and is_deploy: + from ..model_scheduler.device_server_data_interface import FedMLServerDataInterface + FedMLServerDataInterface.get_instance(). \ + save_started_job(run_id, edge_id, time.time(), + GeneralConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING, + GeneralConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING, + payload) + elif is_master and not is_deploy: + from ..master.server_data_interface import FedMLServerDataInterface + FedMLServerDataInterface.get_instance(). 
\ + save_started_job(run_id, edge_id, time.time(), + GeneralConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING, + GeneralConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING, + payload) + elif not is_master and is_deploy: + from ..model_scheduler.device_client_data_interface import FedMLClientDataInterface + FedMLClientDataInterface.get_instance(). \ + save_started_job(run_id, edge_id, time.time(), + GeneralConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING, + GeneralConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING, + payload) + elif not is_master and not is_deploy: + from ..slave.client_data_interface import FedMLClientDataInterface + FedMLClientDataInterface.get_instance(). \ + save_started_job(run_id, edge_id, time.time(), + GeneralConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING, + GeneralConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING, + payload) diff --git a/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner.py b/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner.py new file mode 100755 index 0000000000..cf700e9a9d --- /dev/null +++ b/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner.py @@ -0,0 +1,649 @@ +import json +import logging +import multiprocessing +import os +import platform +import random +import shutil +import time +import traceback +import zipfile +import queue +from ..comm_utils.constants import SchedulerConstants +from ..comm_utils.job_utils import JobRunnerUtils, DockerArgs +from ..scheduler_entry.constants import Constants +from ....core.mlops import MLOpsMetrics, MLOpsRuntimeLogDaemon +from ....core.mlops.mlops_device_perfs import MLOpsDevicePerfStats +from ..comm_utils.yaml_utils import load_yaml_config +from .general_constants import GeneralConstants +from ..comm_utils.sys_utils import get_python_program +from ..comm_utils import sys_utils +from ....core.mlops.mlops_utils import MLOpsUtils +from ..scheduler_core.message_center import FedMLMessageCenter +from ..scheduler_core.status_center import FedMLStatusCenter +from abc import ABC, abstractmethod +import ssl + + +class RunnerError(Exception): + """ Runner stopped. """ + pass + + +class RunnerCompletedError(Exception): + """ Runner completed. 
""" + pass + + +class FedMLSchedulerBaseJobRunner(ABC): + + def __init__(self, args, edge_id=0, request_json=None, agent_config=None, run_id=0, + cuda_visible_gpu_ids_str=None, is_master_runner=False, + agent_data_dir=None, agent_package_download_dir=None, + agent_package_unzip_dir=None, agent_log_file_dir=None): + self.args = args + self.is_master_runner = is_master_runner + self.agent_data_dir = agent_data_dir + self.agent_package_download_dir = agent_package_download_dir + self.agent_package_unzip_dir = agent_package_unzip_dir + self.agent_log_file_dir = agent_log_file_dir + self.prev_download_progress = 0 + self.run_process_event = None + self.run_process_completed_event = None + self.run_process = None + self.running_request_json = dict() + self.start_request_json = None + self.edge_id = edge_id + self.edge_user_name = None + self.edge_extra_url = None + self.run_id = run_id + self.unique_device_id = args.unique_device_id + self.request_json = request_json + self.version = args.version + self.device_id = args.device_id + self.cur_dir = os.path.split(os.path.realpath(__file__))[0] + self.agent_config = agent_config + self.mlops_metrics = None + self.status_reporter = None + self.ntp_offset = MLOpsUtils.get_ntp_offset() + self.server_id = None + self.fedml_config_object = None + self.package_type = SchedulerConstants.JOB_PACKAGE_TYPE_DEFAULT + self.cuda_visible_gpu_ids_str = cuda_visible_gpu_ids_str + self.user_name = None + self.general_edge_id = None + self.message_center = None + self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES = { + "${FEDSYS.RUN_ID}": "", + "${FEDSYS.PRIVATE_LOCAL_DATA}": "", + "${FEDSYS.CLIENT_ID_LIST}": "", + "${FEDSYS.SYNTHETIC_DATA_URL}": "", + "${FEDSYS.IS_USING_LOCAL_DATA}": "", + "${FEDSYS.CLIENT_NUM}": "", + "${FEDSYS.CLIENT_INDEX}": "", + "${FEDSYS.CLIENT_OBJECT_LIST}": "", + "${FEDSYS.LOG_SERVER_URL}": "", + } + self.is_deployment_runner = False + + def __repr__(self): + return "<{klass} @{id:x} {attrs}>".format( + klass=self.__class__.__name__, + id=id(self) & 0xFFFFFF, + attrs=" ".join("{}={!r}".format(k, v) for k, v in self.__dict__.items()), + ) + + def build_dynamic_constrain_variables(self, run_id, run_config): + data_config = run_config.get("data_config", {}) + server_edge_id_list = self.request_json["edgeids"] + local_edge_id_list = list() + local_edge_id_list.append(int(self.edge_id)) + is_using_local_data = 0 + private_data_dir = data_config.get("privateLocalData", "") + synthetic_data_url = data_config.get("syntheticDataUrl", "") + edges = self.request_json["edges"] + # if private_data_dir is not None \ + # and len(str(private_data_dir).strip(' ')) > 0: + # is_using_local_data = 1 + if private_data_dir is None or len(str(private_data_dir).strip(" ")) <= 0: + params_config = run_config.get("parameters", None) + private_data_dir = self.agent_data_dir + if synthetic_data_url is None or len(str(synthetic_data_url)) <= 0: + synthetic_data_url = private_data_dir + + self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.RUN_ID}"] = run_id + self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.PRIVATE_LOCAL_DATA}"] = private_data_dir.replace(" ", "") + self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.CLIENT_ID_LIST}"] = \ + str(self.get_client_id_list(server_edge_id_list)).replace(" ", "") + self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.SYNTHETIC_DATA_URL}"] = synthetic_data_url.replace(" ", "") + self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.IS_USING_LOCAL_DATA}"] = str(is_using_local_data) + self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.CLIENT_NUM}"] = 
len(server_edge_id_list) + if not self.is_master_runner: + self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.CLIENT_INDEX}"] = 1 + for cur_index, id_value in enumerate(server_edge_id_list): + if str(id_value) == str(self.edge_id): + self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.CLIENT_INDEX}"] = cur_index + 1 + break + client_objects = str(json.dumps(edges)) + client_objects = client_objects.replace(" ", "").replace("\n", "").replace('"', '\\"') + self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.CLIENT_OBJECT_LIST}"] = client_objects + self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.LOG_SERVER_URL}"] = self.agent_config["ml_ops_config"][ + "LOG_SERVER_URL" + ] + + def get_client_id_list(self, server_edge_id_list): + local_edge_id_list = list() + local_edge_id_list.append(int(self.edge_id)) + return local_edge_id_list + + @staticmethod + def unzip_file(zip_file, unzip_file_path) -> str: + unzipped_file_name = "" + if zipfile.is_zipfile(zip_file): + with zipfile.ZipFile(zip_file, "r") as zipf: + zipf.extractall(unzip_file_path) + # Make sure the unzipped file is a directory. + if zipf.namelist()[0].endswith("/"): + unzipped_file_name = zipf.namelist()[0] + else: + raise Exception("Invalid zip file {}".format(zip_file)) + + return unzipped_file_name + + def package_download_progress(self, count, blksize, filesize): + self.check_runner_stop_event() + + downloaded = count * blksize + downloaded = filesize if downloaded > filesize else downloaded + progress = (downloaded / filesize * 100) if filesize != 0 else 0 + progress_int = int(progress) + downloaded_kb = format(downloaded / 1024, '.2f') + + # Since this hook function is stateless, we need a state to avoid print progress repeatedly. + if count == 0: + self.prev_download_progress = 0 + if progress_int != self.prev_download_progress and progress_int % 5 == 0: + self.prev_download_progress = progress_int + logging.info("package downloaded size {} KB, progress {}%".format(downloaded_kb, progress_int)) + + def download_package_proc(self, package_url, local_package_file, completed_event, info_queue): + import requests + headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'} + user_agent_list = [ + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36', + ] + for _ in user_agent_list: + user_agent = random.choice(user_agent_list) + headers = {'User-Agent': user_agent} + + # Set the stream to true so that we can reduce the memory footprint when downloading large files. 
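The download path above streams the package in fixed-size chunks so large archives never have to fit in memory. A compact standalone sketch of the same pattern, with placeholder URL and path:

```python
import logging
import requests

CHUNK_SIZE = 1024 * 1024  # 1 MiB, the same granularity as the runner's iter_content() loop


def stream_download(url, dest_path, timeout=(10, 15)):
    """Download url to dest_path in chunks; returns total bytes written."""
    total = 0
    with requests.get(url, timeout=timeout, stream=True) as resp:
        resp.raise_for_status()
        with open(dest_path, "wb") as f:
            for chunk in resp.iter_content(CHUNK_SIZE):
                total += f.write(chunk)
                logging.info("downloaded %.2f KB", total / 1024)
    return total


# Placeholder values for illustration only.
# stream_download("https://example.com/fedml_run_package.zip", "/tmp/fedml_run_package.zip")
```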
+ request = requests.get(package_url, headers=headers, timeout=(10, 15), stream=True) + with open(local_package_file, 'wb') as f: + # 1024 * 1024 is 1MiB + download_size = 1024 * 1024 + total_size = 0 + for chunk in request.iter_content(download_size): + # Write the chunk to the file + written_size = f.write(chunk) + total_size += written_size + logging.info("package downloaded size %.2f KB", total_size/1024) + info_queue.put(time.time()) + completed_event.set() + + def retrieve_and_unzip_package(self, package_name, package_url): + local_package_path = self.agent_package_download_dir + os.makedirs(local_package_path, exist_ok=True) + filename, filename_without_extension, file_extension = GeneralConstants.get_filename_and_extension(package_url) + local_package_file = os.path.join( + local_package_path, f"fedml_run_{self.run_id}_{self.edge_id}_{filename_without_extension}") + if os.path.exists(local_package_file): + os.remove(local_package_file) + ssl._create_default_https_context = ssl._create_unverified_context + + # Open a process to download the package so that we can avoid the request is blocked and check the timeout. + from multiprocessing import Process + completed_event = multiprocessing.Event() + info_queue = multiprocessing.Queue() + download_process = Process(target=self.download_package_proc, + args=(package_url, local_package_file, completed_event, info_queue)) + download_process.start() + allowed_block_download_time = 60 + download_finished = False + download_time = time.time() + while True: + try: + queue_time = info_queue.get(block=False, timeout=3) + download_time = queue_time + except queue.Empty as e: + pass + + block_time = time.time() - download_time + if block_time > allowed_block_download_time: + break + + if completed_event.is_set(): + download_finished = True + break + time.sleep(3) + try: + if not download_finished: + download_process.terminate() + download_process.kill() + except Exception as e: + pass + + if not download_finished: + raise Exception("Download timeout, please check if your network is stable.") + + if not os.path.exists(local_package_file): + raise Exception(f"Failed to download, the zip file is not exist at {local_package_file}.") + + # Another method to async download. 
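+        # The commented-out block below is an alternative kept for reference: it downloads
+        # the package with urllib.request.urlretrieve under a global socket timeout and a
+        # bounded retry loop, instead of the multiprocessing-based watchdog used above.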
+ # import socket + # socket.setdefaulttimeout(15) + # try: + # urllib.request.urlretrieve(package_url, local_package_file, + # reporthook=self.package_download_progress) + # except socket.timeout: + # retry_count = 1 + # max_retry_num = 5 + # while retry_count <= max_retry_num: + # try: + # urllib.request.urlretrieve(package_url, local_package_file, + # reporthook=self.package_download_progress) + # break + # except socket.timeout: + # error_info = 'Retry %d time' % retry_count if retry_count == 1 else \ + # 'Reloading for %d times' % retry_count + # logging.info(error_info) + # retry_count += 1 + # if retry_count > max_retry_num: + # logging.error("Download failed.") + # raise Exception("Download failed") + + unzip_package_path = os.path.join(self.agent_package_unzip_dir, + f"unzip_fedml_run_{self.run_id}_{self.edge_id}_{filename_without_extension}") + try: + shutil.rmtree(unzip_package_path, ignore_errors=True) + except Exception as e: + logging.error( + f"Failed to remove directory {unzip_package_path}, Exception: {e}, Traceback: {traceback.format_exc()}") + pass + + # Using unzipped folder name + package_dir_name = FedMLSchedulerBaseJobRunner.unzip_file(local_package_file, unzip_package_path) + unzip_package_full_path = os.path.join(unzip_package_path, package_dir_name) + + logging.info("local_package_file {}, unzip_package_path {}, unzip file full path {}".format( + local_package_file, unzip_package_path, unzip_package_full_path)) + + return unzip_package_full_path + + @abstractmethod + def get_download_package_info(self, packages_config=None): + download_package_name = packages_config.get("server", None) if self.is_master_runner \ + else packages_config["linuxClient"] + download_package_url = packages_config.get("serverUrl", None) if self.is_master_runner \ + else packages_config["linuxClientUrl"] + return download_package_name, download_package_url + + def update_local_fedml_config(self, run_id, run_config): + # Download the package + packages_config = run_config["packages_config"] + download_package_name, download_package_url = self.get_download_package_info(packages_config) + unzip_package_path = self.retrieve_and_unzip_package(download_package_name, download_package_url) + fedml_local_config_file = os.path.join(unzip_package_path, "conf", "fedml.yaml") + + # Load the config file to memory + config_from_container = load_yaml_config(fedml_local_config_file) + container_entry_file_config = config_from_container["entry_config"] + container_dynamic_args_config = config_from_container["dynamic_args"] + entry_file = container_entry_file_config["entry_file"] + conf_file = container_entry_file_config["conf_file"] + self.package_type = container_entry_file_config.get("package_type", SchedulerConstants.JOB_PACKAGE_TYPE_DEFAULT) + full_conf_path = os.path.join(unzip_package_path, "fedml", "config", os.path.basename(conf_file)) + + # Dynamically build constrain variable with realtime parameters from server + self.build_dynamic_constrain_variables(run_id, run_config) + + # Update entry arguments value with constrain variable values with realtime parameters from server + # currently we support the following constrain variables: + # ${FEDSYS_RUN_ID}: a run id represented one entire Federated Learning flow + # ${FEDSYS_PRIVATE_LOCAL_DATA}: private local data path in the Federated Learning client + # ${FEDSYS_CLIENT_ID_LIST}: client list in one entire Federated Learning flow + # ${FEDSYS_SYNTHETIC_DATA_URL}: synthetic data url from server, + # if this value is not null, the client will download 
data from this URL to use it as + # federated training data set + # ${FEDSYS_IS_USING_LOCAL_DATA}: whether we use private local data as federated training data set + # container_dynamic_args_config["data_cache_dir"] = "${FEDSYS.PRIVATE_LOCAL_DATA}" + for constrain_variable_key, constrain_variable_value in self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES.items(): + for argument_key, argument_value in container_dynamic_args_config.items(): + if argument_value is not None and str(argument_value).find(constrain_variable_key) == 0: + replaced_argument_value = str(argument_value).replace( + constrain_variable_key, str(constrain_variable_value) + ) + container_dynamic_args_config[argument_key] = replaced_argument_value + + # Merge all container new config sections as new config dictionary + package_conf_object = dict() + package_conf_object["entry_config"] = container_entry_file_config + package_conf_object["dynamic_args"] = container_dynamic_args_config + package_conf_object["dynamic_args"]["config_version"] = self.args.config_version + container_dynamic_args_config["mqtt_config_path"] = os.path.join( + unzip_package_path, "fedml", "config", os.path.basename(container_dynamic_args_config["mqtt_config_path"]) + ) + container_dynamic_args_config["s3_config_path"] = os.path.join( + unzip_package_path, "fedml", "config", os.path.basename(container_dynamic_args_config["s3_config_path"]) + ) + log_file_dir = self.agent_log_file_dir + os.makedirs(log_file_dir, exist_ok=True) + package_conf_object["dynamic_args"]["log_file_dir"] = log_file_dir + + # Save new config dictionary to local file + fedml_updated_config_file = os.path.join(unzip_package_path, "conf", "fedml.yaml") + GeneralConstants.generate_yaml_doc(package_conf_object, fedml_updated_config_file) + + # Build dynamic arguments and set arguments to fedml config object + self.build_dynamic_args(run_id, run_config, package_conf_object, unzip_package_path) + + return unzip_package_path, package_conf_object + + def build_dynamic_args(self, run_id, run_config, package_conf_object, base_dir): + fedml_conf_file = package_conf_object["entry_config"]["conf_file"] + fedml_conf_file_processed = str(fedml_conf_file).replace('\\', os.sep).replace('/', os.sep) + fedml_conf_path = os.path.join(base_dir, "fedml", "config", + os.path.basename(fedml_conf_file_processed)) + fedml_conf_object = load_yaml_config(fedml_conf_path) + run_params = run_config.get("parameters", {}) + job_yaml = run_params.get("job_yaml", {}) + + # Replace local fedml config objects with parameters from MLOps web + parameters_object = run_config.get("parameters", None) + if parameters_object is not None: + for config_k, config_v in fedml_conf_object.items(): + parameter_v = parameters_object.get(config_k, None) + if parameter_v is not None: + fedml_conf_object[config_k] = parameter_v + parameters_object.pop(config_k) + + for config_k, config_v in parameters_object.items(): + fedml_conf_object[config_k] = config_v + + package_dynamic_args = package_conf_object["dynamic_args"] + if fedml_conf_object.get("comm_args", None) is not None: + fedml_conf_object["comm_args"]["mqtt_config_path"] = package_dynamic_args["mqtt_config_path"] + fedml_conf_object["comm_args"]["s3_config_path"] = package_dynamic_args["s3_config_path"] + fedml_conf_object["common_args"]["using_mlops"] = True + if fedml_conf_object.get("train_args", None) is not None: + fedml_conf_object["train_args"]["run_id"] = package_dynamic_args["run_id"] + fedml_conf_object["train_args"]["client_id_list"] = package_dynamic_args["client_id_list"] 
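+            # Both client_num_in_total and client_num_per_round are populated from the same
+            # dynamic argument below, which effectively assumes full client participation in
+            # every round.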
+ fedml_conf_object["train_args"]["client_num_in_total"] = int(package_dynamic_args["client_num_in_total"]) + fedml_conf_object["train_args"]["client_num_per_round"] = int(package_dynamic_args["client_num_in_total"]) + fedml_conf_object["train_args"]["client_id"] = self.edge_id + fedml_conf_object["train_args"]["server_id"] = self.request_json.get("server_id", "0") + if fedml_conf_object.get("device_args", None) is not None: + fedml_conf_object["device_args"]["worker_num"] = int(package_dynamic_args["client_num_in_total"]) + # fedml_conf_object["data_args"]["data_cache_dir"] = package_dynamic_args["data_cache_dir"] + data_args = fedml_conf_object.get("data_args") + if data_args is not None: + data_cache_dir = fedml_conf_object["data_args"].get("data_cache_dir") + if data_cache_dir is not None: + data_cache_dir = os.path.join(data_cache_dir, str(self.edge_id)) + fedml_conf_object["data_args"]["data_cache_dir"] = data_cache_dir + if fedml_conf_object.get("tracking_args", None) is not None: + fedml_conf_object["tracking_args"]["log_file_dir"] = package_dynamic_args["log_file_dir"] + fedml_conf_object["tracking_args"]["log_server_url"] = package_dynamic_args["log_server_url"] + + fedml_conf_object["dynamic_args"] = package_dynamic_args + self.fedml_config_object = fedml_conf_object.copy() + GeneralConstants.generate_yaml_doc(fedml_conf_object, fedml_conf_path) + + def callback_run_bootstrap(self, job_pid): + GeneralConstants.save_bootstrap_process(self.run_id, job_pid, data_dir=self.agent_data_dir) + + def run_bootstrap_script(self, bootstrap_cmd_list, bootstrap_script_file): + try: + logging.info("Bootstrap commands are being executed...") + process, error_list = GeneralConstants.execute_commands_with_live_logs( + bootstrap_cmd_list, callback=self.callback_run_bootstrap) + + ret_code, out, err = process.returncode, None, None + if ret_code is None or ret_code <= 0: + if error_list is not None and len(error_list) > 0: + is_bootstrap_run_ok = False + else: + if out is not None: + out_str = sys_utils.decode_our_err_result(out) + if out_str != "": + logging.info("{}".format(out_str)) + + sys_utils.log_return_info(bootstrap_script_file, 0) + + is_bootstrap_run_ok = True + else: + if err is not None: + err_str = sys_utils.decode_our_err_result(err) + if err_str != "": + logging.error("{}".format(err_str)) + + sys_utils.log_return_info(bootstrap_script_file, ret_code) + + is_bootstrap_run_ok = False + except Exception as e: + logging.error(f"Bootstrap script error: Exception: {e}, Traceback: {traceback.format_exc()}") + is_bootstrap_run_ok = False + return is_bootstrap_run_ok + + def check_runner_stop_event(self): + if self.run_process_event.is_set(): + logging.info("Received stopping event.") + raise RunnerError("Runner stopped") + + if self.run_process_completed_event.is_set(): + logging.info("Received completed event.") + raise RunnerCompletedError("Runner completed") + + def trigger_stop_event(self): + if self.run_process_event is not None: + self.run_process_event.set() + + time.sleep(1) + MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, self.edge_id) + + def trigger_completed_event(self): + if self.run_process_completed_event is not None: + self.run_process_completed_event.set() + + time.sleep(1) + MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, self.edge_id) + + def execute_job_task(self, unzip_package_path, entry_file_full_path, conf_file_full_path, dynamic_args_config, + fedml_config_object): + run_config = 
self.request_json["run_config"] + run_params = run_config.get("parameters", {}) + client_rank = self.request_json.get("client_rank", 1) + job_yaml = run_params.get("job_yaml", {}) + job_yaml_default_none = run_params.get("job_yaml", None) + job_api_key = job_yaml.get("run_api_key", None) + job_api_key = job_yaml.get("fedml_run_dynamic_params", None) if job_api_key is None else job_api_key + assigned_gpu_ids = run_params.get("gpu_ids", None) + job_type = job_yaml.get("job_type", None) + containerize = fedml_config_object.get("containerize", None) + image_pull_policy = fedml_config_object.get("image_pull_policy", Constants.IMAGE_PULL_POLICY_ALWAYS) + job_type = job_yaml.get("task_type", Constants.JOB_TASK_TYPE_TRAIN) if job_type is None else job_type + conf_file_object = load_yaml_config(conf_file_full_path) + entry_args_dict = conf_file_object.get("fedml_entry_args", {}) + entry_args = entry_args_dict.get("arg_items", None) + scheduler_match_info = self.request_json.get("scheduler_match_info", {}) + if job_type == Constants.JOB_TASK_TYPE_TRAIN: + containerize = True if containerize is None else containerize + + # Bootstrap Info + bootstrap_script_path, bootstrap_script_dir, bootstrap_script_file = [None] * 3 + env_args = fedml_config_object.get("environment_args", None) + + if env_args is not None: + bootstrap_script_file = env_args.get("bootstrap", None) + if bootstrap_script_file is not None: + bootstrap_script_file = str(bootstrap_script_file).replace('\\', os.sep).replace('/', os.sep) + if platform.system() == 'Windows': + bootstrap_script_file = bootstrap_script_file.rstrip('.sh') + '.bat' + if bootstrap_script_file is not None: + bootstrap_script_dir = os.path.join(unzip_package_path, "fedml", + os.path.dirname(bootstrap_script_file)) + bootstrap_script_path = os.path.join( + bootstrap_script_dir, bootstrap_script_dir, os.path.basename(bootstrap_script_file) + ) + + bootstrap_cmd_list = list() + if bootstrap_script_path: + logging.info("Bootstrap commands are being generated...") + bootstrap_cmd_list = JobRunnerUtils.generate_bootstrap_commands(bootstrap_script_path=bootstrap_script_path, + bootstrap_script_dir=bootstrap_script_dir, + bootstrap_script_file=bootstrap_script_file) + logging.info(f"Generated following Bootstrap commands: {bootstrap_cmd_list}") + + if not containerize: + if len(bootstrap_cmd_list) and not (job_type == Constants.JOB_TASK_TYPE_DEPLOY or + job_type == Constants.JOB_TASK_TYPE_SERVE): + bootstrapping_successful = self.run_bootstrap_script(bootstrap_cmd_list=bootstrap_cmd_list, + bootstrap_script_file=bootstrap_script_file) + + if not bootstrapping_successful: + logging.info("failed to update local fedml config.") + self.check_runner_stop_event() + # Send failed msg when exceptions. 
+ raise Exception(f"Failed to execute following bootstrap commands: {bootstrap_cmd_list}") + + logging.info("cleanup the previous learning process and bootstrap process...") + GeneralConstants.cleanup_learning_process(self.request_json["runId"], data_dir=self.agent_data_dir) + GeneralConstants.cleanup_bootstrap_process(self.request_json["runId"], data_dir=self.agent_data_dir) + + executable_interpreter = GeneralConstants.CLIENT_SHELL_PS \ + if platform.system() == GeneralConstants.PLATFORM_WINDOWS else GeneralConstants.CLIENT_SHELL_BASH + + if job_yaml_default_none is None: + # Generate the job executing commands for previous federated learning (Compatibility) + python_program = get_python_program() + rank = str(dynamic_args_config.get("rank", 1)) + role = "server" if rank == "0" else "client" + logging.info(f"Run the {role}: {python_program} {entry_file_full_path} --cf {conf_file_full_path} " + f"--rank {rank} --role {role}") + entry_command = f"{python_program} {entry_file_full_path} --cf " \ + f"{conf_file_full_path} --rank {rank} --role {role}" + shell_cmd_list = [entry_command] + + # Run the job executing commands for previous federated learning (Compatibility) + process, error_list = GeneralConstants.execute_commands_with_live_logs( + shell_cmd_list, callback=self.callback_start_fl_job, should_write_log_file=False) + is_launch_task = False + else: + self.check_runner_stop_event() + + if self.is_master_runner: + self.status_reporter.report_server_id_status( + self.run_id, GeneralConstants.MSG_MLOPS_SERVER_STATUS_RUNNING, edge_id=self.edge_id, + server_id=self.edge_id, server_agent_id=self.edge_id) + else: + self.status_reporter.report_client_id_status( + self.edge_id, GeneralConstants.MSG_MLOPS_CLIENT_STATUS_RUNNING, run_id=self.run_id) + + # Generate the job executing commands + job_executing_commands = JobRunnerUtils.generate_job_execute_commands( + self.run_id, self.edge_id, self.version, + self.package_type, executable_interpreter, entry_file_full_path, + conf_file_object, entry_args, assigned_gpu_ids, + job_api_key, client_rank, scheduler_match_info=scheduler_match_info, + cuda_visible_gpu_ids_str=self.cuda_visible_gpu_ids_str) + + if containerize is not None and containerize is True: + docker_args = fedml_config_object.get("docker", {}) + docker_args = JobRunnerUtils.create_instance_from_dict(DockerArgs, docker_args) + try: + job_executing_commands = JobRunnerUtils.generate_launch_docker_command( + docker_args=docker_args, run_id=self.run_id, edge_id=self.edge_id, + unzip_package_path=unzip_package_path, executable_interpreter=executable_interpreter, + entry_file_full_path=entry_file_full_path, bootstrap_cmd_list=bootstrap_cmd_list, + cuda_visible_gpu_ids_str=self.cuda_visible_gpu_ids_str, image_pull_policy=image_pull_policy) + except Exception as e: + logging.error(f"Error occurred while generating containerized launch commands. 
" + f"Exception: {e}, Traceback: {traceback.format_exc()}") + return None, None, None + + if not job_executing_commands: + raise Exception("Failed to generate docker execution command") + + # Run the job executing commands + logging.info(f"Run the client job with job id {self.run_id}, device id {self.edge_id}.") + process, error_list = GeneralConstants.execute_commands_with_live_logs( + job_executing_commands, callback=self.start_job_perf, error_processor=self.job_error_processor, + should_write_log_file=False if job_type == Constants.JOB_TASK_TYPE_FEDERATE else True) + is_launch_task = False if job_type == Constants.JOB_TASK_TYPE_FEDERATE else True + + return process, is_launch_task, error_list + + def callback_start_fl_job(self, job_pid): + GeneralConstants.save_learning_process(self.run_id, job_pid, data_dir=self.agent_data_dir) + self.mlops_metrics.report_sys_perf( + self.args, self.agent_config["mqtt_config"], job_process_id=job_pid) + + def start_job_perf(self, job_pid): + GeneralConstants.save_learning_process(self.run_id, job_pid, data_dir=self.agent_data_dir) + self.mlops_metrics.report_job_perf(self.args, self.agent_config["mqtt_config"], job_pid) + + def job_error_processor(self, error_list): + self.check_runner_stop_event() + + error_str = "\n".join(error_list) + error_message = f"Error occurred when running the job... {error_str}" + logging.error(error_message) + raise Exception(error_message) + + def start_runner_process( + self, run_id, edge_id, request_json, cuda_visible_gpu_ids_str=None, + sender_message_queue=None, status_center_queue=None + ): + return None + + @staticmethod + def cleanup_containers_and_release_gpus(run_id, edge_id, job_type=SchedulerConstants.JOB_TASK_TYPE_TRAIN): + # Check if the job type is not "serve" or "deploy" + if not (job_type == SchedulerConstants.JOB_TASK_TYPE_SERVE or + job_type == SchedulerConstants.JOB_TASK_TYPE_DEPLOY): + + # Check if docker client exists and then terminate containers. + if JobRunnerUtils.docker_client_exists(): + try: + # Terminate docker container. + docker_client = JobRunnerUtils.get_docker_client(DockerArgs()) + container_name = JobRunnerUtils.get_run_container_name(run_id) + logging.info(f"Terminating the run docker container {container_name} if exists...") + JobRunnerUtils.remove_run_container_if_exists(container_name, docker_client) + except Exception as e: + logging.error(f"Exception {e} occurred when terminating docker container. 
" + f"Traceback: {traceback.format_exc()}") + + # Release the GPU ids and update the GPU availability in the persistent store + JobRunnerUtils.get_instance().release_gpu_ids(run_id, edge_id) + + # Send mqtt message reporting the new gpu availability to the backend + MLOpsDevicePerfStats.report_gpu_device_info(edge_id) + + def rebuild_message_status_center(self, sender_message_queue, listener_message_queue, status_queue): + self.message_center = FedMLMessageCenter.rebuild_message_center_from_queue( + sender_message_queue, listener_message_queue=listener_message_queue) + if self.mlops_metrics is None: + self.mlops_metrics = MLOpsMetrics() + self.mlops_metrics.set_messenger(self.message_center) + self.mlops_metrics.run_id = self.run_id + + status_center = FedMLStatusCenter.rebuild_status_center_from_queue(status_queue) + if self.status_reporter is None: + self.status_reporter = MLOpsMetrics() + self.status_reporter.set_messenger(status_center) + self.status_reporter.run_id = self.run_id diff --git a/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner_manager.py b/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner_manager.py new file mode 100755 index 0000000000..0e30beeab4 --- /dev/null +++ b/python/fedml/computing/scheduler/scheduler_core/scheduler_base_job_runner_manager.py @@ -0,0 +1,70 @@ + +from abc import ABC, abstractmethod + + +class FedMLSchedulerBaseJobRunnerManager(ABC): + + def __init__(self): + if not hasattr(self, "job_runners"): + self.job_runners = dict() + if not hasattr(self, "cloud_run_process_map"): + self.cloud_run_process_map = dict() + + @abstractmethod + def _generate_job_runner_instance( + self, args, run_id=None, request_json=None, agent_config=None, edge_id=None + ): + return None + + def start_job_runner( + self, run_id, request_json, args=None, edge_id=None, is_server_job=False, + sender_message_queue=None, listener_message_queue=None, status_center_queue=None, + should_start_cloud_server=False, use_local_process_as_cloud_server=False, + cuda_visible_gpu_ids_str=None + ): + run_id_str = str(run_id) + self.job_runners[run_id_str] = self._generate_job_runner_instance( + args, run_id=run_id, request_json=request_json, + agent_config=args.agent_config, edge_id=edge_id, + ) + self.job_runners[run_id_str].start_runner_process( + run_id, request_json, edge_id=edge_id, + cuda_visible_gpu_ids_str=cuda_visible_gpu_ids_str, + sender_message_queue=sender_message_queue, + listener_message_queue=listener_message_queue, + status_center_queue=status_center_queue + ) + + def stop_job_runner(self, run_id): + run_id_str = str(run_id) + if self.job_runners.get(run_id_str, None) is not None: + self.job_runners[run_id_str].trigger_stop_event() + + def stop_all_job_runner(self): + for run_id, job_runner in self.job_runners.items(): + job_runner.trigger_stop_event() + + def complete_job_runner(self, run_id): + run_id_str = str(run_id) + if self.job_runners.get(run_id_str, None) is not None: + self.job_runners[run_id_str].trigger_completed_event() + + def put_run_edge_device_info_to_queue(self, run_id, edge_id, device_info): + for job_run_id, job_runner in self.job_runners.items(): + job_runner.put_run_edge_device_info_to_queue(run_id, edge_id, device_info) + + def get_runner_process(self, run_id, is_cloud_server=False): + run_id_str = str(run_id) + + if self.job_runners.get(run_id_str, None) is None: + return None + + return self.job_runners[run_id_str].run_process + + def get_all_runner_pid_map(self): + process_id_dict = dict() + for run_id, 
runner in self.job_runners.items(): + if runner.run_process is not None: + process_id_dict[str(run_id)] = runner.run_process.pid + + return process_id_dict diff --git a/python/fedml/computing/scheduler/scheduler_core/scheduler_base_protocol_manager.py b/python/fedml/computing/scheduler/scheduler_core/scheduler_base_protocol_manager.py new file mode 100755 index 0000000000..19bb7e9882 --- /dev/null +++ b/python/fedml/computing/scheduler/scheduler_core/scheduler_base_protocol_manager.py @@ -0,0 +1,271 @@ + +import json +import logging +import multiprocessing +import sys +import time +import traceback +import uuid +import fedml +from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog +from ....core.distributed.communication.mqtt.mqtt_manager import MqttManager +from ....core.mlops.mlops_metrics import MLOpsMetrics +from ..comm_utils import sys_utils +from ..scheduler_core.message_center import FedMLMessageCenter +from ..scheduler_core.status_center import FedMLStatusCenter +from .account_manager import FedMLAccountManager +from .general_constants import GeneralConstants +from abc import ABC, abstractmethod + + +class FedMLSchedulerBaseProtocolManager(FedMLMessageCenter, FedMLStatusCenter, ABC): + + def __init__(self, args, agent_config=None, is_master=False): + FedMLMessageCenter.__init__(self) + FedMLStatusCenter.__init__(self) + self.request_json = None + self.version = fedml.get_env_version() + self.args = args + self.is_master_agent = is_master + self.message_status_runner = None + self.message_center = None + self.status_center = None + self.message_center_name = "master_agent" if is_master else "slave_agent" + self.run_id = None + self.edge_id = args.edge_id + self.general_edge_id = None + self.server_agent_id = args.edge_id + self.current_device_id = args.current_device_id + self.unique_device_id = args.unique_device_id + self.agent_config = agent_config + self.topic_active = None + self.topic_last_will = None + self.communication_mgr = None + self.subscribed_topics = list() + self.mlops_metrics = None + self.status_reporter = None + self.user_name = args.user_name + + fedml._init_multiprocessing() + + def generate_topics(self): + # generate the subscribed topics. + self.subscribed_topics.clear() + # self.subscribed_topics.append(self.topic_start_train) + + def add_protocol_handler(self): + # Add the message listeners for all topics, the following is an example. + # self.add_message_listener(self.topic_start_train, self.callback_start_train) + pass + + def initialize(self): + # Generate the message topics + self.generate_topics() + + # Setup MQTT connection + self.communication_mgr = MqttManager( + self.agent_config["mqtt_config"]["BROKER_HOST"], + self.agent_config["mqtt_config"]["BROKER_PORT"], + self.agent_config["mqtt_config"]["MQTT_USER"], + self.agent_config["mqtt_config"]["MQTT_PWD"], + self.agent_config["mqtt_config"]["MQTT_KEEPALIVE"], + f"FedML_Agent_Daemon_@{self.user_name}@_@{self.current_device_id}@_@{str(uuid.uuid4())}@", + self.topic_last_will, + json.dumps({"ID": self.edge_id, "status": GeneralConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE}) + ) + + # Add the message listeners for all topics + self.add_protocol_handler() + + # Start the message center to process edge related messages. + self.setup_message_center() + + # Setup the message listener queue + self.setup_listener_message_queue() + + # Start the status center to process edge related status. 
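+        # The status center runs in its own process (see FedMLStatusCenter.start_status_center)
+        # and shares the message center's sender queue plus this agent's listener queue.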
+ self.start_status_listener_center() + + # Start the message center for listener + self.start_listener(sender_message_queue=self.message_center.get_sender_message_queue(), + agent_config=self.agent_config, + message_center_name=self.message_center_name) + + # Init extra items, e.g. database, recovery, etc. + self._init_extra_items() + + # Setup MQTT connected listener + self.communication_mgr.add_connected_listener(self.on_agent_communication_connected) + self.communication_mgr.add_disconnected_listener(self.on_agent_communication_disconnected) + self.communication_mgr.connect() + + def start(self): + # Start MQTT message loop + try: + self.communication_mgr.loop_forever() + except Exception as e: + if str(e) == "Restarting after upgraded...": + logging.info("Restarting after upgraded...") + else: + logging.info("Server tracing: {}".format(traceback.format_exc())) + + finally: + FedMLAccountManager.write_login_failed_file(is_client=not self.is_master_agent) + + self.stop() + + time.sleep(5) + sys_utils.cleanup_all_fedml_server_login_processes( + GeneralConstants.MASTER_LOGIN_PROGRAM if self.is_master_agent else GeneralConstants.SLAVE_LOGIN_PROGRAM, + clean_process_group=False) + sys.exit(1) + + def stop(self): + if self.communication_mgr is not None: + # noinspection PyBroadException + try: + for topic in self.subscribed_topics: + self.communication_mgr.unsubscribe_msg(topic) + except Exception: + pass + + self.communication_mgr.loop_stop() + self.communication_mgr.disconnect() + + self.release_message_center() + + @abstractmethod + def _init_extra_items(self): + pass + + def on_agent_communication_connected(self, mqtt_client_object): + # Setup MQTT message passthrough listener for all messages + self.communication_mgr.add_message_passthrough_listener(self.listener_message_passthrough_dispatch_center) + + # Subscribe topics for starting train, stopping train and fetching client status. + for topic in self.subscribed_topics: + self.communication_mgr.subscribe_msg(topic) + + # Broadcast the first active message. 
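+        # send_agent_active_msg publishes an {"ID": ..., "status": IDLE} payload on
+        # self.topic_active; the general edge id, when bound, is announced as well.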
+ self.send_agent_active_msg(self.edge_id) + if self.general_edge_id is not None: + self.send_agent_active_msg(self.general_edge_id) + + # Echo results + MLOpsRuntimeLog.get_instance(self.args).enable_show_log_to_stdout() + self.print_connected_info() + MLOpsRuntimeLog.get_instance(self.args).enable_show_log_to_stdout(enable=True) + + @abstractmethod + def print_connected_info(self): + print("\nCongratulations, your device is connected to the FedML MLOps platform successfully!") + print( + "Your FedML Edge ID is " + str(self.edge_id) + ", unique device ID is " + + str(self.unique_device_id) + ) + + def on_agent_communication_disconnected(self, mqtt_client_object): + pass + + def setup_message_center(self): + if self.message_center is not None: + return + + self.message_center = FedMLMessageCenter(agent_config=self.agent_config) + self.message_center.start_sender(message_center_name=self.message_center_name) + + if self.mlops_metrics is None: + self.mlops_metrics = MLOpsMetrics() + self.mlops_metrics.set_messenger(self) + self.mlops_metrics.run_id = self.run_id + self.mlops_metrics.edge_id = self.edge_id + self.mlops_metrics.server_agent_id = self.server_agent_id + + def send_message_json(self, topic, payload): + self.message_center.send_message_json(topic, payload) + + def rebuild_message_center(self, message_center_queue): + self.message_center = FedMLMessageCenter(sender_message_queue=message_center_queue) + + if self.mlops_metrics is None: + self.mlops_metrics = MLOpsMetrics() + self.mlops_metrics.set_messenger(self) + self.mlops_metrics.run_id = self.run_id + self.mlops_metrics.edge_id = self.edge_id + self.mlops_metrics.server_agent_id = self.server_agent_id + + def release_message_center(self): + try: + if self.message_center is not None: + self.message_center.stop() + self.message_center = None + + except Exception as e: + logging.error( + f"Failed to release slave communication manager with Exception {e}. 
" + f"Traceback: {traceback.format_exc()}") + pass + + def start_status_listener_center(self): + self.start_status_center( + sender_message_center_queue=self.message_center.get_sender_message_queue(), + listener_message_center_queue=self.get_listener_message_queue(), + is_slave_agent=not self.is_master_agent + ) + + if self.status_reporter is None: + self.status_reporter = MLOpsMetrics() + self.status_reporter.set_messenger(self, send_message_func=self.send_status_message) + self.status_reporter.run_id = self.run_id + self.status_reporter.edge_id = self.edge_id + self.status_reporter.server_agent_id = self.server_agent_id + + def rebuild_status_center(self, status_center_queue): + self.status_center = FedMLStatusCenter(message_queue=status_center_queue) + self.status_center.is_deployment_status_center = self.is_deployment_status_center + + if self.status_reporter is None: + self.status_reporter = MLOpsMetrics() + self.status_reporter.set_messenger(self.status_center, send_message_func=self.status_center.send_status_message) + self.status_reporter.run_id = self.run_id + self.status_reporter.edge_id = self.edge_id + self.status_reporter.server_agent_id = self.server_agent_id + + def generate_status_report(self, run_id, edge_id, server_agent_id=None): + status_reporter = MLOpsMetrics() + status_reporter.set_messenger(self, send_message_func=self.send_status_message) + status_reporter.run_id = run_id + status_reporter.edge_id = edge_id + if server_agent_id is not None: + status_reporter.server_agent_id = server_agent_id + return status_reporter + + @abstractmethod + def generate_protocol_manager(self): + # Generate the protocol manager instance and set the attribute values. + return None + + def get_message_runner(self): + if self.message_status_runner is not None: + return self.message_status_runner + + self.message_status_runner = self.generate_protocol_manager() + self.message_status_runner.status_queue = self.get_status_queue() + + return self.message_status_runner + + def get_status_runner(self): + if self.message_status_runner is None: + self.get_message_runner() + if self.message_status_runner is not None: + self.message_status_runner.sender_message_queue = self.message_center.get_sender_message_queue() + + if self.message_status_runner is not None: + self.message_status_runner.sender_message_queue = self.message_center.get_sender_message_queue() + return self.message_status_runner + + return None + + def send_agent_active_msg(self, edge_id): + active_msg = {"ID": edge_id, "status": GeneralConstants.MSG_MLOPS_SERVER_STATUS_IDLE} + self.message_center.send_message_json(self.topic_active, json.dumps(active_msg)) diff --git a/python/fedml/computing/scheduler/scheduler_core/status_center.py b/python/fedml/computing/scheduler/scheduler_core/status_center.py new file mode 100755 index 0000000000..97c2115e76 --- /dev/null +++ b/python/fedml/computing/scheduler/scheduler_core/status_center.py @@ -0,0 +1,372 @@ +import logging +import time + +from enum import Enum, unique +import multiprocessing +from multiprocessing import Process, Queue +import queue +from .message_common import FedMLMessageEntity, FedMLStatusEntity +from .message_center import FedMLMessageCenter +import traceback +from .status_manager_protocols import FedMLStatusManager + + +@unique +class JobStatus(Enum): + STATUS_OFFLINE = "OFFLINE" + STATUS_PROVISIONING = "PROVISIONING" + STATUS_IDLE = "IDLE" + UPGRADING = "UPGRADING" + STARTING = "STARTING" + STATUS_RUNNING = "RUNNING" + STATUS_STOPPING = "STOPPING" + STATUS_KILLED = 
"KILLED" + STATUS_FAILED = "FAILED" + STATUS_FINISHED = "FINISHED" + STATUS_EXCEPTION = "EXCEPTION" + + def __str__(self): + return self.value + + @classmethod + def get_job_enum_from_str(cls, job_status_str: str): + for job_status in cls: + if job_status.value == job_status_str: + return job_status + return cls.STATUS_OFFLINE + + @staticmethod + def is_job_completed(job_status_str: str): + if job_status_str == JobStatus.STATUS_FINISHED.value or \ + job_status_str == JobStatus.STATUS_FAILED.value or \ + job_status_str == JobStatus.STATUS_KILLED.value or \ + job_status_str == JobStatus.STATUS_EXCEPTION.value: + return True + + return False + + +@unique +class DeviceStatus(Enum): + STATUS_OFFLINE = "OFFLINE" + STATUS_PROVISIONING = "PROVISIONING" + STATUS_IDLE = "IDLE" + STATUS_UPGRADING = "UPGRADING" + STATUS_QUEUED = "QUEUED" + STATUS_INITIALIZING = "INITIALIZING" + STATUS_TRAINING = "TRAINING" + STATUS_RUNNING = "RUNNING" + STATUS_STOPPING = "STOPPING" + STATUS_KILLED = "KILLED" + STATUS_FAILED = "FAILED" + STATUS_EXCEPTION = "EXCEPTION" + STATUS_FINISHED = "FINISHED" + + def __str__(self): + return self.value + + @classmethod + def get_device_enum_from_str(cls, device_status_str: str): + for device_status in cls: + if device_status.value == device_status_str: + return device_status + return cls.STATUS_OFFLINE + + +class FedMLStatusCenter(object): + TOPIC_MASTER_STATUS_PREFIX = "fl_server/flserver_agent_" + TOPIC_SLAVE_STATUS_PREFIX = "fl_client/flclient_agent_" + TOPIC_SLAVE_STATUS_TO_MLOPS_PREFIX = "fl_run/fl_client/mlops/status" + TOPIC_SLAVE_JOB_LAUNCH_PREFIX = "flserver_agent/" + TOPIC_SLAVE_JOB_LAUNCH_SUFFIX = "/start_train" + TOPIC_SLAVE_JOB_STOP_PREFIX = "flserver_agent/" + TOPIC_SLAVE_JOB_STOP_SUFFIX = "/stop_train" + ALLOWED_MAX_JOB_STATUS_CACHE_NUM = 1000 + + def __init__(self, message_queue=None): + self.status_queue = message_queue + self.status_center_process = None + self.status_event = None + self.status_sender_message_center_queue = None + self.status_listener_message_center_queue = None + self.status_message_center = None + self.status_manager_instance = None + self.status_runner = None + self.is_deployment_status_center = False + + def __repr__(self): + return "<{klass} @{id:x} {attrs}>".format( + klass=self.__class__.__name__, + id=id(self) & 0xFFFFFF, + attrs=" ".join("{}={!r}".format(k, v) for k, v in self.__dict__.items()), + ) + + def get_status_runner(self): + return None + + def start_status_center(self, sender_message_center_queue=None, + listener_message_center_queue=None, is_slave_agent=False): + self.status_queue = Queue() + self.status_event = multiprocessing.Event() + self.status_event.clear() + self.status_sender_message_center_queue = sender_message_center_queue + self.status_listener_message_center_queue = listener_message_center_queue + self.status_runner = self.get_status_runner() + target_func = self.status_runner.run_status_dispatcher if not is_slave_agent else \ + self.status_runner.run_status_dispatcher_in_slave + self.status_center_process = Process( + target=target_func, args=( + self.status_event, self.status_queue, self.status_sender_message_center_queue, + self.status_listener_message_center_queue + ) + ) + + self.status_center_process.start() + + def check_message_stop_event(self): + if self.status_event is not None and self.status_event.is_set(): + logging.info("Received status center stopping event.") + raise StatusCenterStoppedException("Status center stopped (for sender)") + + def send_message(self, topic, payload, run_id=None): + 
message_entity = FedMLMessageEntity(topic=topic, payload=payload, run_id=run_id) + self.status_queue.put(message_entity.get_message_body()) + + def send_message_json(self, topic, payload): + self.send_message(topic, payload) + + def send_status_message(self, topic, payload): + message_entity = FedMLMessageEntity(topic=topic, payload=payload) + self.status_queue.put(message_entity.get_message_body()) + + def get_status_queue(self): + return self.status_queue + + def status_center_process_master_status(self, topic, payload): + pass + + def status_center_process_slave_status(self, topic, payload): + pass + + def rebuild_message_center(self, message_center_queue): + pass + + def rebuild_status_center(self, status_queue): + pass + + def run_status_dispatcher(self, status_event, status_queue, + sender_message_center_queue, + listener_message_center_queue): + # Save the parameters + self.status_event = status_event + self.status_queue = status_queue + self.status_sender_message_center_queue = sender_message_center_queue + self.status_listener_message_center_queue = listener_message_center_queue + + # Rebuild message center + message_center = None + if sender_message_center_queue is not None: + self.rebuild_message_center(sender_message_center_queue) + message_center = FedMLMessageCenter( + sender_message_queue=sender_message_center_queue, + listener_message_queue=listener_message_center_queue + ) + + if sender_message_center_queue is not None: + self.rebuild_status_center(status_queue) + + # Init status manager instances + status_manager_instances = dict() + + while True: + message_entity = None + + # Check if we should stop status dispatcher + try: + self.check_message_stop_event() + except StatusCenterStoppedException as e: + break + + # Dispatch status messages. + # noinspection PyBroadException + try: + # Get the status message from the queue + try: + message_body = status_queue.get(block=False, timeout=0.1) + except queue.Empty as e: # If queue is empty, then break loop + message_body = None + if message_body is None: + time.sleep(0.1) + continue + + # Build message and status entity + message_entity = FedMLMessageEntity(message_body=message_body) + status_entity = FedMLStatusEntity(status_msg_body=message_body) + + # Generate status manager instance + run_id_str = str(status_entity.run_id) + run_id_int = int(status_entity.run_id) + if status_manager_instances.get(run_id_str) is None: + if len(status_manager_instances.keys()) >= FedMLStatusCenter.ALLOWED_MAX_JOB_STATUS_CACHE_NUM: + for iter_run_id, iter_status_mgr in status_manager_instances.items(): + if iter_status_mgr.is_job_completed(): + status_manager_instances.pop(iter_run_id) + break + status_manager_instances[run_id_str] = FedMLStatusManager( + run_id=run_id_int, edge_id=status_entity.edge_id, + server_id=status_entity.server_id, status_center=self, + message_center=message_center) + else: + status_manager_instances[run_id_str].edge_id = status_entity.edge_id + if status_entity.server_id is not None and str(status_entity.server_id) != "0": + status_manager_instances[run_id_str].server_id = status_entity.server_id + + # if the job status is completed then continue + if status_manager_instances[run_id_str].is_job_completed(): + continue + + # Process the master and slave status. 
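+                # Routing is done purely by topic prefix: master topics
+                # (fl_server/flserver_agent_*) update the job-level status, while slave topics
+                # (fl_client/flclient_agent_*) update the per-device status; both are persisted
+                # through the run's FedMLStatusManager.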
+ if message_entity.topic.startswith(FedMLStatusCenter.TOPIC_MASTER_STATUS_PREFIX): + # Process the job status + status_manager_instances[run_id_str].status_center_process_master_status( + message_entity.topic, message_entity.payload) + + # Save the job status + status_manager_instances[run_id_str].save_job_status() + + elif message_entity.topic.startswith(FedMLStatusCenter.TOPIC_SLAVE_STATUS_PREFIX): + # Process the slave device status + status_manager_instances[run_id_str].status_center_process_slave_status( + message_entity.topic, message_entity.payload) + + # Save the device status in job + status_manager_instances[run_id_str].save_device_status_in_job(status_entity.edge_id) + + except Exception as e: + if message_entity is not None: + logging.info( + f"Failed to process the status with topic {message_entity.topic}, " + f"payload {message_entity.payload}, {traceback.format_exc()}") + else: + logging.info(f"Failed to process the status: {traceback.format_exc()}") + + def run_status_dispatcher_in_slave(self, status_event, status_queue, + sender_message_center_queue, + listener_message_center_queue): + # Save the parameters + self.status_event = status_event + self.status_queue = status_queue + self.status_sender_message_center_queue = sender_message_center_queue + self.status_listener_message_center_queue = listener_message_center_queue + + # Rebuild message center + message_center = None + if sender_message_center_queue is not None: + self.rebuild_message_center(sender_message_center_queue) + message_center = FedMLMessageCenter( + sender_message_queue=sender_message_center_queue, + listener_message_queue=listener_message_center_queue + ) + + if sender_message_center_queue is not None: + self.rebuild_status_center(status_queue) + + # Init status manager instances + status_manager_instances = dict() + job_launch_message_map = dict() + + while True: + message_entity = None + + # Check if we should stop status dispatcher + try: + self.check_message_stop_event() + except StatusCenterStoppedException as e: + break + + # Dispatch status messages. + # noinspection PyBroadException + try: + # Get the status message from the queue + try: + message_body = status_queue.get(block=False, timeout=0.1) + except queue.Empty as e: # If queue is empty, then break loop + message_body = None + if message_body is None: + time.sleep(0.1) + continue + + # Build message and status entity + message_entity = FedMLMessageEntity(message_body=message_body) + status_entity = FedMLStatusEntity(status_msg_body=message_body) + + # Generate status manager instance + run_id_str = str(status_entity.run_id) + run_id_int = int(status_entity.run_id) + if status_manager_instances.get(run_id_str) is None: + if len(status_manager_instances.keys()) >= FedMLStatusCenter.ALLOWED_MAX_JOB_STATUS_CACHE_NUM: + for iter_run_id, iter_status_mgr in status_manager_instances.items(): + if iter_status_mgr.is_job_completed(): + status_manager_instances.pop(iter_run_id) + break + + status_manager_instances[run_id_str] = FedMLStatusManager( + run_id=run_id_int, edge_id=status_entity.edge_id, status_center=self, + message_center=message_center) + else: + status_manager_instances[run_id_str].edge_id = status_entity.edge_id + + # Process the slave status + if message_entity.topic.startswith(FedMLStatusCenter.TOPIC_SLAVE_STATUS_PREFIX): + # Report the slave status to master + status_manager_instances[run_id_str]. 
\ + status_center_process_slave_status_to_master_in_slave_agent( + message_entity.topic, message_entity.payload + ) + elif message_entity.topic.startswith(FedMLStatusCenter.TOPIC_SLAVE_STATUS_TO_MLOPS_PREFIX): + # Report slave status to mlops (Active/IDLE message) + status_manager_instances[run_id_str]. \ + status_center_process_slave_status_to_mlops_in_slave_agent( + message_entity.topic, message_entity.payload + ) + elif (message_entity.topic.startswith(FedMLStatusCenter.TOPIC_SLAVE_JOB_LAUNCH_PREFIX) and + message_entity.topic.endswith(FedMLStatusCenter.TOPIC_SLAVE_JOB_LAUNCH_SUFFIX)): + pass + # Async request the job status from master when launching the job + # job_launch_message_map[run_id_str] = {"topic": message_entity.topic, + # "payload": message_entity.payload} + # status_manager_instances[run_id_str]. \ + # status_center_request_job_status_from_master_in_slave_agent( + # message_entity.topic, message_entity.payload + # ) + elif (message_entity.topic.startswith(FedMLStatusCenter.TOPIC_SLAVE_JOB_STOP_PREFIX) and + message_entity.topic.endswith(FedMLStatusCenter.TOPIC_SLAVE_JOB_STOP_SUFFIX)): + # Cleanup when stopped the job + if job_launch_message_map.get(run_id_str, None) is not None: + job_launch_message_map.pop(run_id_str) + + except Exception as e: + if message_entity is not None: + logging.info( + f"Failed to process the status with topic {message_entity.topic}, " + f"payload {message_entity.payload}, {traceback.format_exc()}") + else: + logging.info(f"Failed to process the status: {traceback.format_exc()}") + + def register_job_launch_message(self, topic, payload): + message_entity = FedMLMessageEntity(topic=topic, payload=payload) + self.status_queue.put(message_entity.get_message_body()) + + def register_job_stop_message(self, topic, payload): + message_entity = FedMLMessageEntity(topic=topic, payload=payload) + self.status_queue.put(message_entity.get_message_body()) + + @staticmethod + def rebuild_status_center_from_queue(status_queue): + status_center = FedMLStatusCenter(message_queue=status_queue) + return status_center + + +class StatusCenterStoppedException(Exception): + """ Status center stopped. 
""" + pass diff --git a/python/fedml/computing/scheduler/scheduler_core/status_manager_protocols.py b/python/fedml/computing/scheduler/scheduler_core/status_manager_protocols.py new file mode 100755 index 0000000000..e045458db5 --- /dev/null +++ b/python/fedml/computing/scheduler/scheduler_core/status_manager_protocols.py @@ -0,0 +1,376 @@ +import json +import logging +import os +import shutil +import time +from os import listdir + +from ....core.mlops.mlops_runtime_log_daemon import MLOpsRuntimeLogDaemon +from ....core.mlops.mlops_metrics import MLOpsMetrics +from ..slave.client_constants import ClientConstants +from ..master.server_constants import ServerConstants +from ..master.server_data_interface import FedMLServerDataInterface +from .message_common import LogArgs +from .general_constants import GeneralConstants +from ..scheduler_core.compute_cache_manager import ComputeCacheManager + + +class FedMLStatusManager(object): + def __init__(self, run_id=None, edge_id=None, server_id=None, + edge_id_list=None, running_scheduler_contract=None, + status_center=None, message_center=None): + self.run_id = run_id + self.edge_id = edge_id + self.server_id = server_id + self.edge_id_list = edge_id_list + self.edge_status_dict = None + self.running_scheduler_contract = running_scheduler_contract if running_scheduler_contract is not None else dict() + self.message_reporter = MLOpsMetrics() + self.message_reporter.set_messenger(message_center) + self.status_reporter = MLOpsMetrics() + self.status_reporter.set_messenger(status_center, send_message_func=status_center.send_status_message) + self.status_center = status_center + self.message_center = message_center + self.log_args = LogArgs(role="server", edge_id=self.edge_id, + server_id=self.server_id, log_file_dir=ServerConstants.get_log_file_dir()) + + self.job_status_in_slave = dict() + self.entire_job_status = None + self.job_status_in_master = dict() + self.slave_devices_status = dict() + self.master_devices_status = dict() + self.completed_job_status_list = [ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, + ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED, + ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED] + + def __repr__(self): + return "<{klass} @{id:x} {attrs}>".format( + klass=self.__class__.__name__, + id=id(self) & 0xFFFFFF, + attrs=" ".join("{}={!r}".format(k, v) for k, v in self.__dict__.items()), + ) + + def add_job_status_in_slave(self, device_id, status): + self.job_status_in_slave[device_id] = self._status_transition(status) + + def add_job_status_in_master(self, device_id, status): + self.job_status_in_master[device_id] = self._status_transition(status) + + def set_entire_job_status(self, status): + self.entire_job_status = status + + def add_slave_device_status(self, device_id, status): + self.slave_devices_status[device_id] = self._status_transition(status) + + def add_master_device_status(self, run_id, device_id, status): + self.master_devices_status[device_id] = self._status_transition(status) + + def get_job_status_in_slave(self, device_id): + return self.job_status_in_slave.get(device_id, None) + + def get_job_status_in_master(self, device_id): + return self.job_status_in_master.get(device_id, None) + + def get_entire_job_status(self): + return self.entire_job_status + + def get_slave_device_status(self, device_id): + return self.slave_devices_status.get(device_id, None) + + def get_master_device_status(self, device_id): + return self.master_devices_status.get(device_id, None) + + def is_job_completed(self): + if 
self.entire_job_status and self.entire_job_status in self.completed_job_status_list: + return True + return False + + def _status_transition(self, status): + transition_status = status + if self.entire_job_status is not None: + if self.entire_job_status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED or \ + self.entire_job_status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED: + if status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED or \ + status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED or \ + status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED: + transition_status = status + else: + transition_status = ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED + + return transition_status + + def save_job_status(self): + ComputeCacheManager.get_instance().set_redis_params() + ComputeCacheManager.get_instance().get_status_cache().save_job_status( + self.run_id, self.get_entire_job_status()) + + def save_device_status_in_job(self, device_id): + ComputeCacheManager.get_instance().set_redis_params() + ComputeCacheManager.get_instance().get_status_cache().save_device_status_in_job( + self.run_id, device_id, self.get_job_status_in_slave(device_id)) + + def process_job_completed_status(self, master_id, status): + # Stop the system performance monitor + try: + self.message_reporter.stop_sys_perf() + except Exception as ex: + pass + + # Stop the job process + ServerConstants.cleanup_learning_process(self.run_id) + ServerConstants.cleanup_bootstrap_process(self.run_id) + + # Remove the package download directory. + try: + local_package_path = ServerConstants.get_package_download_dir() + for package_file in listdir(local_package_path): + if os.path.basename(package_file).startswith("run_" + str(self.run_id)): + shutil.rmtree(os.path.join(local_package_path, package_file), ignore_errors=True) + except Exception as e: + pass + + # Stop log processor for current run + MLOpsRuntimeLogDaemon.get_instance(self.log_args).stop_log_processor(self.run_id, master_id) + + # RunProcessUtils.kill_process(cloud_server_process.pid) + # self.stop_cloud_server() + # self.remove_listener_for_run_metrics(self.run_id) + # self.remove_listener_for_run_logs(self.run_id) + self.message_center.receive_message( + GeneralConstants.get_topic_complete_job(master_id), + json.dumps(GeneralConstants.get_payload_complete_job(self.run_id, master_id))) + + if self.status_center.is_deployment_status_center and status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED: + self.report_deployment_status(self.run_id, GeneralConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED) + + def process_job_exception_status(self, master_id, status): + # Report exception job status + self.report_exception_status(status) + + # Save the job status to local storage + FedMLServerDataInterface.get_instance().save_job_status(self.run_id, master_id, status, status) + + def process_job_running_status(self, master_id, status): + self.message_reporter.report_server_training_status( + self.run_id, status, edge_id=master_id, running_json=self.running_scheduler_contract, update_db=False) + + def status_center_process_master_status(self, topic, payload): + request_json = json.loads(payload) + is_retain = request_json.get("is_retain", False) + if is_retain: + return + run_id = request_json["run_id"] + status = request_json["status"] + edge_id = request_json["edge_id"] + server_id = request_json.get("server_id", None) + if server_id is None or str(server_id) == "0": + server_id = self.server_id + run_id_str = str(run_id) + + # Process the job status + if 
status in (ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED, + ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, + ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED): + self.process_job_completed_status(server_id, status) + elif status == ServerConstants.MSG_MLOPS_SERVER_STATUS_EXCEPTION: + self.process_job_exception_status(server_id, status) + else: + self.process_job_running_status(server_id, status) + + # Process the consensus status + self.process_job_status_consensus(run_id, server_id, status) + + def process_job_status_consensus(self, run_id, master_id, status): + # Set the master status in the job and entire job status + self.set_entire_job_status(status) + self.add_job_status_in_master(master_id, status) + status = self.get_entire_job_status() + + # Set the device status based on the job status + for edge_id_item, edge_status_item in self.edge_status_dict.items(): + if edge_id_item == "server": + continue + + # Calc the device status based on the job status + consensus_device_status = FedMLStatusManager.get_device_consensus_status_in_job( + status, edge_status_item) + if consensus_device_status is not None: + self.message_reporter.report_client_training_status( + edge_id_item, consensus_device_status, run_id=run_id, update_db=False) + + # Save the job status to local storage + FedMLServerDataInterface.get_instance().save_job_status(run_id, master_id, status, status) + + # Report the status to message center + self.message_reporter.report_server_training_status(run_id, status, edge_id=master_id, update_db=False) + + # Broadcast the status to slave agents + self.message_reporter.report_job_status(run_id, status) + + @staticmethod + def get_device_consensus_status_in_job(job_status, device_status): + if job_status == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED: + if device_status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED or \ + device_status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED or \ + device_status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED: + return device_status + else: + return ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED + else: + return None + + def get_device_consensus_status_in_current_device(self, edge_id, status): + self.add_job_status_in_slave(edge_id, status) + consensus_status = self.get_job_status_in_slave(edge_id) + consensus_status = ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED \ + if consensus_status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION else consensus_status + return consensus_status + + def status_center_process_slave_status(self, topic, payload): + payload_json = json.loads(payload) + run_id = payload_json.get("run_id", None) + edge_id = payload_json.get("edge_id", None) + status = payload_json.get("status", None) + init_edge_id_list = payload_json.get("init_all_edge_id_list", None) + init_server_id = payload_json.get("init_server_id", None) + + if self.edge_status_dict is None: + self.edge_status_dict = dict() + + if init_edge_id_list is not None: + self.edge_status_dict[f"server"] = init_server_id + for edge_id_item in init_edge_id_list: + self.edge_status_dict[f"{edge_id_item}"] = \ + ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE + + if run_id is not None and edge_id is not None: + self.edge_status_dict[f"{edge_id}"] = status + + self.process_device_status(run_id, edge_id, status) + + def process_device_status(self, run_id, edge_id, status): + number_of_failed_edges = 0 + number_of_finished_edges = 0 + number_of_killed_edges = 0 + server_id = self.edge_status_dict.get("server", 0) + enable_fault_tolerance, 
fault_tolerance_rate = self.parse_fault_tolerance_params(run_id) + running_edges_list = list() + edge_nums = 0 + for edge_id_item, status_item in self.edge_status_dict.items(): + if edge_id_item == "server": + continue + + edge_nums += 1 + if status_item is None or status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED or \ + status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_EXCEPTION: + number_of_failed_edges += 1 + continue + + if status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED: + number_of_finished_edges += 1 + continue + + if status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED: + number_of_killed_edges += 1 + continue + + if status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_IDLE or \ + status_item == ServerConstants.MSG_MLOPS_SERVER_STATUS_OFFLINE: + continue + + running_edges_list.append(edge_id_item) + + # Report client status + consensus_status = self.get_device_consensus_status_in_current_device(edge_id, status) + self.message_reporter.report_client_training_status(edge_id, consensus_status, run_id=run_id, update_db=False) + + # Report server status based on the fault tolerance model and parameters + if edge_nums <= 0: + return + status_to_report = self.calculate_server_status( + run_id, edge_nums, number_of_failed_edges, number_of_finished_edges, number_of_killed_edges, + running_edges_list, enable_fault_tolerance=enable_fault_tolerance, + fault_tolerance_rate=fault_tolerance_rate) + if status_to_report is not None: + logging.info(f"Run completed when processing edge status, will report status {status_to_report}") + self.report_server_status(run_id, server_id, server_id, status_to_report) + + def calculate_server_status( + self, run_id, total_edge_nums, number_of_failed_edges, number_of_finished_edges, + number_of_killed_edges, running_edges_list, enable_fault_tolerance=False, + fault_tolerance_rate=0.8 + ): + # Report server status based on the fault tolerance model and parameters + actual_failed_rate = number_of_failed_edges / total_edge_nums + all_edges_run_completed = True if len(running_edges_list) <= 0 else False + if all_edges_run_completed: + status_to_report = None + if enable_fault_tolerance: + if actual_failed_rate >= fault_tolerance_rate: + status_to_report = ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED + self.report_exception_status( + running_edges_list, run_id=run_id, status=status_to_report) + return status_to_report + else: + if number_of_killed_edges == total_edge_nums: + status_to_report = ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED + else: + status_to_report = ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED + else: + if number_of_failed_edges > 0: + status_to_report = ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED + elif number_of_finished_edges == total_edge_nums: + status_to_report = ServerConstants.MSG_MLOPS_SERVER_STATUS_FINISHED + elif number_of_killed_edges == total_edge_nums: + status_to_report = ServerConstants.MSG_MLOPS_SERVER_STATUS_KILLED + + return status_to_report + + def parse_fault_tolerance_params(self, run_id): + run_json = self.running_scheduler_contract.get(str(run_id), None) + if run_json is None: + return False, 0 + run_config = run_json.get("run_config", {}) + run_params = run_config.get("parameters", {}) + common_args = run_params.get("common_args", {}) + enable_fault_tolerance = common_args.get("enable_fault_tolerance", False) + fault_tolerance_rate = common_args.get("fault_tolerance_rate", 0) + return enable_fault_tolerance, fault_tolerance_rate + + def report_server_status(self, run_id, 
edge_id, server_id, status): + self.status_reporter.report_server_id_status( + run_id, status, edge_id=edge_id, server_id=server_id, server_agent_id=server_id, update_db=False) + + def report_exception_status(self, status): + self.message_reporter.report_job_status(self.run_id, status) + + def status_center_process_slave_status_to_master_in_slave_agent(self, topic, payload): + # Forward the status message to the sender queue of message center. + self.message_center.send_message(topic, payload) + + def status_center_process_slave_status_to_mlops_in_slave_agent(self, topic, payload): + # Forward the status message to message center. + self.message_center.send_message(topic, payload) + + def status_center_request_job_status_from_master_in_slave_agent(self, topic, payload): + # Parse the parameters + payload_json = json.loads(payload) + run_id = payload_json.get("run_id", None) + master_id = payload_json.get("master_id", None) + edge_id = payload_json.get("edge_id", None) + + # Request the job status from master agent. + topic_request_job_status = f"{GeneralConstants.MSG_TOPIC_REQUEST_JOB_STATUS_PREFIX}{master_id}" + payload_request_job_status = {"run_id": run_id, "edge_id": edge_id} + self.message_center.send_message(topic_request_job_status, json.dumps(payload_request_job_status)) + + def report_deployment_status(self, run_id, status): + deployment_status_topic = "model_ops/model_device/return_deployment_status" + deployment_status_payload = {"end_point_id": run_id, "end_point_name": "", + "model_name": "", + "model_url": "", + "model_status": status, + "timestamp": int(format(time.time_ns() / 1000.0, '.0f'))} + logging.info(f"[StatusCenter] deployment_status_payload is sent to mlops: {deployment_status_payload}") + + self.message_center.send_message_json(deployment_status_topic, json.dumps(deployment_status_payload)) diff --git a/python/fedml/computing/scheduler/scheduler_core/task_protocol_manager.py b/python/fedml/computing/scheduler/scheduler_core/task_protocol_manager.py deleted file mode 100755 index ddf4bb9b6e..0000000000 --- a/python/fedml/computing/scheduler/scheduler_core/task_protocol_manager.py +++ /dev/null @@ -1,24 +0,0 @@ - - -class TaskProtocolManager(object): - def __init__(self): - pass - - def log_metrics(self): - # Build the message for logging metrics - - # Send the message to MQTT server - - pass - - def log_model(self): - pass - - def log_artifacts_log(self): - pass - - def log_artifacts(self): - pass - - - diff --git a/python/fedml/computing/scheduler/scheduler_entry/README.md b/python/fedml/computing/scheduler/scheduler_entry/README.md index 0d1da81950..41d32ff399 100644 --- a/python/fedml/computing/scheduler/scheduler_entry/README.md +++ b/python/fedml/computing/scheduler/scheduler_entry/README.md @@ -132,7 +132,7 @@ You just need to customize the following config items. 3. `bootstrap`, It is the bootstrap shell command which will be executed before running entry commands. -Then you can use the following example CLI to launch the job at FedML® Nexus AI Platform +Then you can use the following example CLI to launch the job at TensorOpera® Nexus AI Platform (Replace $YourApiKey with your own account API key from open.fedml.ai) Example: @@ -142,7 +142,7 @@ fedml launch hello_job.yaml After the launch CLI is executed, the output is as follows. Here you may open the job url to confirm and actually start the job. 
``` -Submitting your job to FedML® Nexus AI Platform: 100%|████████████████████████████████████████████████████████████████████████████████████████| 6.07k/6.07k [00:01<00:00, 4.94kB/s] +Submitting your job to TensorOpera® Nexus AI Platform: 100%|████████████████████████████████████████████████████████████████████████████████████████| 6.07k/6.07k [00:01<00:00, 4.94kB/s] Searched and matched the following GPU resource for your job: +-----------+-------------------+---------+------------+-------------------------+---------+-------+----------+ diff --git a/python/fedml/computing/scheduler/scheduler_entry/app_manager.py b/python/fedml/computing/scheduler/scheduler_entry/app_manager.py index 91b5ff64cc..267db08901 100755 --- a/python/fedml/computing/scheduler/scheduler_entry/app_manager.py +++ b/python/fedml/computing/scheduler/scheduler_entry/app_manager.py @@ -278,7 +278,7 @@ def push_app_package_to_s3(self, app_name, app_package_path): app_storage_url = s3_storage.upload_file_with_progress(app_package_path, app_dst_key, out_progress_to_err=True, progress_desc="Submitting your job to " - "FedML® Nexus AI Platform") + "TensorOpera® Nexus AI Platform") return app_storage_url def pull_app_package_from_s3(self, model_storage_url, model_name): @@ -315,7 +315,7 @@ def push_model_to_s3(self, model_name, model_zip_path): return FedMLModelCards.get_instance().push_model_to_s3( model_name, model_zip_path, "FedMLLaunchServe", show_progress=False, - progress_desc="Submitting your job to FedML® Nexus AI Platform") + progress_desc="Submitting your job to TensorOpera® Nexus AI Platform") def check_model_package(self, workspace): model_config_file = os.path.join( diff --git a/python/fedml/computing/scheduler/scheduler_entry/run_manager.py b/python/fedml/computing/scheduler/scheduler_entry/run_manager.py index 84fe109054..b91935e7b2 100755 --- a/python/fedml/computing/scheduler/scheduler_entry/run_manager.py +++ b/python/fedml/computing/scheduler/scheduler_entry/run_manager.py @@ -162,10 +162,10 @@ def __init__(self, run_log_list_json): self.log_devices = list() for log_dev in log_devices_json: self.log_devices.append(FedMLRunLogDeviceModel(log_dev)) - self.total_num = run_log_list_json.get("total_num", 0) - self.total_pages = run_log_list_json.get("total_pages", 0) - self.current_page = run_log_list_json.get("current_page", 0) - self.log_lines = run_log_list_json.get("logs", []) + self.total_num = run_log_list_json.get("totalSize", 0) + self.total_pages = run_log_list_json.get("totalPages", 0) + self.current_page = run_log_list_json.get("pageNum", 0) + self.log_lines = run_log_list_json.get("logList", []) class FedMLRunLogDeviceModel(object): @@ -277,7 +277,6 @@ def get_run_logs(self, run_id: str, page_num: int, page_size: int, user_api_key: run_log_list_result = None run_logs_json = { "apiKey": user_api_key, - "edgeId": "-1", "pageNum": page_num, "pageSize": page_size, "runId": run_id, diff --git a/python/fedml/computing/scheduler/slave/base_slave_agent.py b/python/fedml/computing/scheduler/slave/base_slave_agent.py new file mode 100755 index 0000000000..a149dfe046 --- /dev/null +++ b/python/fedml/computing/scheduler/slave/base_slave_agent.py @@ -0,0 +1,141 @@ + +import json +import os +from ..comm_utils import sys_utils +from ..comm_utils.run_process_utils import RunProcessUtils +from ..comm_utils.sys_utils import get_python_program +from ....core.mlops import MLOpsRuntimeLog, MLOpsMetrics +from .client_data_interface import ClientConstants +from ..scheduler_core.account_manager import FedMLAccountManager 
+from ..scheduler_core.general_constants import GeneralConstants +from abc import ABC, abstractmethod + + +class FedMLBaseSlaveAgent(ABC): + CLIENT_API_CMD = "fedml.computing.scheduler.slave.client_api:api" + + def __init__(self): + self.agent_args = None + self.local_api_process = None + self.process = None + self.cur_dir = os.path.split(os.path.realpath(__file__))[0] + self.mlops_metrics = MLOpsMetrics() + self.protocol_mgr = None + + def login( + self, userid, api_key=None, device_id=None, + os_name=None, need_to_check_gpu=False, role=None, + marketplace_type=None, price_per_hour=None, name="" + ): + # Preprocess the login args + if need_to_check_gpu: + gpu_count, _ = sys_utils.get_gpu_count_vendor() + if gpu_count <= 0: + print("We can't find any gpu device on your machine. \n" + "With the gpu_supplier(-g) option, you need to check if your machine " + "has nvidia GPUs and installs CUDA related drivers.") + return + + # Login account + login_result = FedMLAccountManager.get_instance().login( + userid, api_key=api_key, device_id=device_id, + os_name=os_name, role=role, marketplace_type=marketplace_type, + price_per_hour=price_per_hour, name=name + ) + if login_result is not None: + self.agent_args = login_result + else: + return None + + # Save the bound info + self._save_agent_info(login_result.current_device_id + "." + login_result.os_name, login_result.edge_id) + + # Init the logs for protocol manager + self._init_logs(login_result, login_result.edge_id) + + # Create the protocol manager to communicate with the slave agents and MLOps. + self._create_protocol_manager(login_result) + + # Initialize the protocol manager + # noinspection PyBoardException + try: + self._initialize_protocol_manager() + except Exception as e: + FedMLAccountManager.write_login_failed_file(is_client=True) + self.protocol_mgr.stop() + raise e + + # Start the protocol manager to process the messages from MLOps and slave agents. 
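+        # NOTE: start() is expected to drive the protocol manager's message loop,
+        # so login() typically only falls through to return login_result after the
+        # agent is stopped (e.g. on logout); treat this call as long-running.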
+ self.protocol_mgr.start() + + return login_result + + @staticmethod + def logout(): + GeneralConstants.cleanup_run_process(None) + sys_utils.cleanup_all_fedml_client_api_processes() + + def _create_protocol_manager(self, login_result): + if self.protocol_mgr is not None: + return + self.protocol_mgr = self._generate_protocol_manager_instance( + login_result, agent_config=login_result.agent_config) + self.protocol_mgr.args = login_result + self.protocol_mgr.edge_id = login_result.edge_id + self.protocol_mgr.unique_device_id = login_result.unique_device_id + self.protocol_mgr.user_name = login_result.user_name + self.protocol_mgr.agent_config = login_result.agent_config + + def _initialize_protocol_manager(self): + # Init local database + self._init_database() + + # Initialize the master protocol + self.protocol_mgr.initialize() + + # Start the client API process + self._start_slave_api() + + def _init_logs(self, login_result, edge_id): + # Init runtime logs + in_args = login_result + in_args.log_file_dir = self._get_log_file_dir() + in_args.run_id = 0 + in_args.role = "client" + client_ids = list() + client_ids.append(edge_id) + in_args.client_id_list = json.dumps(client_ids) + in_args.using_mlops = True + MLOpsRuntimeLog.get_instance(in_args).init_logs() + + def _start_slave_api(self): + # Start the local API services + client_api_cmd = FedMLBaseSlaveAgent.CLIENT_API_CMD + client_api_pids = RunProcessUtils.get_pid_from_cmd_line(client_api_cmd) + if client_api_pids is None or len(client_api_pids) <= 0: + python_program = get_python_program() + cur_dir = os.path.dirname(__file__) + fedml_base_dir = os.path.dirname(os.path.dirname(os.path.dirname(cur_dir))) + self.local_api_process = ClientConstants.exec_console_with_script( + "{} -m uvicorn {} --host 0.0.0.0 --port {} " + "--reload --reload-delay 3 --reload-dir {} --log-level critical".format( + python_program, client_api_cmd, ClientConstants.LOCAL_CLIENT_API_PORT, fedml_base_dir), + should_capture_stdout=False, + should_capture_stderr=False + ) + + @abstractmethod + def _get_log_file_dir(self): + pass + + @abstractmethod + def _save_agent_info(self, unique_device_id, edge_id): + pass + + @abstractmethod + def _init_database(self): + pass + + @abstractmethod + def _generate_protocol_manager_instance(self, args, agent_config=None): + return None diff --git a/python/fedml/computing/scheduler/slave/base_slave_job_runner.py b/python/fedml/computing/scheduler/slave/base_slave_job_runner.py new file mode 100755 index 0000000000..9ea2c4beaf --- /dev/null +++ b/python/fedml/computing/scheduler/slave/base_slave_job_runner.py @@ -0,0 +1,268 @@ +import json +import logging +import multiprocessing +import os +import platform +import time +import traceback +from abc import ABC, abstractmethod + +from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog +from ....core.mlops.mlops_runtime_log_daemon import MLOpsRuntimeLogDaemon +from .client_data_interface import FedMLClientDataInterface +from ..comm_utils import sys_utils +from ....core.mlops.mlops_utils import MLOpsUtils +from multiprocessing import Process +from ..scheduler_core.scheduler_base_job_runner import FedMLSchedulerBaseJobRunner, RunnerError, RunnerCompletedError +from ..scheduler_core.general_constants import GeneralConstants +from ..comm_utils.job_utils import JobRunnerUtils + + +class FedMLBaseSlaveJobRunner(FedMLSchedulerBaseJobRunner, ABC): + + def __init__(self, args, edge_id=0, request_json=None, agent_config=None, run_id=0, + cuda_visible_gpu_ids_str=None, + agent_data_dir=None, 
agent_package_download_dir=None, + agent_package_unzip_dir=None, agent_log_file_dir=None): + FedMLSchedulerBaseJobRunner.__init__( + self, args, edge_id=edge_id, request_json=request_json, agent_config=agent_config, run_id=run_id, + cuda_visible_gpu_ids_str=cuda_visible_gpu_ids_str, agent_data_dir=agent_data_dir, + agent_package_download_dir=agent_package_download_dir, + agent_package_unzip_dir=agent_package_unzip_dir, + agent_log_file_dir=agent_log_file_dir + ) + + self.fedml_data_base_package_dir = os.path.join("/", "fedml", "data") + self.fedml_data_local_package_dir = os.path.join("/", "fedml", "fedml-package", "fedml", "data") + self.fedml_data_dir = self.fedml_data_base_package_dir + self.fedml_config_dir = os.path.join("/", "fedml", "conf") + self.run_extend_queue_list = None + self.computing_started_time = 0 + + def __repr__(self): + return "<{klass} @{id:x} {attrs}>".format( + klass=self.__class__.__name__, + id=id(self) & 0xFFFFFF, + attrs=" ".join("{}={!r}".format(k, v) for k, v in self.__dict__.items()), + ) + + def run(self, process_event, completed_event, run_extend_queue_list, + sender_message_center, listener_message_queue, status_center_queue): + print(f"Client runner process id {os.getpid()}, run id {self.run_id}") + + if platform.system() != "Windows": + os.setsid() + + os.environ['PYTHONWARNINGS'] = 'ignore:semaphore_tracker:UserWarning' + os.environ.setdefault('PYTHONWARNINGS', 'ignore:semaphore_tracker:UserWarning') + + self.run_process_event = process_event + self.run_process_completed_event = completed_event + try: + MLOpsUtils.set_ntp_offset(self.ntp_offset) + self.rebuild_message_status_center(sender_message_center, listener_message_queue, status_center_queue) + self.run_impl(run_extend_queue_list, sender_message_center, listener_message_queue, status_center_queue) + except RunnerError: + logging.info("Runner stopped.") + self.reset_devices_status(self.edge_id, GeneralConstants.MSG_MLOPS_CLIENT_STATUS_KILLED) + except RunnerCompletedError: + logging.info("Runner completed.") + except Exception as e: + logging.error(f"Runner exited with errors. 
Exception: {e}, Traceback {traceback.format_exc()}") + self.status_reporter.report_client_id_status( + self.edge_id, GeneralConstants.MSG_MLOPS_CLIENT_STATUS_FAILED, + is_from_model=self.is_deployment_runner, server_id=self.server_id, run_id=self.run_id) + finally: + if self.mlops_metrics is not None: + computing_ended_time = MLOpsUtils.get_ntp_time() + self.mlops_metrics.report_edge_job_computing_cost(self.run_id, self.edge_id, + self.computing_started_time, computing_ended_time, + self.args.account_id, self.args.api_key) + logging.info("Release resources.") + job_type = JobRunnerUtils.parse_job_type(self.request_json) + FedMLSchedulerBaseJobRunner.cleanup_containers_and_release_gpus(self.run_id, self.edge_id, job_type) + MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, self.edge_id) + if self.mlops_metrics is not None: + self.mlops_metrics.stop_sys_perf() + time.sleep(3) + GeneralConstants.cleanup_learning_process(self.run_id) + GeneralConstants.cleanup_run_process(self.run_id) + + @abstractmethod + def run_impl(self, run_extend_queue_list, sender_message_center, + listener_message_queue, status_center_queue): + run_id = self.request_json["runId"] + run_config = self.request_json["run_config"] + data_config = run_config.get("data_config", {}) + packages_config = run_config["packages_config"] + + self.computing_started_time = MLOpsUtils.get_ntp_time() + self.mlops_metrics.report_edge_job_computing_cost(run_id, self.edge_id, + self.computing_started_time, 0, + self.args.account_id, self.args.api_key) + + self.check_runner_stop_event() + + MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) + + self.status_reporter.report_client_id_status( + self.edge_id, GeneralConstants.MSG_MLOPS_CLIENT_STATUS_INITIALIZING, + is_from_model=self.is_deployment_runner, running_json=json.dumps(self.request_json), run_id=run_id) + + # get training params + private_local_data_dir = data_config.get("privateLocalData", "") + is_using_local_data = 0 + # if private_local_data_dir is not None and len(str(private_local_data_dir).strip(' ')) > 0: + # is_using_local_data = 1 + + # start a run according to the hyper-parameters + # fedml_local_data_dir = self.cur_dir + "/fedml_data/run_" + run_id_str + "_edge_" + str(edge_id) + fedml_local_data_dir = os.path.join(self.cur_dir, "fedml_data") + fedml_local_config_dir = os.path.join(self.cur_dir, "fedml_config") + if is_using_local_data: + fedml_local_data_dir = private_local_data_dir + self.fedml_data_dir = self.fedml_data_local_package_dir + + self.check_runner_stop_event() + + logging.info("Download packages") + + # update local config with real time parameters from server and dynamically replace variables value + unzip_package_path, fedml_config_object = self.update_local_fedml_config(run_id, run_config) + # if unzip_package_path is None or fedml_config_object is None: + # logging.info("failed to update local fedml config.") + # self.check_runner_stop_event() + # # Send failed msg when exceptions. 
+ # self.cleanup_run_when_starting_failed(status=GeneralConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION) + # return + + logging.info("Check downloaded packages...") + + entry_file_config = fedml_config_object.get("entry_config", None) + dynamic_args_config = fedml_config_object.get("dynamic_args", None) + entry_file = str(entry_file_config["entry_file"]).replace('\\', os.sep).replace('/', os.sep) + entry_file = os.path.basename(entry_file) + conf_file = entry_file_config["conf_file"] + conf_file = str(conf_file).replace('\\', os.sep).replace('/', os.sep) + ##### + # GeneralConstants.cleanup_learning_process(run_id) + # GeneralConstants.cleanup_bootstrap_process(run_id) + ##### + + if not os.path.exists(unzip_package_path): + logging.info("failed to unzip file.") + self.check_runner_stop_event() + return + os.chdir(os.path.join(unzip_package_path, "fedml")) + + self.check_runner_stop_event() + + logging.info("starting the user process...") + + entry_file_full_path = os.path.join(unzip_package_path, "fedml", entry_file) + conf_file_full_path = os.path.join(unzip_package_path, "fedml", conf_file) + logging.info("waiting the user process to finish...") + logging.info(" ") + logging.info(" ") + logging.info("====Your Run Logs Begin===") + + process, is_launch_task, error_list = self.execute_job_task( + unzip_package_path=unzip_package_path, entry_file_full_path=entry_file_full_path, + conf_file_full_path=conf_file_full_path, dynamic_args_config=dynamic_args_config, + fedml_config_object=self.fedml_config_object) + + logging.info("====Your Run Logs End===") + logging.info(" ") + logging.info(" ") + + ret_code, out, err = process.returncode if process else None, None, None + is_run_ok = sys_utils.is_runner_finished_normally(process.pid) + if is_launch_task: + is_run_ok = True + if error_list is not None and len(error_list) > 0: + is_run_ok = False + if ret_code is None or ret_code <= 0: + self.check_runner_stop_event() + + if is_run_ok: + if out is not None: + out_str = sys_utils.decode_our_err_result(out) + if out_str != "": + logging.info("{}".format(out_str)) + + self.status_reporter.report_client_id_status( + self.edge_id, GeneralConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED, + is_from_model=self.is_deployment_runner, server_id=self.server_id, run_id=run_id) + + if is_launch_task: + sys_utils.log_return_info(f"job {run_id}", ret_code) + else: + sys_utils.log_return_info(entry_file, ret_code) + else: + is_run_ok = False + + if not is_run_ok: + # If the run status is killed or finished, then return with the normal state. + current_job = FedMLClientDataInterface.get_instance().get_job_by_id(run_id) + if current_job is not None and (current_job.status == GeneralConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED or + current_job.status == GeneralConstants.MSG_MLOPS_CLIENT_STATUS_KILLED): + return + + self.check_runner_stop_event() + + logging.error("failed to run the learning process...") + + if err is not None: + err_str = sys_utils.decode_our_err_result(err) + if err_str != "": + logging.error("{}".format(err_str)) + + if is_launch_task: + sys_utils.log_return_info(f"job {run_id}", ret_code) + else: + sys_utils.log_return_info(entry_file, ret_code) + + # Send failed msg when exceptions. 
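+            # Report a FAILED status for this edge (with the server id and run id)
+            # so the run can be marked failed upstream; this mirrors the FINISHED
+            # report in the success branch above.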
+ self.status_reporter.report_client_id_status( + self.edge_id, GeneralConstants.MSG_MLOPS_CLIENT_STATUS_FAILED, + is_from_model=self.is_deployment_runner, server_id=self.server_id, run_id=run_id) + + @abstractmethod + def _generate_job_runner_instance(self, args, run_id=None, request_json=None, agent_config=None, edge_id=None): + return None + + @abstractmethod + def _generate_extend_queue_list(self): + return list() + + def reset_devices_status(self, edge_id, status): + self.status_reporter.run_id = self.run_id + self.status_reporter.edge_id = edge_id + self.status_reporter.report_client_id_status( + edge_id, status, is_from_model=self.is_deployment_runner, server_id=self.server_id, run_id=self.run_id) + + def start_runner_process( + self, run_id, request_json, edge_id=None, + sender_message_queue=None, listener_message_queue=None, + status_center_queue=None, cuda_visible_gpu_ids_str=None + ): + client_runner = self._generate_job_runner_instance( + self.args, run_id=run_id, request_json=request_json, + agent_config=None, edge_id=edge_id + ) + client_runner.start_request_json = request_json + client_runner.cuda_visible_gpu_ids_str = cuda_visible_gpu_ids_str + run_id_str = str(run_id) + self.run_process_event = multiprocessing.Event() + client_runner.run_process_event = self.run_process_event + self.run_process_completed_event = multiprocessing.Event() + client_runner.run_process_completed_event = self.run_process_completed_event + client_runner.server_id = request_json.get("server_id", "0") + self.run_extend_queue_list = self._generate_extend_queue_list() + logging.info("start the runner process.") + self.run_process = Process(target=client_runner.run, args=( + self.run_process_event, self.run_process_completed_event, self.run_extend_queue_list, + sender_message_queue, listener_message_queue, status_center_queue + )) + self.run_process.start() + return self.run_process diff --git a/python/fedml/computing/scheduler/slave/base_slave_job_runner_manager.py b/python/fedml/computing/scheduler/slave/base_slave_job_runner_manager.py new file mode 100755 index 0000000000..80e486224e --- /dev/null +++ b/python/fedml/computing/scheduler/slave/base_slave_job_runner_manager.py @@ -0,0 +1,12 @@ + +from abc import ABC, abstractmethod +from ..scheduler_core.scheduler_base_job_runner_manager import FedMLSchedulerBaseJobRunnerManager +from ..scheduler_core.scheduler_base_job_runner import FedMLSchedulerBaseJobRunner + + +class FedMLBaseSlaveJobRunnerManager(FedMLSchedulerBaseJobRunnerManager, ABC): + def __init__(self): + FedMLSchedulerBaseJobRunnerManager.__init__(self) + + def cleanup_containers_and_release_gpus(self, run_id, edge_id, job_type): + FedMLSchedulerBaseJobRunner.cleanup_containers_and_release_gpus(run_id, edge_id, job_type) diff --git a/python/fedml/computing/scheduler/slave/base_slave_protocol_manager.py b/python/fedml/computing/scheduler/slave/base_slave_protocol_manager.py new file mode 100755 index 0000000000..3acd9f2488 --- /dev/null +++ b/python/fedml/computing/scheduler/slave/base_slave_protocol_manager.py @@ -0,0 +1,564 @@ + +import json +import logging +import os +import time +import traceback +from abc import ABC, abstractmethod + +import fedml +from ..comm_utils.constants import SchedulerConstants +from ..comm_utils.job_utils import JobRunnerUtils, DockerArgs +from ..comm_utils.run_process_utils import RunProcessUtils +from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog +from ....core.mlops.mlops_configs import MLOpsConfigs +from ....core.mlops.mlops_runtime_log_daemon 
import MLOpsRuntimeLogDaemon +from ..comm_utils import sys_utils +from ....core.mlops.mlops_utils import MLOpsUtils +from ..scheduler_core.compute_cache_manager import ComputeCacheManager +from ..scheduler_core.ota_upgrade import FedMLOtaUpgrade +from .client_data_interface import FedMLClientDataInterface +from ..scheduler_core.scheduler_base_protocol_manager import FedMLSchedulerBaseProtocolManager +from ..scheduler_core.general_constants import GeneralConstants + + +class FedMLBaseSlaveProtocolManager(FedMLSchedulerBaseProtocolManager, ABC): + + def __init__(self, args, agent_config=None): + FedMLSchedulerBaseProtocolManager.__init__(self, args, agent_config=agent_config) + + self.request_json = None + self.disable_client_login = None + self.args = args + self.message_status_runner = None + self.message_center = None + self.status_center = None + self.run_id = None + self.edge_id = args.edge_id + self.general_edge_id = None + self.edge_user_name = args.user_name + self.edge_extra_url = args.extra_url + self.server_agent_id = args.edge_id + self.current_device_id = args.current_device_id + self.unique_device_id = args.unique_device_id + self.agent_config = agent_config + self.topic_start_train = None + self.topic_report_status = None + self.topic_ota_msg = None + self.topic_request_device_info = None + self.topic_request_device_info_from_mlops = None + self.topic_client_logout = None + self.topic_response_job_status = None + self.topic_report_device_status_in_job = None + self.fl_topic_start_train = None + self.fl_topic_request_device_info = None + self.communication_mgr = None + self.subscribed_topics = list() + self.ota_upgrade = FedMLOtaUpgrade(edge_id=args.edge_id) + self.running_request_json = dict() + self.start_request_json = None + self.user_name = args.user_name + self.general_edge_id = args.general_edge_id + self.server_id = args.server_id + self.model_device_server_id = None + self.model_device_client_edge_id_list = None + self.model_device_server = None + self.model_device_client_list = None + + @abstractmethod + def generate_topics(self): + # The MQTT message topic format is as follows: // + + # The topic for stopping training + self.topic_start_train = "flserver_agent/" + str(self.edge_id) + "/start_train" + + # The topic for reporting current device status. + self.topic_report_status = "mlops/report_device_status" + + # The topic for OTA messages from the MLOps. + self.topic_ota_msg = "mlops/flclient_agent_" + str(self.edge_id) + "/ota" + + # The topic for requesting device info from the client. + self.topic_request_device_info = "server/client/request_device_info/" + str(self.edge_id) + + # The topic for requesting device info from mlops. + self.topic_request_device_info_from_mlops = f"deploy/mlops/slave_agent/request_device_info/{self.edge_id}" + + # The topic for requesting device info from MLOps. + self.topic_client_logout = "mlops/client/logout/" + str(self.edge_id) + + # The topic for getting job status from the status center. + self.topic_response_job_status = f"master_agent/somewhere/response_job_status/{self.edge_id}" + + # The topic for getting device status of job from the status center. + self.topic_report_device_status_in_job = f"slave_job/slave_agent/report_device_status_in_job" + + # The topic for reporting online status + self.topic_active = "flclient_agent/active" + + # The topic for last-will messages. 
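+        # (Presumably published by the MQTT broker on this agent's behalf if the
+        # connection drops unexpectedly, per standard last-will semantics.)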
+ self.topic_last_will = "flclient_agent/last_will_msg" + + if self.general_edge_id is not None: + self.fl_topic_start_train = "flserver_agent/" + str(self.general_edge_id) + "/start_train" + self.fl_topic_request_device_info = "server/client/request_device_info/" + str(self.general_edge_id) + + # Subscribe topics for starting train, stopping train and fetching client status. + self.subscribed_topics.clear() + self.add_subscribe_topic(self.topic_start_train) + self.add_subscribe_topic(self.topic_report_status) + self.add_subscribe_topic(self.topic_ota_msg) + self.add_subscribe_topic(self.topic_request_device_info) + self.add_subscribe_topic(self.topic_request_device_info_from_mlops) + self.add_subscribe_topic(self.topic_client_logout) + self.add_subscribe_topic(self.topic_response_job_status) + self.add_subscribe_topic(self.topic_report_device_status_in_job) + if self.general_edge_id is not None: + self.add_subscribe_topic(self.fl_topic_start_train) + self.add_subscribe_topic(self.fl_topic_request_device_info) + + @abstractmethod + def add_protocol_handler(self): + # Add the message listeners for all topics, the following is an example. + # self.add_message_listener(self.topic_start_train, self.callback_start_train) + # Add the message listeners for all topics + self.add_message_listener(self.topic_start_train, self.callback_start_train) + self.add_message_listener(self.topic_ota_msg, FedMLBaseSlaveProtocolManager.callback_client_ota_msg) + self.add_message_listener(self.topic_report_status, self.callback_report_current_status) + self.add_message_listener(self.topic_request_device_info, self.callback_report_device_info) + self.add_message_listener(self.topic_request_device_info_from_mlops, self.callback_request_device_info_from_mlops) + self.add_message_listener(self.topic_client_logout, self.callback_client_logout) + self.add_message_listener(self.topic_response_job_status, self.callback_response_job_status) + self.add_message_listener(self.topic_report_device_status_in_job, self.callback_response_device_status_in_job) + self.add_message_listener(self.fl_topic_start_train, self.callback_start_train) + self.add_message_listener(self.fl_topic_request_device_info, self.callback_report_device_info) + + @abstractmethod + def _get_job_runner_manager(self): + return None + + @abstractmethod + def _init_extra_items(self): + os.environ["FEDML_CURRENT_EDGE_ID"] = str(self.edge_id) + if not ComputeCacheManager.get_instance().set_redis_params(): + os.environ["FEDML_DISABLE_REDIS_CONNECTION"] = "1" + + def add_subscribe_topic(self, topic): + self.subscribed_topics.append(topic) + + def stop(self): + if self.model_device_server is not None: + self.model_device_server.stop() + self.model_device_server = None + + if self.model_device_client_list is not None: + for model_client in self.model_device_client_list: + model_client.stop() + self.model_device_client_list.clear() + self.model_device_client_list = None + + super().stop() + + def on_agent_communication_connected(self, mqtt_client_object): + super().on_agent_communication_connected(mqtt_client_object) + + self._process_connection_ready() + + payload = {"model_master_device_id": self.model_device_server_id, + "model_slave_device_id_list": self.model_device_client_edge_id_list} + self.receive_message(self.topic_request_device_info, json.dumps(payload)) + + def on_agent_communication_disconnected(self, mqtt_client_object): + super().on_agent_communication_disconnected(mqtt_client_object) + + self._process_connection_lost() + + @abstractmethod + def 
_process_connection_ready(self): + pass + + @abstractmethod + def _process_connection_lost(self): + pass + + def print_connected_info(self): + print("\nCongratulations, your device is connected to the FedML MLOps platform successfully!") + print(f"Your FedML Edge ID is {str(self.edge_id)}, unique device ID is {str(self.unique_device_id)}, " + f"master deploy ID is {str(self.model_device_server_id)}, " + f"worker deploy ID is {self.model_device_client_edge_id_list}" + ) + if self.edge_extra_url is not None and self.edge_extra_url != "": + print(f"You may visit the following url to fill in more information with your device.\n" + f"{self.edge_extra_url}") + + def callback_start_train(self, topic, payload): + # Parse the parameters + request_json = json.loads(payload) + is_retain = request_json.get("is_retain", False) + if is_retain: + return + run_id = request_json["runId"] + edge_id = str(topic).split("/")[-2] + self.args.run_id = run_id + self.args.edge_id = edge_id + + # Start log processor for current run + MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) + MLOpsRuntimeLogDaemon.get_instance(self.args).start_log_processor( + run_id, edge_id, log_source=SchedulerConstants.get_log_source(request_json)) + logging.info("start the log processor") + + # Fetch the config + try: + MLOpsConfigs.fetch_all_configs() + except Exception as e: + logging.error(f"Failed to fetch all configs with Exception {e}. Traceback: {traceback.format_exc()}") + pass + + # Check if the slave agent is disabled. + if not FedMLClientDataInterface.get_instance().get_agent_status(): + request_json = json.loads(payload) + run_id = request_json["runId"] + logging.error( + "FedMLDebug - Receive: topic ({}), payload ({}), but the client agent is disabled. {}".format( + topic, payload, traceback.format_exc() + ) + ) + # Send failed msg when exceptions. 
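+            # The client agent is disabled: report an EXCEPTION status for this run
+            # instead of launching it, then stop the run's log processor and return.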
+ self.status_reporter.report_client_id_status( + edge_id, GeneralConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION, run_id=run_id, + msg=f"the client agent {edge_id} is disabled") + MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, edge_id) + return + + # Print the payload + logging.info( + f"FedMLDebug - run id {run_id}, Receive at callback_start_train: topic ({topic}), payload ({payload})" + ) + + # Occupy GPUs + server_agent_id = request_json["cloud_agent_id"] + scheduler_match_info = request_json.get("scheduler_match_info", {}) + matched_gpu_num = scheduler_match_info.get("matched_gpu_num", 0) + model_master_device_id = scheduler_match_info.get("model_master_device_id", None) + model_slave_device_id = scheduler_match_info.get("model_slave_device_id", None) + model_slave_device_id_list = scheduler_match_info.get("model_slave_device_id_list", None) + run_config = request_json.get("run_config", {}) + run_params = run_config.get("parameters", {}) + serving_args = run_params.get("serving_args", {}) + endpoint_id = serving_args.get("endpoint_id", None) + job_yaml = run_params.get("job_yaml", {}) + job_type = job_yaml.get("job_type", SchedulerConstants.JOB_TASK_TYPE_TRAIN) + cuda_visible_gpu_ids_str = None + if not (job_type == SchedulerConstants.JOB_TASK_TYPE_SERVE or + job_type == SchedulerConstants.JOB_TASK_TYPE_DEPLOY): + cuda_visible_gpu_ids_str = JobRunnerUtils.get_instance().occupy_gpu_ids( + run_id, matched_gpu_num, edge_id, inner_id=endpoint_id, + model_master_device_id=model_master_device_id, + model_slave_device_id=model_slave_device_id) + else: + # Save the relationship between run id and endpoint + ComputeCacheManager.get_instance().set_redis_params() + ComputeCacheManager.get_instance().get_gpu_cache().set_endpoint_run_id_map( + endpoint_id, run_id) + + # Report the run status with finished status and return + self.generate_status_report(run_id, edge_id, server_agent_id=server_agent_id).report_client_id_status( + edge_id, GeneralConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED, run_id=run_id) + return + logging.info( + f"Run started, available gpu ids: {JobRunnerUtils.get_instance().get_available_gpu_id_list(edge_id)}") + + # Set the listener for job status from master agent + self.setup_listener_job_status(run_id) + + # Start server with multiprocessing mode + self.request_json = request_json + run_id_str = str(run_id) + self.running_request_json[run_id_str] = request_json + self._get_job_runner_manager().start_job_runner( + run_id, request_json, args=self.args, edge_id=edge_id, + sender_message_queue=self.message_center.get_sender_message_queue(), + listener_message_queue=self.get_listener_message_queue(), + status_center_queue=self.get_status_queue(), + cuda_visible_gpu_ids_str=cuda_visible_gpu_ids_str, + ) + run_process = self._get_job_runner_manager().get_runner_process(run_id) + if run_process is not None: + GeneralConstants.save_run_process(run_id, run_process.pid) + + # Register the job launch message into the status center + self.register_job_launch_message(topic, payload) + + def callback_report_current_status(self, topic, payload): + logging.info( + f"FedMLDebug - Receive: topic ({topic}), payload ({payload})" + ) + + self.send_agent_active_msg(self.edge_id) + if self.general_edge_id is not None: + self.send_agent_active_msg(self.general_edge_id) + + @staticmethod + def callback_client_ota_msg(topic, payload): + logging.info( + f"FedMLDebug - Receive: topic ({topic}), payload ({payload})" + ) + + request_json = json.loads(payload) + cmd = 
request_json["cmd"] + + if cmd == GeneralConstants.FEDML_OTA_CMD_UPGRADE: + FedMLOtaUpgrade.process_ota_upgrade_msg() + # Process(target=FedMLClientRunner.process_ota_upgrade_msg).start() + raise Exception("After upgraded, restart runner...") + elif cmd == GeneralConstants.FEDML_OTA_CMD_RESTART: + raise Exception("Restart runner...") + + def callback_report_device_info(self, topic, payload): + payload_json = json.loads(payload) + server_id = payload_json.get("server_id", 0) + run_id = payload_json.get("run_id", 0) + listen_edge_id = str(topic).split("/")[-1] + context = payload_json.get("context", None) + need_gpu_info = payload_json.get("need_gpu_info", True) + need_running_process_list = payload_json.get("need_running_process_list", False) + model_master_device_id = payload_json.get("model_master_device_id", None) + model_slave_device_id_list = payload_json.get("model_slave_device_id_list", None) + if model_master_device_id is not None: + self.model_device_server_id = model_master_device_id + if model_slave_device_id_list is not None: + self.model_device_client_edge_id_list = model_slave_device_id_list + response_topic = f"client/server/response_device_info/{server_id}" + if self.mlops_metrics is not None: + if not need_gpu_info: + device_info_json = { + "edge_id": listen_edge_id, + "fedml_version": fedml.__version__, + "user_id": self.args.user_name + } + else: + total_mem, free_mem, total_disk_size, free_disk_size, cup_utilization, cpu_cores, gpu_cores_total, \ + gpu_cores_available, sent_bytes, recv_bytes, gpu_available_ids = sys_utils.get_sys_realtime_stats() + logging.info(f"GPU available ids from get_sys_realtime_stats --> {gpu_available_ids}") + host_ip = sys_utils.get_host_ip() + host_port = sys_utils.get_available_port() + gpu_available_ids = JobRunnerUtils.get_available_gpu_id_list(self.edge_id) + logging.info(f"GPU available ids from get_available_gpu_id_list(device_id) --> {gpu_available_ids}") + gpu_available_ids = JobRunnerUtils.trim_unavailable_gpu_ids(gpu_available_ids) + logging.info(f"GPU available ids from trim_unavailable_gpu_ids --> {gpu_available_ids}") + gpu_cores_available = len(gpu_available_ids) + gpu_list = sys_utils.get_gpu_list() + device_info_json = { + "edge_id": listen_edge_id, + "memoryTotal": round(total_mem * MLOpsUtils.BYTES_TO_GB, 2), + "memoryAvailable": round(free_mem * MLOpsUtils.BYTES_TO_GB, 2), + "diskSpaceTotal": round(total_disk_size * MLOpsUtils.BYTES_TO_GB, 2), + "diskSpaceAvailable": round(free_disk_size * MLOpsUtils.BYTES_TO_GB, 2), + "cpuUtilization": round(cup_utilization, 2), + "cpuCores": cpu_cores, + "gpuCoresTotal": gpu_cores_total, + "gpuCoresAvailable": gpu_cores_available, + "gpu_available_ids": gpu_available_ids, + "gpu_list": gpu_list, + "node_ip": host_ip, + "node_port": host_port, + "networkTraffic": sent_bytes + recv_bytes, + "updateTime": int(MLOpsUtils.get_ntp_time()), + "fedml_version": fedml.__version__, + "user_id": self.args.user_name + } + if need_running_process_list: + device_info_json["run_process_list_map"] = self.get_all_run_process_list_map() + salve_device_ids = list() + if self.model_device_client_edge_id_list is not None and \ + isinstance(self.model_device_client_edge_id_list, list): + for model_client_edge_id in self.model_device_client_edge_id_list: + salve_device_ids.append(model_client_edge_id) + response_payload = {"slave_device_id": None if len(salve_device_ids) <= 0 else salve_device_ids[0], + "slave_device_id_list": salve_device_ids, + "master_device_id": self.model_device_server_id, + "run_id": 
run_id, "edge_id": listen_edge_id, + "edge_info": device_info_json} + if context is not None: + response_payload["context"] = context + + logging.info(f"Response payload --> {response_payload}") + self.message_center.send_message(response_topic, json.dumps(response_payload), run_id=run_id) + + def callback_request_device_info_from_mlops(self, topic, payload): + self.response_device_info_to_mlops(topic, payload) + + def response_device_info_to_mlops(self, topic, payload): + response_topic = f"deploy/slave_agent/mlops/response_device_info" + response_payload = {"run_id": self.run_id, "slave_agent_device_id": self.edge_id, + "fedml_version": fedml.__version__, "edge_id": self.edge_id} + self.message_center.send_message(response_topic, json.dumps(response_payload)) + + def callback_client_logout(self, topic, payload): + payload_json = json.loads(payload) + secret = payload_json.get("auth", None) + if secret is None or str(secret) != "246b1be6-0eeb-4b17-b118-7d74de1975d4": + return + logging.info("Received the logout request.") + self._get_job_runner_manager().stop_all_job_runner() + self.disable_client_login = True + time.sleep(3) + os.system("fedml logout") + + def callback_response_device_status_in_job(self, topic, payload): + # Parse the parameters + payload_json = json.loads(payload) + run_id = payload_json.get("run_id", None) + job_status = payload_json.get("status", None) + edge_id = payload_json.get("edge_id", None) + + # process the status + logging.info("process status in the device status callback.") + self.process_status(run_id, job_status, edge_id) + + def callback_response_job_status(self, topic, payload): + # Parse the parameters + payload_json = json.loads(payload) + run_id = payload_json.get("run_id", None) + master_agent = payload_json.get("master_agent", None) + job_status = payload_json.get("job_status", None) + fedml_version = payload_json.get("fedml_version", None) + edge_id = payload_json.get("edge_id", None) + + # process the status + logging.info("process status in the job status callback.") + self.process_status(run_id, job_status, edge_id, master_id=master_agent) + + def callback_broadcasted_job_status(self, topic, payload): + # Parse the parameters + payload_json = json.loads(payload) + run_id = payload_json.get("run_id", None) + job_status = payload_json.get("status", None) + + # process the status + logging.info("process status in the broadcast job status callback.") + self.process_status(run_id, job_status, self.edge_id) + + def generate_protocol_manager(self): + message_status_runner = self._generate_protocol_manager_instance( + self.args, agent_config=self.agent_config + ) + message_status_runner.request_json = self.request_json + message_status_runner.disable_client_login = self.disable_client_login + message_status_runner.message_center_name = self.message_center_name + message_status_runner.run_id = self.run_id + message_status_runner.edge_id = self.edge_id + message_status_runner.edge_user_name = self.edge_user_name + message_status_runner.edge_extra_url = self.edge_extra_url + message_status_runner.server_agent_id = self.server_agent_id + message_status_runner.current_device_id = self.current_device_id + message_status_runner.unique_device_id = self.unique_device_id + message_status_runner.subscribed_topics = self.subscribed_topics + message_status_runner.running_request_json = self.running_request_json + message_status_runner.request_json = self.start_request_json + message_status_runner.user_name = self.user_name + message_status_runner.general_edge_id = 
self.general_edge_id + message_status_runner.server_id = self.server_id + message_status_runner.model_device_server_id = self.model_device_server_id + message_status_runner.model_device_client_edge_id_list = self.model_device_client_edge_id_list + message_status_runner.status_queue = self.get_status_queue() + + return message_status_runner + + def process_status(self, run_id, status, edge_id, master_id=None): + run_id_str = str(run_id) + + # Process the completed status + if status == GeneralConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED or \ + status == GeneralConstants.MSG_MLOPS_CLIENT_STATUS_FAILED or \ + status == GeneralConstants.MSG_MLOPS_CLIENT_STATUS_KILLED: + self._get_job_runner_manager().complete_job_runner(run_id) + + # Stop the sys perf process + # noinspection PyBoardException + try: + self.mlops_metrics.stop_sys_perf() + except Exception as ex: + logging.error(f"Failed to stop sys perf with Exception {ex}. Traceback: {traceback.format_exc()}") + pass + + # Stop the user process + try: + GeneralConstants.cleanup_learning_process(run_id) + GeneralConstants.cleanup_bootstrap_process(run_id) + GeneralConstants.cleanup_run_process(run_id) + except Exception as e: + logging.error( + f"Failed to cleanup run when finished with Exception {e}. Traceback: {traceback.format_exc()}") + pass + + # Get the running json. + running_json = self.running_request_json.get(run_id_str) + if running_json is None: + try: + current_job = FedMLClientDataInterface.get_instance().get_job_by_id(run_id) + running_json = json.loads(current_job.running_json) + except Exception as e: + logging.error(f"Failed to get running json with Exception {e}. Traceback: {traceback.format_exc()}") + + # Cleanup the containers and release the gpu ids. + if running_json is not None: + job_type = JobRunnerUtils.parse_job_type(running_json) + if not SchedulerConstants.is_deploy_job(job_type): + logging.info(f"[run/device][{run_id}/{edge_id}] Release gpu resource when run ended.") + self._get_job_runner_manager().cleanup_containers_and_release_gpus(run_id, edge_id, job_type) + + # Stop the runner process + run_process = self._get_job_runner_manager().get_runner_process(run_id) + if run_process is not None: + if run_process.pid is not None: + RunProcessUtils.kill_process(run_process.pid) + # Check if docker client exists and then terminate containers. + if JobRunnerUtils.docker_client_exists(): + try: + # Terminate docker container. + docker_client = JobRunnerUtils.get_docker_client(DockerArgs()) + container_name = JobRunnerUtils.get_run_container_name(run_id) + logging.info(f"Terminating the run docker container {container_name} if exists...") + JobRunnerUtils.remove_run_container_if_exists(container_name, docker_client) + except Exception as e: + logging.error(f"Error occurred when terminating docker container." 
+ f"Exception: {e}, Traceback: {traceback.format_exc()}.") + + # Stop log processor for current run + MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, edge_id) + + def setup_listener_job_status(self, run_id): + # Setup MQTT message listener to receive the job status from master agent; + topic_job_status_from_master = f"master_agent/slave_agent/job_status/{run_id}" + self.add_message_listener(topic_job_status_from_master, self.callback_broadcasted_job_status) + self.subscribe_msg(topic_job_status_from_master) + + def remove_listener_job_status(self, run_id): + # Remove MQTT message listener from master agent; + topic_job_status_from_master = f"master_agent/slave_agent/job_status/{run_id}" + self.remove_message_listener(topic_job_status_from_master) + self.unsubscribe_msg(topic_job_status_from_master) + + def get_all_run_process_list_map(self): + run_process_dict = dict() + all_runner_pid_dict = self._get_job_runner_manager().get_all_runner_pid_map() + if all_runner_pid_dict is None: + return run_process_dict + for run_id_str, process in all_runner_pid_dict.items(): + cur_run_process_list = GeneralConstants.get_learning_process_list(run_id_str) + run_process_dict[run_id_str] = cur_run_process_list + + return run_process_dict + + def stop_job(self, run_id): + self._get_job_runner_manager().stop_job_runner(run_id) + + @staticmethod + def get_start_train_topic_with_edge_id(edge_id): + return "flserver_agent/" + str(edge_id) + "/start_train" + + @abstractmethod + def _generate_protocol_manager_instance(self, args, agent_config=None): + return None diff --git a/python/fedml/computing/scheduler/slave/client_constants.py b/python/fedml/computing/scheduler/slave/client_constants.py index 2e15080541..e5b3d41846 100644 --- a/python/fedml/computing/scheduler/slave/client_constants.py +++ b/python/fedml/computing/scheduler/slave/client_constants.py @@ -153,6 +153,13 @@ def get_database_dir(): os.makedirs(database_dir, exist_ok=True) return database_dir + @staticmethod + def get_global_services_dir(): + home_dir = expanduser("~") + global_services_dir = os.path.join(home_dir, ".fedml", "global_services") + os.makedirs(global_services_dir, exist_ok=True) + return global_services_dir + @staticmethod def cleanup_run_process(run_id): RunProcessUtils.cleanup_run_process( @@ -454,7 +461,6 @@ def remove_fedml_parent_pid_file(): f"Traceback: {traceback.format_exc()}") pass - if __name__ == "__main__": ignore = "*test*,abc*" ignore = tuple(ignore.split(',')) diff --git a/python/fedml/computing/scheduler/slave/client_daemon.py b/python/fedml/computing/scheduler/slave/client_daemon.py index e543115b4c..14b841707f 100755 --- a/python/fedml/computing/scheduler/slave/client_daemon.py +++ b/python/fedml/computing/scheduler/slave/client_daemon.py @@ -1,4 +1,3 @@ - import argparse import os import time @@ -7,12 +6,11 @@ import fedml from fedml.computing.scheduler.comm_utils.sys_utils import cleanup_all_fedml_client_api_processes, \ - cleanup_all_fedml_client_learning_processes, cleanup_all_fedml_client_login_processes, get_python_program, \ - daemon_ota_upgrade + cleanup_all_fedml_client_learning_processes, cleanup_all_fedml_client_login_processes, get_python_program +from fedml.computing.scheduler.scheduler_core.general_constants import MarketplaceType from fedml.computing.scheduler.slave.client_constants import ClientConstants from fedml.computing.scheduler.comm_utils.run_process_utils import RunProcessUtils - if __name__ == "__main__": parser = 
argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("--type", "-t", help="Login or logout to MLOps platform") @@ -26,6 +24,9 @@ parser.add_argument("--no_gpu_check", "-ngc", type=int, default=1) parser.add_argument("--local_on_premise_platform_host", "-lp", type=str, default="127.0.0.1") parser.add_argument("--local_on_premise_platform_port", "-lpp", type=int, default=80) + parser.add_argument("--marketplace_type", "-mpt", type=str, default=MarketplaceType.SECURE.name) + parser.add_argument("--price_per_hour", "-pph", type=str, default="0.0") + parser.add_argument("--name", "-n", type=str, nargs='?', default="") args = parser.parse_args() args.user = args.user @@ -59,7 +60,6 @@ logging.error(f"Cleanup failed | Exception: {e}") pass - # daemon_ota_upgrade(args) if platform.system() == "Windows": @@ -84,21 +84,32 @@ "-k", args.api_key, "-ngc", - str(args.no_gpu_check) + str(args.no_gpu_check), + "-mpt", + args.marketplace_type, + "-pph", + args.price_per_hour, + "-n", + args.name ] ) ret_code, exec_out, exec_err = ClientConstants.get_console_sys_out_pipe_err_results(login_pid) time.sleep(3) else: login_logs = os.path.join(ClientConstants.get_log_file_dir(), "login.log") + # If we use this kind of command, we cannot penetrate the environment variables to the subprocess run_login_cmd = f"nohup {get_python_program()} -W ignore {login_cmd} -t login -u {args.user} " \ f"-v {args.version} -r {args.role} -id {args.device_id} " \ - f"-k {args.api_key} -ngc {str(args.no_gpu_check)} > {login_logs} 2>&1 &" + f"-k {args.api_key} -ngc {str(args.no_gpu_check)} -mpt {args.marketplace_type} " \ + f"-pph {args.price_per_hour} -n {args.name} > {login_logs} 2>&1 &" if args.os_name != "": run_login_cmd += f" -os {args.os_name}" os.system(run_login_cmd) login_pids = RunProcessUtils.get_pid_from_cmd_line(login_cmd) + if len(login_pids) == 0: + print(f"[Client] Cannot find login pid {login_pids}, check the log file {login_logs}") + retry_count += 1 while len(login_pids) > 0: with open(login_logs, "r") as f: log_list = f.readlines() diff --git a/python/fedml/computing/scheduler/slave/client_data_interface.py b/python/fedml/computing/scheduler/slave/client_data_interface.py index 34a7b89bd2..0e9e84381a 100755 --- a/python/fedml/computing/scheduler/slave/client_data_interface.py +++ b/python/fedml/computing/scheduler/slave/client_data_interface.py @@ -143,7 +143,7 @@ def create_job_table(self): updated_time TEXT, round_index INT, total_rounds INT, - running_json TEXT);''') + running_json TEXT NULL);''') self.db_connection.commit() except Exception as e: pass @@ -405,14 +405,14 @@ class FedMLClientJobModel(object): def __init__(self): self.job_id = 0 self.edge_id = 0 - self.started_time = "" - self.ended_time = "" - self.progress = 0 - self.eta = 0 - self.failed_time = "" + self.started_time = "0" + self.ended_time = "0" + self.progress = 0.0 + self.eta = 0.0 + self.failed_time = "0" self.error_code = -1 self.msg = "" - self.updated_time = "" + self.updated_time = "0" self.round_index = 0 self.total_rounds = 0 self.status = "" diff --git a/python/fedml/computing/scheduler/slave/client_login.py b/python/fedml/computing/scheduler/slave/client_login.py index c8123a717c..6d8b9d1ae1 100755 --- a/python/fedml/computing/scheduler/slave/client_login.py +++ b/python/fedml/computing/scheduler/slave/client_login.py @@ -1,332 +1,12 @@ - import argparse -import json -import logging import os -import platform -import subprocess -import time -import traceback - -import click import 
fedml -from fedml.computing.scheduler.comm_utils import sys_utils -from fedml.computing.scheduler.comm_utils.constants import SchedulerConstants -from fedml.computing.scheduler.slave.client_runner import FedMLClientRunner -from fedml.computing.scheduler.slave.client_constants import ClientConstants -from fedml.core.mlops.mlops_runtime_log import MLOpsRuntimeLog -from fedml.core.mlops.mlops_runtime_log_daemon import MLOpsRuntimeLogDaemon - - -def init_logs(args, edge_id): - # Init runtime logs - args.log_file_dir = ClientConstants.get_log_file_dir() - args.run_id = 0 - args.role = "client" - client_ids = list() - client_ids.append(edge_id) - args.client_id_list = json.dumps(client_ids) - setattr(args, "using_mlops", True) - MLOpsRuntimeLog.get_instance(args).init_logs() - - -def __login_as_client(args, userid, api_key="", use_extra_device_id_suffix=None, role="client"): - setattr(args, "account_id", userid) - setattr(args, "current_running_dir", ClientConstants.get_fedml_home_dir()) - - sys_name = platform.system() - if sys_name == "Darwin": - sys_name = "MacOS" - if hasattr(args, "os_name") and args.os_name is not None and args.os_name != "": - pass - else: - setattr(args, "os_name", sys_name) - version = fedml.get_env_version() - setattr(args, "version", version) - setattr(args, "log_file_dir", ClientConstants.get_log_file_dir()) - is_from_docker = False - if hasattr(args, "device_id") and args.device_id is not None and args.device_id != "0": - setattr(args, "current_device_id", args.device_id) - is_from_docker = True - else: - is_gpu_supplier = (role == ClientConstants.login_role_list[ClientConstants.LOGIN_MODE_GPU_SUPPLIER_INDEX]) - setattr(args, "current_device_id", FedMLClientRunner.get_device_id(use_machine_id=is_gpu_supplier)) - setattr(args, "config_version", version) - setattr(args, "cloud_region", "") - - # Create client runner for communication with the FedML server. - runner = FedMLClientRunner(args) - - # Fetch configs from the MLOps config server. 
- service_config = dict() - config_try_count = 0 - edge_id = 0 - while config_try_count < 5: - try: - mqtt_config, s3_config, mlops_config, docker_config = runner.fetch_configs() - service_config["mqtt_config"] = mqtt_config - service_config["s3_config"] = s3_config - service_config["ml_ops_config"] = mlops_config - service_config["docker_config"] = docker_config - runner.agent_config = service_config - # click.echo("service_config = {}".format(service_config)) - log_server_url = mlops_config.get("LOG_SERVER_URL", None) - if log_server_url is not None: - setattr(args, "log_server_url", log_server_url) - setattr(runner.args, "log_server_url", log_server_url) - break - except Exception as e: - click.echo("{}\n{}".format(SchedulerConstants.ERR_MSG_BINDING_EXCEPTION_1, traceback.format_exc())) - click.echo(SchedulerConstants.ERR_MSG_BINDING_EXIT_RETRYING) - config_try_count += 1 - time.sleep(3) - continue - - if config_try_count >= 5: - click.echo("") - click.echo("[1] Oops, you failed to login the FedML MLOps platform.") - click.echo("Please check whether your network is normal!") - return - - # Judge whether running from fedml docker hub - is_from_fedml_docker_hub = False - dock_loc_file = ClientConstants.get_docker_location_file() - if os.path.exists(dock_loc_file): - is_from_fedml_docker_hub = True - - # Build unique device id - if is_from_docker: - unique_device_id = args.current_device_id + "@" + args.os_name + ".Docker.Edge.Device" - else: - unique_device_id = args.current_device_id + "@" + args.os_name + ".Edge.Device" - if is_from_fedml_docker_hub: - unique_device_id = args.current_device_id + "@" + args.os_name + ".DockerHub.Edge.Device" - - if use_extra_device_id_suffix is not None: - unique_device_id = args.current_device_id + "@" + args.os_name + use_extra_device_id_suffix - - # Bind account id to FedML® Nexus AI Platform - register_try_count = 0 - edge_id = -1 - user_name = None - extra_url = None - general_edge_id = None - while register_try_count < 5: - try: - edge_id, user_name, extra_url, general_edge_id = runner.bind_account_and_device_id( - service_config["ml_ops_config"]["EDGE_BINDING_URL"], args.account_id, unique_device_id, args.os_name, - api_key=api_key, role=role - ) - if edge_id > 0: - runner.edge_id = edge_id - runner.edge_user_name = user_name - runner.edge_extra_url = extra_url - break - except SystemExit as e: - click.echo("Your account does not exist. Please make sure your account correct.") - os.system("fedml logout -c") - return - except Exception as e: - click.echo("{}\n{}".format(SchedulerConstants.ERR_MSG_BINDING_EXCEPTION_2, traceback.format_exc())) - click.echo(SchedulerConstants.ERR_MSG_BINDING_EXIT_RETRYING) - register_try_count += 1 - time.sleep(3) - continue - - if edge_id <= 0: - click.echo("") - click.echo("[2] Oops, you failed to login the FedML MLOps platform.") - click.echo("Please check whether your network is normal!") - return - - # Init runtime logs - setattr(args, "client_id", edge_id) - setattr(args, "is_from_docker", is_from_docker) - runner.args = args - init_logs(args, edge_id) - # logging.info("args {}".format(args)) - - # Log arguments and binding results. - # logging.info("login: unique_device_id = %s" % str(unique_device_id)) - # logging.info("login: edge_id = %s" % str(edge_id)) - runner.unique_device_id = unique_device_id - runner.user_name = user_name - runner.general_edge_id = general_edge_id - ClientConstants.save_runner_infos(args.current_device_id + "." 
+ args.os_name, edge_id, run_id=0) - - # Setup MQTT connection for communication with the FedML server. - try: - runner.setup_agent_mqtt_connection(service_config) - except Exception as e: - login_exit_file = os.path.join(ClientConstants.get_log_file_dir(), "exited.log") - with open(login_exit_file, "w") as f: - f.writelines(f"{os.getpid()}.") - print("finally") - runner.stop_agent() - raise e - - # Start mqtt looper - runner.start_agent_mqtt_loop() - - -def __login_as_simulator(args, userid, mqtt_connection=True): - setattr(args, "account_id", userid) - setattr(args, "current_running_dir", ClientConstants.get_fedml_home_dir()) - - sys_name = platform.system() - if sys_name == "Darwin": - sys_name = "MacOS" - setattr(args, "os_name", sys_name) - version = fedml.get_env_version() - setattr(args, "version", version) - setattr(args, "log_file_dir", ClientConstants.get_log_file_dir()) - setattr(args, "device_id", FedMLClientRunner.get_device_id()) - setattr(args, "current_device_id", FedMLClientRunner.get_device_id()) - setattr(args, "config_version", version) - setattr(args, "cloud_region", "") - - - # Create client runner for communication with the FedML server. - runner = FedMLClientRunner(args) - - # Fetch configs from the MLOps config server. - service_config = dict() - config_try_count = 0 - edge_id = 0 - while config_try_count < 5: - try: - mqtt_config, s3_config, mlops_config, docker_config = runner.fetch_configs() - service_config["mqtt_config"] = mqtt_config - service_config["s3_config"] = s3_config - service_config["ml_ops_config"] = mlops_config - service_config["docker_config"] = docker_config - runner.agent_config = service_config - log_server_url = mlops_config.get("LOG_SERVER_URL", None) - if log_server_url is not None: - setattr(args, "log_server_url", log_server_url) - setattr(runner.args, "log_server_url", log_server_url) - break - except Exception as e: - config_try_count += 1 - time.sleep(3) - continue - - if config_try_count >= 5: - click.echo("") - click.echo("[3] Oops, you failed to login the FedML MLOps platform.") - click.echo("Please check whether your network is normal!") - return False, edge_id, args - - # Build unique device id - if args.device_id is not None and len(str(args.device_id)) > 0: - unique_device_id = args.device_id + "@" + args.os_name + ".Edge.Simulator" - - # Bind account id to FedML® Nexus AI Platform - register_try_count = 0 - edge_id = -1 - user_name = None - extra_url = None - general_edge_id = None - while register_try_count < 5: - try: - edge_id, _, _, _ = runner.bind_account_and_device_id( - service_config["ml_ops_config"]["EDGE_BINDING_URL"], args.account_id, - unique_device_id, args.os_name, role="simulator" - ) - if edge_id > 0: - runner.edge_id = edge_id - break - except SystemExit as e: - click.echo("Your account does not exist. Please make sure your account correct.") - os.system("fedml logout -c") - return - except Exception as e: - register_try_count += 1 - time.sleep(3) - continue - - if edge_id <= 0: - click.echo("") - click.echo("[4] Oops, you failed to login the FedML MLOps platform.") - click.echo("Please check whether your network is normal!") - return False, edge_id, args - - # Init runtime logs - setattr(args, "client_id", edge_id) - runner.args = args - #init_logs(args, edge_id) - logging.info("args {}".format(args)) - - # Log arguments and binding results. 
- logging.info("login: unique_device_id = %s" % str(unique_device_id)) - logging.info("login: edge_id = %s" % str(edge_id)) - runner.unique_device_id = unique_device_id - - if mqtt_connection: - ClientConstants.save_runner_infos(args.device_id + "." + args.os_name, edge_id, run_id=0) - - # Setup MQTT connection for communication with the FedML server. - try: - runner.setup_agent_mqtt_connection(service_config) - except Exception as e: - pass - - # Open simulator daemon process to process run status. - simulator_daemon_cmd = os.path.join(os.path.dirname(__file__), "simulator_daemon.py") - env_version = fedml.get_env_version() - simulator_daemon_process = sys_utils.run_subprocess_open( - [ - sys_utils.get_python_program(), - simulator_daemon_cmd, - "-t", - "login", - "-u", - str(args.user), - "-v", - env_version, - "-r", - args.role, - "-id", - args.device_id, - "-os", - args.os_name, - "-rk", - "1", - "-lfd", - args.log_file_dir, - "-cf", - env_version, - "-ci", - str(edge_id) - ] - ).pid - - # Start mqtt looper - runner.start_agent_mqtt_loop() - - return True, edge_id, args - - -def login(args): - if args.role == ClientConstants.login_role_list[ClientConstants.LOGIN_MODE_CLIENT_INDEX]: - __login_as_client(args, args.user, api_key=args.api_key) - elif args.role == ClientConstants.login_role_list[ClientConstants.LOGIN_MODE_GPU_SUPPLIER_INDEX]: - if args.no_gpu_check == 0: - gpu_count, _ = sys_utils.get_gpu_count_vendor() - if gpu_count <= 0: - click.echo("We can't find any gpu device on your machine. \n" - "With the gpu_supplier(-g) option, you need to check if your machine " - "has nvidia GPUs and installs CUDA related drivers.") - return - __login_as_client(args, args.user, api_key=args.api_key, - use_extra_device_id_suffix=".Edge.GPU.Supplier", role=args.role) - elif args.role == ClientConstants.login_role_list[ClientConstants.LOGIN_MODE_EDGE_SIMULATOR_INDEX]: - __login_as_simulator(args, args.user) +from fedml.computing.scheduler.scheduler_core.general_constants import MarketplaceType +from fedml.computing.scheduler.slave.slave_agent import FedMLLaunchSlaveAgent def logout(): - ClientConstants.cleanup_run_process(None) - sys_utils.cleanup_all_fedml_client_api_processes() + FedMLLaunchSlaveAgent.logout() if __name__ == "__main__": @@ -345,6 +25,9 @@ def logout(): parser.add_argument("--no_gpu_check", "-ngc", type=int, default=1) parser.add_argument("--local_on_premise_platform_host", "-lp", type=str, default="127.0.0.1") parser.add_argument("--local_on_premise_platform_port", "-lpp", type=int, default=80) + parser.add_argument("--marketplace_type", "-mpt", type=str, default=MarketplaceType.SECURE.name) + parser.add_argument("--price_per_hour", "-pph", type=str, default="0.0") + parser.add_argument("--name", "-n", type=str, nargs='?', default="") args = parser.parse_args() args.user = args.user @@ -357,9 +40,10 @@ def logout(): fedml.set_local_on_premise_platform_port(args.local_on_premise_platform_port) fedml.set_env_version(args.version) + slave_agent = FedMLLaunchSlaveAgent() if args.type == 'login': - login(args) + slave_agent.login(userid=args.api_key, api_key=args.api_key, device_id=args.device_id, + os_name=args.os_name, role=args.role, marketplace_type=args.marketplace_type, + price_per_hour=args.price_per_hour, name=args.name) else: - logout() - - + FedMLLaunchSlaveAgent.logout() diff --git a/python/fedml/computing/scheduler/slave/client_runner.py b/python/fedml/computing/scheduler/slave/client_runner.py deleted file mode 100755 index 79b5697728..0000000000 --- 
a/python/fedml/computing/scheduler/slave/client_runner.py +++ /dev/null @@ -1,1872 +0,0 @@ -import json -import logging -import multiprocessing -import sys - -from multiprocessing import Process -import os -import platform -import shutil -import subprocess -import threading - -import time -import traceback -import urllib -import uuid -import zipfile -from urllib.parse import urljoin, urlparse - -import requests - -import fedml -from ..comm_utils.constants import SchedulerConstants -from ..comm_utils.job_cleanup import JobCleanup -from ..comm_utils.job_utils import JobRunnerUtils, DockerArgs -from ..comm_utils.run_process_utils import RunProcessUtils -from ..scheduler_entry.constants import Constants -from ....core.mlops.mlops_device_perfs import MLOpsDevicePerfStats -from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog - -from ....core.distributed.communication.mqtt.mqtt_manager import MqttManager -from ..comm_utils.yaml_utils import load_yaml_config -from .client_constants import ClientConstants - -from ....core.mlops.mlops_metrics import MLOpsMetrics - -from ....core.mlops.mlops_configs import MLOpsConfigs -from ....core.mlops.mlops_runtime_log_daemon import MLOpsRuntimeLogDaemon -from ....core.mlops.mlops_status import MLOpsStatus -from ..comm_utils.sys_utils import get_sys_runner_info, get_python_program -from .client_data_interface import FedMLClientDataInterface -from ..comm_utils import sys_utils -from ....core.mlops.mlops_utils import MLOpsUtils -from ..model_scheduler.model_device_client import FedMLModelDeviceClientRunner -from ..model_scheduler.model_device_server import FedMLModelDeviceServerRunner -from ..comm_utils import security_utils -from ..scheduler_core.compute_cache_manager import ComputeCacheManager -from ..scheduler_core.message_center import FedMLMessageCenter -import ssl - - -class RunnerError(Exception): - """ Runner stopped. """ - pass - - -class RunnerCompletedError(Exception): - """ Runner completed. 
""" - pass - - -class FedMLClientRunner(FedMLMessageCenter): - - def __init__(self, args, edge_id=0, request_json=None, agent_config=None, run_id=0, - cuda_visible_gpu_ids_str=None): - super().__init__() - self.model_device_server_id = None - self.model_device_client_edge_id_list = None - self.disable_client_login = False - self.model_device_server = None - self.model_device_client_list = None - self.run_process_event = None - self.run_process_event_map = dict() - self.run_process_completed_event = None - self.run_process_completed_event_map = dict() - self.run_process = None - self.run_process_map = dict() - self.running_request_json = dict() - self.local_api_process = None - self.start_request_json = None - self.device_status = None - self.current_training_status = None - self.mqtt_mgr = None - self.edge_id = edge_id - self.edge_user_name = None - self.edge_extra_url = None - self.run_id = run_id - self.unique_device_id = None - self.args = args - self.request_json = request_json - self.version = args.version - self.device_id = args.device_id - self.cur_dir = os.path.split(os.path.realpath(__file__))[0] - if args.current_running_dir is not None: - self.cur_dir = args.current_running_dir - self.sudo_cmd = "" - self.is_mac = False - if platform.system() == "Darwin": - self.is_mac = True - - self.agent_config = agent_config - self.fedml_data_base_package_dir = os.path.join("/", "fedml", "data") - self.fedml_data_local_package_dir = os.path.join("/", "fedml", "fedml-package", "fedml", "data") - self.fedml_data_dir = self.fedml_data_base_package_dir - self.fedml_config_dir = os.path.join("/", "fedml", "conf") - - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES = { - "${FEDSYS.RUN_ID}": "", - "${FEDSYS.PRIVATE_LOCAL_DATA}": "", - "${FEDSYS.CLIENT_ID_LIST}": "", - "${FEDSYS.SYNTHETIC_DATA_URL}": "", - "${FEDSYS.IS_USING_LOCAL_DATA}": "", - "${FEDSYS.CLIENT_NUM}": "", - "${FEDSYS.CLIENT_INDEX}": "", - "${FEDSYS.CLIENT_OBJECT_LIST}": "", - "${FEDSYS.LOG_SERVER_URL}": "", - } - - self.mlops_metrics = None - self.client_active_list = dict() - self.ntp_offset = MLOpsUtils.get_ntp_offset() - self.server_id = None - self.computing_started_time = 0 - self.fedml_config_object = None - self.package_type = SchedulerConstants.JOB_PACKAGE_TYPE_DEFAULT - self.cuda_visible_gpu_ids_str = cuda_visible_gpu_ids_str - # logging.info("Current directory of client agent: " + self.cur_dir) - self.subscribed_topics = list() - self.user_name = None - self.general_edge_id = None - self.message_center = None - - def __repr__(self): - return "<{klass} @{id:x} {attrs}>".format( - klass=self.__class__.__name__, - id=id(self) & 0xFFFFFF, - attrs=" ".join("{}={!r}".format(k, v) for k, v in self.__dict__.items()), - ) - - def copy_runner(self): - copy_runner = FedMLClientRunner(self.args) - copy_runner.disable_client_login = self.disable_client_login - copy_runner.model_device_server = self.model_device_server - copy_runner.model_device_client_list = self.model_device_client_list - copy_runner.run_process_event = self.run_process_event - copy_runner.run_process_event_map = self.run_process_event_map - copy_runner.run_process_completed_event = self.run_process_completed_event - copy_runner.run_process_completed_event_map = self.run_process_completed_event_map - copy_runner.run_process = self.run_process - copy_runner.run_process_map = self.run_process_map - copy_runner.running_request_json = self.running_request_json - copy_runner.local_api_process = self.local_api_process - copy_runner.start_request_json = self.start_request_json - 
copy_runner.device_status = self.device_status - copy_runner.current_training_status = self.current_training_status - copy_runner.mqtt_mgr = self.mqtt_mgr - copy_runner.edge_id = self.edge_id - copy_runner.edge_user_name = self.edge_user_name - copy_runner.edge_extra_url = self.edge_extra_url - copy_runner.run_id = self.run_id - copy_runner.unique_device_id = self.unique_device_id - copy_runner.args = self.args - copy_runner.request_json = self.request_json - copy_runner.version =self.version - copy_runner.device_id = self.device_id - copy_runner.cur_dir = self.cur_dir - copy_runner.cur_dir = self.cur_dir - copy_runner.sudo_cmd = self.sudo_cmd - copy_runner.is_mac = self.is_mac - - copy_runner.agent_config = self.agent_config - copy_runner.fedml_data_base_package_dir = self.fedml_data_base_package_dir - copy_runner.fedml_data_local_package_dir = self.fedml_data_local_package_dir - copy_runner.fedml_data_dir = self.fedml_data_dir - copy_runner.fedml_config_dir = self.fedml_config_dir - - copy_runner.FEDML_DYNAMIC_CONSTRAIN_VARIABLES = self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES - - copy_runner.mlops_metrics = self.mlops_metrics - copy_runner.client_active_list = self.client_active_list - copy_runner.ntp_offset = self.ntp_offset - copy_runner.server_id = self.server_id - copy_runner.computing_started_time = self.computing_started_time - copy_runner.fedml_config_object = self.fedml_config_object - copy_runner.package_type = self.package_type - copy_runner.cuda_visible_gpu_ids_str = self.cuda_visible_gpu_ids_str - copy_runner.subscribed_topics = self.subscribed_topics - copy_runner.user_name = self.user_name - copy_runner.general_edge_id = self.general_edge_id - copy_runner.message_center = self.message_center - - return copy_runner - - def build_dynamic_constrain_variables(self, run_id, run_config): - data_config = run_config.get("data_config", {}) - server_edge_id_list = self.request_json["edgeids"] - local_edge_id_list = list() - local_edge_id_list.append(int(self.edge_id)) - is_using_local_data = 0 - private_data_dir = data_config.get("privateLocalData", "") - synthetic_data_url = data_config.get("syntheticDataUrl", "") - edges = self.request_json["edges"] - # if private_data_dir is not None \ - # and len(str(private_data_dir).strip(' ')) > 0: - # is_using_local_data = 1 - if private_data_dir is None or len(str(private_data_dir).strip(" ")) <= 0: - params_config = run_config.get("parameters", None) - private_data_dir = ClientConstants.get_data_dir() - if synthetic_data_url is None or len(str(synthetic_data_url)) <= 0: - synthetic_data_url = private_data_dir - - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.RUN_ID}"] = run_id - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.PRIVATE_LOCAL_DATA}"] = private_data_dir.replace(" ", "") - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.CLIENT_ID_LIST}"] = str(local_edge_id_list).replace(" ", "") - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.SYNTHETIC_DATA_URL}"] = synthetic_data_url.replace(" ", "") - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.IS_USING_LOCAL_DATA}"] = str(is_using_local_data) - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.CLIENT_NUM}"] = len(server_edge_id_list) - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.CLIENT_INDEX}"] = 1 - for cur_index, id_value in enumerate(server_edge_id_list): - if str(id_value) == str(self.edge_id): - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.CLIENT_INDEX}"] = cur_index + 1 - break - client_objects = str(json.dumps(edges)) - client_objects = client_objects.replace(" ", 
"").replace("\n", "").replace('"', '\\"') - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.CLIENT_OBJECT_LIST}"] = client_objects - self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES["${FEDSYS.LOG_SERVER_URL}"] = self.agent_config["ml_ops_config"][ - "LOG_SERVER_URL" - ] - - def unzip_file(self, zip_file, unzip_file_path) -> str: - if zipfile.is_zipfile(zip_file): - with zipfile.ZipFile(zip_file, "r") as zipf: - zipf.extractall(unzip_file_path) - unzipped_file_name = zipf.namelist()[0] - else: - raise Exception("Invalid zip file {}".format(zip_file)) - - return unzipped_file_name - - def package_download_progress(self, count, blksize, filesize): - self.check_runner_stop_event() - - downloaded = count * blksize - downloaded = filesize if downloaded > filesize else downloaded - progress = (downloaded / filesize * 100) if filesize != 0 else 0 - progress_int = int(progress) - downloaded_kb = format(downloaded / 1024, '.2f') - - # since this hook funtion is stateless, we need a state to avoid print progress repeatly - if count == 0: - self.prev_download_progress = 0 - if progress_int != self.prev_download_progress and progress_int % 5 == 0: - self.prev_download_progress = progress_int - logging.info("package downloaded size {} KB, progress {}%".format(downloaded_kb, progress_int)) - - def retrieve_and_unzip_package(self, package_name, package_url): - local_package_path = ClientConstants.get_package_download_dir() - os.makedirs(local_package_path, exist_ok=True) - filename, filename_without_extension, file_extension = ClientConstants.get_filename_and_extension(package_url) - local_package_file = os.path.join(local_package_path, f"fedml_run_{self.run_id}_{filename_without_extension}") - if os.path.exists(local_package_file): - os.remove(local_package_file) - ssl._create_default_https_context = ssl._create_unverified_context - urllib.request.urlretrieve(package_url, local_package_file, - reporthook=self.package_download_progress) - unzip_package_path = os.path.join(ClientConstants.get_package_unzip_dir(), - f"unzip_fedml_run_{self.run_id}_{filename_without_extension}") - try: - shutil.rmtree(unzip_package_path, ignore_errors=True) - except Exception as e: - logging.error( - f"Failed to remove directory {unzip_package_path}, Exception: {e}, Traceback: {traceback.format_exc()}") - pass - - package_dir_name = self.unzip_file(local_package_file, unzip_package_path) # Using unziped folder name - unzip_package_full_path = os.path.join(unzip_package_path, package_dir_name) - - logging.info("local_package_file {}, unzip_package_path {}, unzip file full path {}".format( - local_package_file, unzip_package_path, unzip_package_full_path)) - - return unzip_package_full_path - - def update_local_fedml_config(self, run_id, run_config): - packages_config = run_config["packages_config"] - - # Copy config file from the client - unzip_package_path = self.retrieve_and_unzip_package( - packages_config["linuxClient"], packages_config["linuxClientUrl"] - ) - fedml_local_config_file = os.path.join(unzip_package_path, "conf", "fedml.yaml") - - # Load the above config to memory - config_from_container = load_yaml_config(fedml_local_config_file) - container_entry_file_config = config_from_container["entry_config"] - container_dynamic_args_config = config_from_container["dynamic_args"] - entry_file = container_entry_file_config["entry_file"] - conf_file = container_entry_file_config["conf_file"] - self.package_type = container_entry_file_config.get("package_type", SchedulerConstants.JOB_PACKAGE_TYPE_DEFAULT) - full_conf_path = 
os.path.join(unzip_package_path, "fedml", "config", os.path.basename(conf_file)) - - # Dynamically build constrain variable with realtime parameters from server - self.build_dynamic_constrain_variables(run_id, run_config) - - # Update entry arguments value with constrain variable values with realtime parameters from server - # currently we support the following constrain variables: - # ${FEDSYS_RUN_ID}: a run id represented one entire Federated Learning flow - # ${FEDSYS_PRIVATE_LOCAL_DATA}: private local data path in the Federated Learning client - # ${FEDSYS_CLIENT_ID_LIST}: client list in one entire Federated Learning flow - # ${FEDSYS_SYNTHETIC_DATA_URL}: synthetic data url from server, - # if this value is not null, the client will download data from this URL to use it as - # federated training data set - # ${FEDSYS_IS_USING_LOCAL_DATA}: whether use private local data as federated training data set - # container_dynamic_args_config["data_cache_dir"] = "${FEDSYS.PRIVATE_LOCAL_DATA}" - for constrain_variable_key, constrain_variable_value in self.FEDML_DYNAMIC_CONSTRAIN_VARIABLES.items(): - for argument_key, argument_value in container_dynamic_args_config.items(): - if argument_value is not None and str(argument_value).find(constrain_variable_key) == 0: - replaced_argument_value = str(argument_value).replace( - constrain_variable_key, str(constrain_variable_value) - ) - container_dynamic_args_config[argument_key] = replaced_argument_value - - # Merge all container new config sections as new config dictionary - package_conf_object = dict() - package_conf_object["entry_config"] = container_entry_file_config - package_conf_object["dynamic_args"] = container_dynamic_args_config - package_conf_object["dynamic_args"]["config_version"] = self.args.config_version - container_dynamic_args_config["mqtt_config_path"] = os.path.join( - unzip_package_path, "fedml", "config", os.path.basename(container_dynamic_args_config["mqtt_config_path"]) - ) - container_dynamic_args_config["s3_config_path"] = os.path.join( - unzip_package_path, "fedml", "config", os.path.basename(container_dynamic_args_config["s3_config_path"]) - ) - log_file_dir = ClientConstants.get_log_file_dir() - os.makedirs(log_file_dir, exist_ok=True) - package_conf_object["dynamic_args"]["log_file_dir"] = log_file_dir - - # Save new config dictionary to local file - fedml_updated_config_file = os.path.join(unzip_package_path, "conf", "fedml.yaml") - ClientConstants.generate_yaml_doc(package_conf_object, fedml_updated_config_file) - - # Build dynamic arguments and set arguments to fedml config object - self.build_dynamic_args(run_id, run_config, package_conf_object, unzip_package_path) - return unzip_package_path, package_conf_object - - def build_dynamic_args(self, run_id, run_config, package_conf_object, base_dir): - fedml_conf_file = package_conf_object["entry_config"]["conf_file"] - fedml_conf_file_processed = str(fedml_conf_file).replace('\\', os.sep).replace('/', os.sep) - fedml_conf_path = os.path.join(base_dir, "fedml", "config", - os.path.basename(fedml_conf_file_processed)) - fedml_conf_object = load_yaml_config(fedml_conf_path) - run_params = run_config.get("parameters", {}) - job_yaml = run_params.get("job_yaml", {}) - - # Replace local fedml config objects with parameters from MLOps web - parameters_object = run_config.get("parameters", None) - if parameters_object is not None: - for config_k, config_v in fedml_conf_object.items(): - parameter_v = parameters_object.get(config_k, None) - if parameter_v is not None: - 
fedml_conf_object[config_k] = parameter_v - parameters_object.pop(config_k) - - for config_k, config_v in parameters_object.items(): - fedml_conf_object[config_k] = config_v - - package_dynamic_args = package_conf_object["dynamic_args"] - if fedml_conf_object.get("comm_args", None) is not None: - fedml_conf_object["comm_args"]["mqtt_config_path"] = package_dynamic_args["mqtt_config_path"] - fedml_conf_object["comm_args"]["s3_config_path"] = package_dynamic_args["s3_config_path"] - fedml_conf_object["common_args"]["using_mlops"] = True - if fedml_conf_object.get("train_args", None) is not None: - fedml_conf_object["train_args"]["run_id"] = package_dynamic_args["run_id"] - fedml_conf_object["train_args"]["client_id_list"] = package_dynamic_args["client_id_list"] - fedml_conf_object["train_args"]["client_num_in_total"] = int(package_dynamic_args["client_num_in_total"]) - fedml_conf_object["train_args"]["client_num_per_round"] = int(package_dynamic_args["client_num_in_total"]) - fedml_conf_object["train_args"]["client_id"] = self.edge_id - fedml_conf_object["train_args"]["server_id"] = self.request_json.get("server_id", "0") - if fedml_conf_object.get("device_args", None) is not None: - fedml_conf_object["device_args"]["worker_num"] = int(package_dynamic_args["client_num_in_total"]) - # fedml_conf_object["data_args"]["data_cache_dir"] = package_dynamic_args["data_cache_dir"] - data_args = fedml_conf_object.get("data_args") - if data_args is not None: - data_cache_dir = fedml_conf_object["data_args"].get("data_cache_dir") - if data_cache_dir is not None: - data_cache_dir = os.path.join(data_cache_dir, str(self.edge_id)) - fedml_conf_object["data_args"]["data_cache_dir"] = data_cache_dir - if fedml_conf_object.get("tracking_args", None) is not None: - fedml_conf_object["tracking_args"]["log_file_dir"] = package_dynamic_args["log_file_dir"] - fedml_conf_object["tracking_args"]["log_server_url"] = package_dynamic_args["log_server_url"] - - fedml_conf_object["dynamic_args"] = package_dynamic_args - self.fedml_config_object = fedml_conf_object.copy() - ClientConstants.generate_yaml_doc(fedml_conf_object, fedml_conf_path) - - def run_bootstrap_script(self, bootstrap_cmd_list, bootstrap_script_file): - try: - logging.info("Bootstrap commands are being executed...") - process, error_list = ClientConstants.execute_commands_with_live_logs(bootstrap_cmd_list, - callback=self.callback_run_bootstrap) - - ret_code, out, err = process.returncode, None, None - if ret_code is None or ret_code <= 0: - if error_list is not None and len(error_list) > 0: - is_bootstrap_run_ok = False - else: - if out is not None: - out_str = sys_utils.decode_our_err_result(out) - if out_str != "": - logging.info("{}".format(out_str)) - - sys_utils.log_return_info(bootstrap_script_file, 0) - - is_bootstrap_run_ok = True - else: - if err is not None: - err_str = sys_utils.decode_our_err_result(err) - if err_str != "": - logging.error("{}".format(err_str)) - - sys_utils.log_return_info(bootstrap_script_file, ret_code) - - is_bootstrap_run_ok = False - except Exception as e: - logging.error(f"Bootstrap script error: Exception: {e}, Traceback: {traceback.format_exc()}") - is_bootstrap_run_ok = False - return is_bootstrap_run_ok - - def callback_run_bootstrap(self, job_pid): - ClientConstants.save_bootstrap_process(self.run_id, job_pid) - - def run(self, process_event, completed_event, message_center_queue): - print(f"Client runner process id {os.getpid()}, run id {self.run_id}") - - if platform.system() != "Windows": - os.setsid() - - 
os.environ['PYTHONWARNINGS'] = 'ignore:semaphore_tracker:UserWarning' - os.environ.setdefault('PYTHONWARNINGS', 'ignore:semaphore_tracker:UserWarning') - - self.run_process_event = process_event - self.run_process_completed_event = completed_event - try: - MLOpsUtils.set_ntp_offset(self.ntp_offset) - self.rebuild_message_center(message_center_queue) - self.run_impl() - except RunnerError: - logging.info("Runner stopped.") - self.reset_devices_status(self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED) - except RunnerCompletedError: - logging.info("Runner completed.") - except Exception as e: - logging.error(f"Runner exited with errors. Exception: {e}, Traceback {traceback.format_exc()}") - self.mlops_metrics.report_client_id_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED, - server_id=self.server_id, run_id=self.run_id) - finally: - if self.mlops_metrics is not None: - computing_ended_time = MLOpsUtils.get_ntp_time() - self.mlops_metrics.report_edge_job_computing_cost(self.run_id, self.edge_id, - self.computing_started_time, computing_ended_time, - self.args.user, self.args.api_key) - logging.info("Release resources.") - self.cleanup_containers_and_release_gpus(self.run_id, self.edge_id) - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(self.run_id, self.edge_id) - if self.mlops_metrics is not None: - self.mlops_metrics.stop_sys_perf() - time.sleep(3) - ClientConstants.cleanup_learning_process(self.run_id) - ClientConstants.cleanup_run_process(self.run_id) - - def check_runner_stop_event(self): - if self.run_process_event.is_set(): - logging.info("Received stopping event.") - raise RunnerError("Runner stopped") - - if self.run_process_completed_event.is_set(): - logging.info("Received completed event.") - raise RunnerCompletedError("Runner completed") - - def run_impl(self): - run_id = self.request_json["runId"] - run_config = self.request_json["run_config"] - data_config = run_config.get("data_config", {}) - packages_config = run_config["packages_config"] - - self.computing_started_time = MLOpsUtils.get_ntp_time() - self.mlops_metrics.report_edge_job_computing_cost(run_id, self.edge_id, - self.computing_started_time, 0, - self.args.user, self.args.api_key) - - self.check_runner_stop_event() - - MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) - - self.mlops_metrics.report_client_id_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_INITIALIZING, - running_json=self.start_request_json, run_id=run_id) - - # get training params - private_local_data_dir = data_config.get("privateLocalData", "") - is_using_local_data = 0 - # if private_local_data_dir is not None and len(str(private_local_data_dir).strip(' ')) > 0: - # is_using_local_data = 1 - - # start a run according to the hyper-parameters - # fedml_local_data_dir = self.cur_dir + "/fedml_data/run_" + run_id_str + "_edge_" + str(edge_id) - fedml_local_data_dir = os.path.join(self.cur_dir, "fedml_data") - fedml_local_config_dir = os.path.join(self.cur_dir, "fedml_config") - if is_using_local_data: - fedml_local_data_dir = private_local_data_dir - self.fedml_data_dir = self.fedml_data_local_package_dir - - self.check_runner_stop_event() - - logging.info("Download packages") - - # update local config with real time parameters from server and dynamically replace variables value - unzip_package_path, fedml_config_object = self.update_local_fedml_config(run_id, run_config) - # if unzip_package_path is None or fedml_config_object is None: - # 
logging.info("failed to update local fedml config.") - # self.check_runner_stop_event() - # # Send failed msg when exceptions. - # self.cleanup_run_when_starting_failed(status=ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION) - # return - - logging.info("Check downloaded packages...") - - entry_file_config = fedml_config_object["entry_config"] - dynamic_args_config = fedml_config_object["dynamic_args"] - entry_file = str(entry_file_config["entry_file"]).replace('\\', os.sep).replace('/', os.sep) - entry_file = os.path.basename(entry_file) - conf_file = entry_file_config["conf_file"] - conf_file = str(conf_file).replace('\\', os.sep).replace('/', os.sep) - ##### - # ClientConstants.cleanup_learning_process(run_id) - # ClientConstants.cleanup_bootstrap_process(run_id) - ##### - - if not os.path.exists(unzip_package_path): - logging.info("failed to unzip file.") - self.check_runner_stop_event() - # Send failed msg when exceptions. - self.cleanup_run_when_starting_failed(status=ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION) - return - os.chdir(os.path.join(unzip_package_path, "fedml")) - - self.check_runner_stop_event() - - logging.info("starting the user process...") - - entry_file_full_path = os.path.join(unzip_package_path, "fedml", entry_file) - conf_file_full_path = os.path.join(unzip_package_path, "fedml", conf_file) - logging.info("waiting the user process to finish...") - logging.info(" ") - logging.info(" ") - logging.info("====Your Run Logs Begin===") - - process, is_launch_task, error_list = self.execute_job_task(unzip_package_path=unzip_package_path, - entry_file_full_path=entry_file_full_path, - conf_file_full_path=conf_file_full_path, - dynamic_args_config=dynamic_args_config, - fedml_config_object=self.fedml_config_object) - - logging.info("====Your Run Logs End===") - logging.info(" ") - logging.info(" ") - - ret_code, out, err = process.returncode if process else None, None, None - is_run_ok = sys_utils.is_runner_finished_normally(process.pid) - if is_launch_task: - is_run_ok = True - if error_list is not None and len(error_list) > 0: - is_run_ok = False - if ret_code is None or ret_code <= 0: - self.check_runner_stop_event() - - if is_run_ok: - if out is not None: - out_str = sys_utils.decode_our_err_result(out) - if out_str != "": - logging.info("{}".format(out_str)) - - self.mlops_metrics.report_client_id_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED, - server_id=self.server_id, run_id=run_id) - - if is_launch_task: - sys_utils.log_return_info(f"job {run_id}", ret_code) - else: - sys_utils.log_return_info(entry_file, ret_code) - else: - is_run_ok = False - - if not is_run_ok: - # If the run status is killed or finished, then return with the normal state. - current_job = FedMLClientDataInterface.get_instance().get_job_by_id(run_id) - if current_job is not None and (current_job.status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED or - current_job.status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED): - return - - self.check_runner_stop_event() - - logging.error("failed to run the learning process...") - - if err is not None: - err_str = sys_utils.decode_our_err_result(err) - if err_str != "": - logging.error("{}".format(err_str)) - - if is_launch_task: - sys_utils.log_return_info(f"job {run_id}", ret_code) - else: - sys_utils.log_return_info(entry_file, ret_code) - - # Send failed msg when exceptions. 
- self.mlops_metrics.report_client_id_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED, - server_id=self.server_id, run_id=run_id) - - def execute_job_task(self, unzip_package_path, entry_file_full_path, conf_file_full_path, dynamic_args_config, - fedml_config_object): - run_config = self.request_json["run_config"] - run_params = run_config.get("parameters", {}) - client_rank = self.request_json.get("client_rank", 1) - job_yaml = run_params.get("job_yaml", {}) - job_yaml_default_none = run_params.get("job_yaml", None) - job_api_key = job_yaml.get("run_api_key", None) - job_api_key = job_yaml.get("fedml_run_dynamic_params", None) if job_api_key is None else job_api_key - assigned_gpu_ids = run_params.get("gpu_ids", None) - job_type = job_yaml.get("job_type", None) - containerize = fedml_config_object.get("containerize", None) - image_pull_policy = fedml_config_object.get("image_pull_policy", Constants.IMAGE_PULL_POLICY_ALWAYS) - # TODO: Can we remove task_type? - job_type = job_yaml.get("task_type", Constants.JOB_TASK_TYPE_TRAIN) if job_type is None else job_type - conf_file_object = load_yaml_config(conf_file_full_path) - entry_args_dict = conf_file_object.get("fedml_entry_args", {}) - entry_args = entry_args_dict.get("arg_items", None) - scheduler_match_info = self.request_json.get("scheduler_match_info", {}) - if job_type == Constants.JOB_TASK_TYPE_TRAIN: - containerize = True if containerize is None else containerize - - # Bootstrap Info - bootstrap_script_path, bootstrap_script_dir, bootstrap_script_file = [None] * 3 - env_args = fedml_config_object.get("environment_args", None) - - if env_args is not None: - bootstrap_script_file = env_args.get("bootstrap", None) - if bootstrap_script_file is not None: - bootstrap_script_file = str(bootstrap_script_file).replace('\\', os.sep).replace('/', os.sep) - if platform.system() == 'Windows': - bootstrap_script_file = bootstrap_script_file.rstrip('.sh') + '.bat' - if bootstrap_script_file is not None: - bootstrap_script_dir = os.path.join(unzip_package_path, "fedml", - os.path.dirname(bootstrap_script_file)) - bootstrap_script_path = os.path.join( - bootstrap_script_dir, bootstrap_script_dir, os.path.basename(bootstrap_script_file) - ) - - bootstrap_cmd_list = list() - if bootstrap_script_path: - logging.info("Bootstrap commands are being generated...") - bootstrap_cmd_list = JobRunnerUtils.generate_bootstrap_commands(bootstrap_script_path=bootstrap_script_path, - bootstrap_script_dir=bootstrap_script_dir, - bootstrap_script_file=bootstrap_script_file) - logging.info(f"Generated following Bootstrap commands: {bootstrap_cmd_list}") - - if not containerize: - if len(bootstrap_cmd_list) and not (job_type == Constants.JOB_TASK_TYPE_DEPLOY or - job_type == Constants.JOB_TASK_TYPE_SERVE): - bootstrapping_successful = self.run_bootstrap_script(bootstrap_cmd_list=bootstrap_cmd_list, - bootstrap_script_file=bootstrap_script_file) - - if not bootstrapping_successful: - logging.info("failed to update local fedml config.") - self.check_runner_stop_event() - # Send failed msg when exceptions. 
- self.cleanup_run_when_starting_failed(status=ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION) - raise Exception(f"Failed to execute following bootstrap commands: {bootstrap_cmd_list}") - - logging.info("cleanup the previous learning process and bootstrap process...") - ClientConstants.cleanup_learning_process(self.request_json["runId"]) - ClientConstants.cleanup_bootstrap_process(self.request_json["runId"]) - - executable_interpreter = ClientConstants.CLIENT_SHELL_PS \ - if platform.system() == ClientConstants.PLATFORM_WINDOWS else ClientConstants.CLIENT_SHELL_BASH - - if job_yaml_default_none is None: - # Generate the job executing commands for previous federated learning (Compatibility) - python_program = get_python_program() - logging.info("Run the client: {} {} --cf {} --rank {} --role client".format( - python_program, entry_file_full_path, conf_file_full_path, str(dynamic_args_config.get("rank", 1)))) - rank = str(dynamic_args_config.get("rank", 1)) - entry_command = f"{python_program} {entry_file_full_path} --cf " \ - f"{conf_file_full_path} --rank {rank} --role client" - shell_cmd_list = [entry_command] - - # Run the job executing commands for previous federated learning (Compatibility) - process, error_list = ClientConstants.execute_commands_with_live_logs( - shell_cmd_list, callback=self.callback_start_fl_job, should_write_log_file=False) - is_launch_task = False - else: - self.check_runner_stop_event() - - self.mlops_metrics.report_client_id_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_RUNNING, run_id=self.run_id) - - # Generate the job executing commands - job_executing_commands = JobRunnerUtils.generate_job_execute_commands( - self.run_id, self.edge_id, self.version, - self.package_type, executable_interpreter, entry_file_full_path, - conf_file_object, entry_args, assigned_gpu_ids, - job_api_key, client_rank, scheduler_match_info=scheduler_match_info, - cuda_visible_gpu_ids_str=self.cuda_visible_gpu_ids_str) - - if containerize is not None and containerize is True: - docker_args = fedml_config_object.get("docker", {}) - docker_args = JobRunnerUtils.create_instance_from_dict(DockerArgs, docker_args) - try: - job_executing_commands = JobRunnerUtils.generate_launch_docker_command(docker_args=docker_args, - run_id=self.run_id, - edge_id=self.edge_id, - unzip_package_path=unzip_package_path, - executable_interpreter=executable_interpreter, - entry_file_full_path=entry_file_full_path, - bootstrap_cmd_list=bootstrap_cmd_list, - cuda_visible_gpu_ids_str=self.cuda_visible_gpu_ids_str, - image_pull_policy=image_pull_policy) - except Exception as e: - logging.error(f"Error occurred while generating containerized launch commands. 
" - f"Exception: {e}, Traceback: {traceback.format_exc()}") - return None, None, None - - if not job_executing_commands: - raise Exception("Failed to generate docker execution command") - - # Run the job executing commands - logging.info(f"Run the client job with job id {self.run_id}, device id {self.edge_id}.") - process, error_list = ClientConstants.execute_commands_with_live_logs( - job_executing_commands, callback=self.start_job_perf, error_processor=self.job_error_processor, - should_write_log_file=False if job_type == Constants.JOB_TASK_TYPE_FEDERATE else True) - is_launch_task = False if job_type == Constants.JOB_TASK_TYPE_FEDERATE else True - - return process, is_launch_task, error_list - - def callback_start_fl_job(self, job_pid): - ClientConstants.save_learning_process(self.run_id, job_pid) - self.mlops_metrics.report_sys_perf( - self.args, self.agent_config["mqtt_config"], job_process_id=job_pid) - - def start_job_perf(self, job_pid): - ClientConstants.save_learning_process(self.run_id, job_pid) - self.mlops_metrics.report_job_perf(self.args, self.agent_config["mqtt_config"], job_pid) - - def job_error_processor(self, error_list): - self.check_runner_stop_event() - - error_str = "\n".join(error_list) - error_message = f"Error occurred when running the job... {error_str}" - logging.error(error_message) - raise Exception(error_message) - - def reset_devices_status(self, edge_id, status, should_send_client_id_status=True): - self.mlops_metrics.run_id = self.run_id - self.mlops_metrics.edge_id = edge_id - - if should_send_client_id_status: - if status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED or \ - status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED or \ - status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION: - self.mlops_metrics.report_client_id_status( - edge_id, status, server_id=self.server_id, run_id=self.run_id) - - def sync_run_stop_status(self, run_status=ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED): - try: - if self.run_process_event is not None: - self.run_process_event.set() - - self.mlops_metrics.report_client_id_status( - self.edge_id, run_status, server_id=self.server_id, run_id=self.run_id) - except Exception as e: - logging.error(f"Failed to sync run stop status with Exception {e}. Traceback: {traceback.format_exc()}") - pass - - def cleanup_run_when_starting_failed( - self, status=ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED, should_send_client_id_status=True): - # logging.error("Cleanup run successfully when starting failed.") - - self.reset_devices_status( - self.edge_id, status, should_send_client_id_status=should_send_client_id_status) - - time.sleep(2) - - try: - self.mlops_metrics.stop_sys_perf() - except Exception as ex: - logging.error(f"Failed to stop sys perf with Exception {ex}. Traceback: {traceback.format_exc()}") - pass - - time.sleep(1) - - try: - ClientConstants.cleanup_learning_process(self.run_id) - ClientConstants.cleanup_bootstrap_process(self.run_id) - ClientConstants.cleanup_run_process(self.run_id) - except Exception as e: - logging.error( - f"Failed to cleanup run when starting failed with Exception {e}. 
Traceback: {traceback.format_exc()}") - pass - - def cleanup_run_when_finished(self): - # logging.info("Cleanup run successfully when finished.") - - self.reset_devices_status(self.edge_id, - ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED, - should_send_client_id_status=False) - - time.sleep(2) - - try: - self.mlops_metrics.stop_sys_perf() - except Exception as ex: - logging.error(f"Failed to stop sys perf with Exception {ex}. Traceback: {traceback.format_exc()}") - pass - - time.sleep(1) - - try: - ClientConstants.cleanup_learning_process(self.run_id) - ClientConstants.cleanup_bootstrap_process(self.run_id) - ClientConstants.cleanup_run_process(self.run_id) - except Exception as e: - logging.error( - f"Failed to cleanup run when finished with Exception {e}. Traceback: {traceback.format_exc()}") - pass - - def setup_message_center(self): - if self.message_center is not None: - return - - self.message_center = FedMLMessageCenter(agent_config=self.agent_config) - self.message_center.start_sender() - - if self.mlops_metrics is None: - self.mlops_metrics = MLOpsMetrics() - self.mlops_metrics.set_messenger(self.message_center) - self.mlops_metrics.run_id = self.run_id - - def rebuild_message_center(self, message_center_queue): - self.message_center = FedMLMessageCenter(message_queue=message_center_queue) - - if self.mlops_metrics is None: - self.mlops_metrics = MLOpsMetrics() - self.mlops_metrics.set_messenger(self.message_center) - self.mlops_metrics.run_id = self.run_id - - def release_message_center(self): - try: - if self.message_center is not None: - self.message_center.stop() - self.message_center = None - - except Exception as e: - logging.error( - f"Failed to release client mqtt manager with Exception {e}. Traceback: {traceback.format_exc()}") - pass - - def ota_upgrade(self, payload, request_json): - run_id = request_json["runId"] - force_ota = False - ota_version = None - - try: - run_config = request_json.get("run_config", None) - parameters = run_config.get("parameters", None) - common_args = parameters.get("common_args", None) - force_ota = common_args.get("force_ota", False) if common_args is not None else False - ota_version = common_args.get("ota_version", None) if common_args is not None else None - except Exception as e: - logging.error( - f"Failed to get ota upgrade parameters with Exception {e}. Traceback: {traceback.format_exc()}") - pass - - if force_ota and ota_version is not None: - should_upgrade = True if ota_version != fedml.__version__ else False - upgrade_version = ota_version - else: - try: - fedml_is_latest_version, local_ver, remote_ver = sys_utils.check_fedml_is_latest_version(self.version) - except Exception as e: - logging.error(f"Failed to check fedml version with Exception {e}. Traceback: {traceback.format_exc()}") - return - - should_upgrade = False if fedml_is_latest_version else True - upgrade_version = remote_ver - - if should_upgrade: - FedMLClientDataInterface.get_instance(). 
\ - save_started_job(run_id, self.edge_id, time.time(), - ClientConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING, - ClientConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING, - payload) - self.mlops_metrics.report_client_id_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING, run_id=run_id) - - logging.info(f"Upgrade to version {upgrade_version} ...") - - sys_utils.do_upgrade(self.version, upgrade_version) - raise Exception("Restarting after upgraded...") - - def callback_start_train(self, topic, payload): - # Get training params - - request_json = json.loads(payload) - is_retain = request_json.get("is_retain", False) - if is_retain: - return - run_id = request_json["runId"] - - # Start log processor for current run - train_edge_id = str(topic).split("/")[-2] - self.args.run_id = run_id - self.args.edge_id = train_edge_id - MLOpsRuntimeLog.get_instance(self.args).init_logs(log_level=logging.INFO) - MLOpsRuntimeLogDaemon.get_instance(self.args).start_log_processor( - run_id, train_edge_id, log_source=SchedulerConstants.get_log_source(request_json)) - logging.info("start the log processor") - - try: - MLOpsConfigs.fetch_all_configs() - except Exception as e: - logging.error(f"Failed to fetch all configs with Exception {e}. Traceback: {traceback.format_exc()}") - pass - - if not FedMLClientDataInterface.get_instance().get_agent_status(): - request_json = json.loads(payload) - run_id = request_json["runId"] - logging.error( - "FedMLDebug - Receive: topic ({}), payload ({}), but the client agent is disabled. {}".format( - topic, payload, traceback.format_exc() - ) - ) - # Send failed msg when exceptions. - self.mlops_metrics.report_client_id_status( - train_edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION, run_id=run_id, - msg=f"the client agent {train_edge_id} is disabled") - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, train_edge_id) - return - - logging.info( - f"FedMLDebug - Receive: topic ({topic}), payload ({payload})" - ) - - # Terminate previous process about starting or stopping run command - logging.info("cleanup and save runner information") - server_agent_id = request_json["cloud_agent_id"] - ClientConstants.save_runner_infos(self.args.device_id + "." 
+ self.args.os_name, train_edge_id, run_id=run_id) - - # OTA upgrade - # self.ota_upgrade(payload, request_json) - - # Occupy GPUs - scheduler_match_info = request_json.get("scheduler_match_info", {}) - matched_gpu_num = scheduler_match_info.get("matched_gpu_num", 0) - model_master_device_id = scheduler_match_info.get("model_master_device_id", None) - model_slave_device_id = scheduler_match_info.get("model_slave_device_id", None) - model_slave_device_id_list = scheduler_match_info.get("model_slave_device_id_list", None) - run_config = request_json.get("run_config", {}) - run_params = run_config.get("parameters", {}) - serving_args = run_params.get("serving_args", {}) - endpoint_id = serving_args.get("endpoint_id", None) - job_yaml = run_params.get("job_yaml", {}) - job_type = job_yaml.get("job_type", SchedulerConstants.JOB_TASK_TYPE_TRAIN) - cuda_visible_gpu_ids_str = None - if not (job_type == SchedulerConstants.JOB_TASK_TYPE_SERVE or - job_type == SchedulerConstants.JOB_TASK_TYPE_DEPLOY): - cuda_visible_gpu_ids_str = JobRunnerUtils.get_instance().occupy_gpu_ids( - run_id, matched_gpu_num, train_edge_id, inner_id=endpoint_id, - model_master_device_id=model_master_device_id, - model_slave_device_id=model_slave_device_id) - logging.info( - f"Run started, available gpu ids: {JobRunnerUtils.get_instance().get_available_gpu_id_list(train_edge_id)}") - - # Start server with multiprocessing mode - self.request_json = request_json - run_id_str = str(run_id) - self.running_request_json[run_id_str] = request_json - client_runner = FedMLClientRunner( - self.args, edge_id=train_edge_id, request_json=request_json, agent_config=self.agent_config, run_id=run_id, - cuda_visible_gpu_ids_str=cuda_visible_gpu_ids_str - ) - client_runner.start_request_json = payload - self.run_process_event_map[run_id_str] = multiprocessing.Event() - self.run_process_event_map[run_id_str].clear() - client_runner.run_process_event = self.run_process_event_map[run_id_str] - self.run_process_completed_event_map[run_id_str] = multiprocessing.Event() - self.run_process_completed_event_map[run_id_str].clear() - client_runner.run_process_completed_event = self.run_process_completed_event_map[run_id_str] - client_runner.server_id = request_json.get("server_id", "0") - logging.info("start the runner process.") - self.run_process_map[run_id_str] = Process(target=client_runner.run, args=( - self.run_process_event_map[run_id_str], self.run_process_completed_event_map[run_id_str], - self.message_center.get_message_queue())) - self.run_process_map[run_id_str].start() - ClientConstants.save_run_process(run_id, self.run_process_map[run_id_str].pid) - - def callback_stop_train(self, topic, payload): - # logging.info("callback_stop_train: topic = %s, payload = %s" % (topic, payload)) - # logging.info( - # f"FedMLDebug - Receive: topic ({topic}), payload ({payload})" - # ) - - train_edge_id = str(topic).split("/")[-2] - request_json = json.loads(payload) - is_retain = request_json.get("is_retain", False) - if is_retain: - return - run_id = request_json.get("runId", None) - if run_id is None: - run_id = request_json.get("id", None) - run_status = request_json.get("run_status", ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED) - - # logging.info("Stop run with multiprocessing...") - - # Stop client with multiprocessing mode - run_id_str = str(run_id) - client_runner = FedMLClientRunner( - self.args, edge_id=train_edge_id, request_json=request_json, agent_config=self.agent_config, run_id=run_id - ) - self.cleanup_containers_and_release_gpus(run_id, 
train_edge_id) - client_runner.run_process_event = self.run_process_event_map.get(run_id_str, None) - client_runner.run_process = self.run_process_map.get(run_id_str, None) - client_runner.message_center = self.message_center - client_runner.mlops_metrics = self.mlops_metrics - client_runner.sync_run_stop_status(run_status=run_status) - - def cleanup_containers_and_release_gpus(self, run_id, edge_id): - job_type = JobRunnerUtils.get_job_type_from_run_id(run_id) - - if not job_type: - logging.info(f"Failed to get job type from run id {run_id}. This is not an error as it would usually " - f"happen when the job is not found in the database because job is already finished and " - f"cleaned up. Exiting cleanup_containers_and_release_gpus.") - return - - # Check if the job type is not "serve" or "deploy" - if not (job_type == SchedulerConstants.JOB_TASK_TYPE_SERVE or - job_type == SchedulerConstants.JOB_TASK_TYPE_DEPLOY): - - # Terminate the run docker container if exists - container_name = JobRunnerUtils.get_run_container_name(run_id) - docker_client = JobRunnerUtils.get_docker_client(DockerArgs()) - logging.info(f"Terminating the run docker container {container_name} if exists...") - try: - JobRunnerUtils.remove_run_container_if_exists(container_name, docker_client) - except Exception as e: - logging.error(f"Exception {e} occurred when terminating docker container. " - f"Traceback: {traceback.format_exc()}") - - # Release the GPU ids and update the GPU availability in the persistent store - JobRunnerUtils.get_instance().release_gpu_ids(run_id, edge_id) - - # Send mqtt message reporting the new gpu availability to the backend - MLOpsDevicePerfStats.report_gpu_device_info(self.edge_id, mqtt_mgr=self.mqtt_mgr) - - def cleanup_client_with_status(self): - if self.device_status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED: - # logging.info("received to finished status.") - self.cleanup_run_when_finished() - elif self.device_status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED: - # logging.error("received to failed status from the server agent") - self.cleanup_run_when_starting_failed(should_send_client_id_status=False) - elif self.device_status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED: - # logging.error("received to failed status from the server agent") - self.cleanup_run_when_starting_failed(status=self.device_status, should_send_client_id_status=False) - - def callback_runner_id_status(self, topic, payload): - # logging.info("callback_runner_id_status: topic = %s, payload = %s" % (topic, payload)) - # logging.info(f"FedMLDebug - Receive: topic ({topic}), payload ({payload})") - request_json = json.loads(payload) - is_retain = request_json.get("is_retain", False) - if is_retain: - return - run_id = request_json["run_id"] - edge_id = str(topic).split("/")[-2].split('_')[-1] - status = request_json["status"] - run_id_str = str(run_id) - - self.save_training_status( - edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED - if status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_EXCEPTION else status) - - if status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FINISHED or \ - status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_FAILED or \ - status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_KILLED: - completed_event = self.run_process_completed_event_map.get(run_id_str, None) - if completed_event is not None: - completed_event.set() - - # Stop client with multiprocessing mode - client_runner = FedMLClientRunner( - self.args, - edge_id=edge_id, - request_json=request_json, - 
agent_config=self.agent_config, - run_id=run_id, - ) - client_runner.device_status = status - client_runner.message_center = self.message_center - client_runner.mlops_metrics = self.mlops_metrics - client_runner.cleanup_client_with_status() - - running_json = self.running_request_json.get(run_id_str) - if running_json is None: - try: - current_job = FedMLClientDataInterface.get_instance().get_job_by_id(run_id) - running_json = json.loads(current_job.running_json) - except Exception as e: - logging.error(f"Failed to get running json with Exception {e}. Traceback: {traceback.format_exc()}") - - if running_json is not None: - job_type = JobRunnerUtils.parse_job_type(running_json) - if not SchedulerConstants.is_deploy_job(job_type): - logging.info(f"[run/device][{run_id}/{edge_id}] Release gpu resource when run ended.") - self.cleanup_containers_and_release_gpus(run_id, edge_id) - - run_process = self.run_process_map.get(run_id_str, None) - if run_process is not None: - if run_process.pid is not None: - RunProcessUtils.kill_process(run_process.pid) - - # Terminate the run docker container if exists - try: - container_name = JobRunnerUtils.get_run_container_name(run_id) - docker_client = JobRunnerUtils.get_docker_client(DockerArgs()) - logging.info(f"Terminating the run docker container {container_name} if exists...") - JobRunnerUtils.remove_run_container_if_exists(container_name, docker_client) - except Exception as e: - logging.error(f"Error occurred when terminating docker container." - f"Exception: {e}, Traceback: {traceback.format_exc()}.") - - self.run_process_map.pop(run_id_str) - - # Stop log processor for current run - MLOpsRuntimeLogDaemon.get_instance(self.args).stop_log_processor(run_id, edge_id) - - def callback_report_current_status(self, topic, payload): - logging.info( - f"FedMLDebug - Receive: topic ({topic}), payload ({payload})" - ) - - self.send_agent_active_msg() - if self.general_edge_id is not None: - self.send_agent_active_msg(self.general_edge_id) - - @staticmethod - def process_ota_upgrade_msg(): - os.system("pip install -U fedml") - - @staticmethod - def callback_client_ota_msg(topic, payload): - logging.info( - f"FedMLDebug - Receive: topic ({topic}), payload ({payload})" - ) - - request_json = json.loads(payload) - cmd = request_json["cmd"] - - if cmd == ClientConstants.FEDML_OTA_CMD_UPGRADE: - FedMLClientRunner.process_ota_upgrade_msg() - # Process(target=FedMLClientRunner.process_ota_upgrade_msg).start() - raise Exception("After upgraded, restart runner...") - elif cmd == ClientConstants.FEDML_OTA_CMD_RESTART: - raise Exception("Restart runner...") - - def get_all_run_process_list_map(self): - run_process_dict = dict() - for run_id_str, process in self.run_process_map.items(): - cur_run_process_list = ClientConstants.get_learning_process_list(run_id_str) - run_process_dict[run_id_str] = cur_run_process_list - - return run_process_dict - - def response_device_info_to_mlops(self, topic, payload): - payload_json = json.loads(payload) - server_id = payload_json.get("server_id", 0) - run_id = payload_json.get("run_id", 0) - listen_edge_id = str(topic).split("/")[-1] - context = payload_json.get("context", None) - need_gpu_info = payload_json.get("need_gpu_info", False) - need_running_process_list = payload_json.get("need_running_process_list", False) - response_topic = f"deploy/slave_agent/mlops/response_device_info" - if self.mlops_metrics is not None and self.model_device_client_edge_id_list is not None and \ - self.model_device_server_id is not None: - if not 
need_gpu_info: - device_info_json = { - "edge_id": listen_edge_id, - "fedml_version": fedml.__version__, - "user_id": self.args.user - } - else: - total_mem, free_mem, total_disk_size, free_disk_size, cup_utilization, cpu_cores, gpu_cores_total, \ - gpu_cores_available, sent_bytes, recv_bytes, gpu_available_ids = sys_utils.get_sys_realtime_stats() - host_ip = sys_utils.get_host_ip() - host_port = sys_utils.get_available_port() - gpu_available_ids = JobRunnerUtils.get_available_gpu_id_list(self.edge_id) - gpu_available_ids = JobRunnerUtils.trim_unavailable_gpu_ids(gpu_available_ids) - gpu_cores_available = len(gpu_available_ids) - gpu_list = sys_utils.get_gpu_list() - device_info_json = { - "edge_id": listen_edge_id, - "memoryTotal": round(total_mem * MLOpsUtils.BYTES_TO_GB, 2), - "memoryAvailable": round(free_mem * MLOpsUtils.BYTES_TO_GB, 2), - "diskSpaceTotal": round(total_disk_size * MLOpsUtils.BYTES_TO_GB, 2), - "diskSpaceAvailable": round(free_disk_size * MLOpsUtils.BYTES_TO_GB, 2), - "cpuUtilization": round(cup_utilization, 2), - "cpuCores": cpu_cores, - "gpuCoresTotal": gpu_cores_total, - "gpuCoresAvailable": gpu_cores_available, - "gpu_available_ids": gpu_available_ids, - "gpu_list": gpu_list, - "node_ip": host_ip, - "node_port": host_port, - "networkTraffic": sent_bytes + recv_bytes, - "updateTime": int(MLOpsUtils.get_ntp_time()), - "fedml_version": fedml.__version__, - "user_id": self.args.user - } - if need_running_process_list: - device_info_json["run_process_list_map"] = self.get_all_run_process_list_map() - salve_device_ids = list() - for model_client_edge_id in self.model_device_client_edge_id_list: - salve_device_ids.append(model_client_edge_id) - response_payload = {"slave_device_id": self.model_device_client_edge_id_list[0], - "slave_device_id_list": salve_device_ids, - "master_device_id": self.model_device_server_id, - "run_id": run_id, "edge_id": listen_edge_id, - "edge_info": device_info_json} - if context is not None: - response_payload["context"] = context - self.message_center.send_message(response_topic, json.dumps(response_payload), run_id=run_id) - - def callback_report_device_info(self, topic, payload): - payload_json = json.loads(payload) - server_id = payload_json.get("server_id", 0) - run_id = payload_json.get("run_id", 0) - listen_edge_id = str(topic).split("/")[-1] - context = payload_json.get("context", None) - need_gpu_info = payload_json.get("need_gpu_info", False) - need_running_process_list = payload_json.get("need_running_process_list", False) - response_topic = f"client/server/response_device_info/{server_id}" - if self.mlops_metrics is not None and self.model_device_client_edge_id_list is not None and \ - self.model_device_server_id is not None: - if not need_gpu_info: - device_info_json = { - "edge_id": listen_edge_id, - "fedml_version": fedml.__version__, - "user_id": self.args.user - } - else: - total_mem, free_mem, total_disk_size, free_disk_size, cup_utilization, cpu_cores, gpu_cores_total, \ - gpu_cores_available, sent_bytes, recv_bytes, gpu_available_ids = sys_utils.get_sys_realtime_stats() - host_ip = sys_utils.get_host_ip() - host_port = sys_utils.get_available_port() - gpu_available_ids = JobRunnerUtils.get_available_gpu_id_list(self.edge_id) - gpu_available_ids = JobRunnerUtils.trim_unavailable_gpu_ids(gpu_available_ids) - gpu_cores_available = len(gpu_available_ids) - gpu_list = sys_utils.get_gpu_list() - device_info_json = { - "edge_id": listen_edge_id, - "memoryTotal": round(total_mem * MLOpsUtils.BYTES_TO_GB, 2), - "memoryAvailable": 
round(free_mem * MLOpsUtils.BYTES_TO_GB, 2), - "diskSpaceTotal": round(total_disk_size * MLOpsUtils.BYTES_TO_GB, 2), - "diskSpaceAvailable": round(free_disk_size * MLOpsUtils.BYTES_TO_GB, 2), - "cpuUtilization": round(cup_utilization, 2), - "cpuCores": cpu_cores, - "gpuCoresTotal": gpu_cores_total, - "gpuCoresAvailable": gpu_cores_available, - "gpu_available_ids": gpu_available_ids, - "gpu_list": gpu_list, - "node_ip": host_ip, - "node_port": host_port, - "networkTraffic": sent_bytes + recv_bytes, - "updateTime": int(MLOpsUtils.get_ntp_time()), - "fedml_version": fedml.__version__, - "user_id": self.args.user - } - if need_running_process_list: - device_info_json["run_process_list_map"] = self.get_all_run_process_list_map() - salve_device_ids = list() - for model_client_edge_id in self.model_device_client_edge_id_list: - salve_device_ids.append(model_client_edge_id) - response_payload = {"slave_device_id": self.model_device_client_edge_id_list[0], - "slave_device_id_list": salve_device_ids, - "master_device_id": self.model_device_server_id, - "run_id": run_id, "edge_id": listen_edge_id, - "edge_info": device_info_json} - if context is not None: - response_payload["context"] = context - self.message_center.send_message(response_topic, json.dumps(response_payload), run_id=run_id) - - def callback_client_logout(self, topic, payload): - payload_json = json.loads(payload) - secret = payload_json.get("auth", None) - if secret is None or str(secret) != "246b1be6-0eeb-4b17-b118-7d74de1975d4": - return - logging.info("Received the logout request.") - if self.run_process_event is not None: - self.run_process_event.set() - if self.run_process_completed_event is not None: - self.run_process_completed_event.set() - self.disable_client_login = True - time.sleep(3) - os.system("fedml logout") - - def save_training_status(self, edge_id, training_status): - self.current_training_status = training_status - ClientConstants.save_training_infos(edge_id, training_status) - - @staticmethod - def get_gpu_machine_id(): - gpu_list = sys_utils.get_gpu_list() - gpu_uuids = "" - if len(gpu_list) > 0: - for gpu in gpu_list: - gpu_uuids += gpu.get("uuid", "") - else: - gpu_uuids = str(uuid.uuid4()) - device_id_combination = \ - f"{FedMLClientRunner.get_machine_id()}-{hex(uuid.getnode())}-{gpu_uuids}" - device_id = security_utils.get_content_hash(device_id_combination) - return device_id - - @staticmethod - def get_device_id(use_machine_id=False): - device_file_path = os.path.join(ClientConstants.get_data_dir(), - ClientConstants.LOCAL_RUNNER_INFO_DIR_NAME) - file_for_device_id = os.path.join(device_file_path, "devices.id") - if not os.path.exists(device_file_path): - os.makedirs(device_file_path, exist_ok=True) - elif os.path.exists(file_for_device_id): - with open(file_for_device_id, 'r', encoding='utf-8') as f: - device_id_from_file = f.readline() - if device_id_from_file is not None and device_id_from_file != "": - return device_id_from_file - - if platform.system() == "Darwin": - cmd_get_serial_num = "system_profiler SPHardwareDataType | grep Serial | awk '{gsub(/ /,\"\")}{print}' " \ - "|awk -F':' '{print $2}' " - device_id = os.popen(cmd_get_serial_num).read() - device_id = device_id.replace('\n', '').replace(' ', '') - if device_id is None or device_id == "": - if not use_machine_id: - device_id = hex(uuid.getnode()) - else: - device_id = FedMLClientRunner.get_gpu_machine_id() - else: - device_id = "0x" + device_id - else: - if "nt" in os.name: - - def get_uuid(): - guid = "" - try: - cmd = "wmic csproduct get 
uuid" - guid = str(subprocess.check_output(cmd)) - pos1 = guid.find("\\n") + 2 - guid = guid[pos1:-15] - except Exception as ex: - logging.error(f"Failed to get uuid with Exception {ex}. Traceback: {traceback.format_exc()}") - pass - return str(guid) - - device_id = str(get_uuid()) - logging.info(device_id) - elif "posix" in os.name: - device_id = sys_utils.get_device_id_in_docker() - if device_id is None: - if not use_machine_id: - device_id = hex(uuid.getnode()) - else: - device_id = device_id = FedMLClientRunner.get_gpu_machine_id() - else: - device_id = sys_utils.run_subprocess_open( - "hal-get-property --udi /org/freedesktop/Hal/devices/computer --key system.hardware.uuid".split() - ) - device_id = hex(device_id) - - if device_id is not None and device_id != "": - with open(file_for_device_id, 'w', encoding='utf-8') as f: - f.write(device_id) - else: - device_id = hex(uuid.uuid4()) - with open(file_for_device_id, 'w', encoding='utf-8') as f: - f.write(device_id) - - return device_id - - @staticmethod - def get_machine_id(): - try: - import machineid - return machineid.id().replace('\n', '').replace('\r\n', '').strip() - except Exception as e: - logging.error(f"Failed to get machine id with Exception {e}. Traceback: {traceback.format_exc()}") - return hex(uuid.getnode()) - - @staticmethod - def bind_account_and_device_id(url, account_id, device_id, os_name, api_key="", role="client"): - ip = requests.get('https://checkip.amazonaws.com').text.strip() - fedml_ver, exec_path, os_ver, cpu_info, python_ver, torch_ver, mpi_installed, \ - cpu_usage, available_mem, total_mem, gpu_info, gpu_available_mem, gpu_total_mem, \ - gpu_count, gpu_vendor, cpu_count, gpu_device_name = get_sys_runner_info() - host_name = sys_utils.get_host_name() - json_params = { - "accountid": account_id, - "deviceid": device_id, - "type": os_name, - "state": ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE, - "status": ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE, - "processor": cpu_info, - "core_type": cpu_info, - "network": "", - "role": role, - "os_ver": os_ver, - "memory": total_mem, - "ip": ip, - "api_key": api_key, - "extra_infos": {"fedml_ver": fedml_ver, "exec_path": exec_path, "os_ver": os_ver, - "cpu_info": cpu_info, "python_ver": python_ver, "torch_ver": torch_ver, - "mpi_installed": mpi_installed, "cpu_usage": cpu_usage, - "available_mem": available_mem, "total_mem": total_mem, - "cpu_count": cpu_count, "gpu_count": 0, "host_name": host_name} - } - if gpu_count > 0: - if gpu_total_mem is not None: - json_params["gpu"] = gpu_info if gpu_info is not None else "" + ", Total GPU Memory: " + gpu_total_mem - else: - json_params["gpu"] = gpu_info if gpu_info is not None else "" - json_params["extra_infos"]["gpu_info"] = gpu_info if gpu_info is not None else "" - if gpu_available_mem is not None: - json_params["extra_infos"]["gpu_available_mem"] = gpu_available_mem - if gpu_total_mem is not None: - json_params["extra_infos"]["gpu_total_mem"] = gpu_total_mem - - json_params["extra_infos"]["gpu_count"] = gpu_count - json_params["extra_infos"]["gpu_vendor"] = gpu_vendor - json_params["extra_infos"]["gpu_device_name"] = gpu_device_name - - gpu_available_id_list = sys_utils.get_available_gpu_id_list(limit=gpu_count) - gpu_available_count = len(gpu_available_id_list) if gpu_available_id_list is not None else 0 - gpu_list = sys_utils.get_gpu_list() - json_params["extra_infos"]["gpu_available_count"] = gpu_available_count - json_params["extra_infos"]["gpu_available_id_list"] = gpu_available_id_list - 
json_params["extra_infos"]["gpu_list"] = gpu_list - else: - json_params["gpu"] = "None" - json_params["extra_infos"]["gpu_available_count"] = 0 - json_params["extra_infos"]["gpu_available_id_list"] = [] - json_params["extra_infos"]["gpu_list"] = [] - - _, cert_path = MLOpsConfigs.get_request_params() - if cert_path is not None: - try: - requests.session().verify = cert_path - response = requests.post( - url, json=json_params, verify=True, - headers={"content-type": "application/json", "Connection": "close"} - ) - except requests.exceptions.SSLError as err: - logging.error( - f"Failed to bind account and device id with error: {err}, traceback: {traceback.format_exc()}") - MLOpsConfigs.install_root_ca_file() - response = requests.post( - url, json=json_params, verify=True, - headers={"content-type": "application/json", "Connection": "close"} - ) - else: - response = requests.post(url, json=json_params, headers={"Connection": "close"}) - edge_id, user_name, extra_url, general_edge_id = -1, None, None, None - if response.status_code != 200: - print(f"Binding to MLOps with response.status_code = {response.status_code}, " - f"response.content: {response.content}") - pass - else: - # print("url = {}, response = {}".format(url, response)) - status_code = response.json().get("code") - if status_code == "SUCCESS": - edge_id = response.json().get("data").get("id") - user_name = response.json().get("data").get("userName", None) - extra_url = response.json().get("data").get("url", None) - general_edge_id = response.json().get("data").get("general_edge_id", None) - if edge_id is None or edge_id <= 0: - print(f"Binding to MLOps with response.status_code = {response.status_code}, " - f"response.content: {response.content}") - else: - if status_code == SchedulerConstants.BINDING_ACCOUNT_NOT_EXIST_ERROR: - raise SystemExit(SchedulerConstants.BINDING_ACCOUNT_NOT_EXIST_ERROR) - print(f"Binding to MLOps with response.status_code = {response.status_code}, " - f"response.content: {response.content}") - return -1, None, None, None - return edge_id, user_name, extra_url, general_edge_id - - def fetch_configs(self): - return MLOpsConfigs.fetch_all_configs() - - def send_agent_active_msg(self, edge_id): - active_topic = "flclient_agent/active" - status = MLOpsStatus.get_instance().get_client_agent_status(edge_id) - if ( - status is not None - and status != ClientConstants.MSG_MLOPS_CLIENT_STATUS_OFFLINE - and status != ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE - ): - return - - try: - current_job = FedMLClientDataInterface.get_instance().get_job_by_id(self.run_id) - except Exception as e: - logging.error(f"Failed to get current job with Exception {e}. 
Traceback: {traceback.format_exc()}") - current_job = None - if current_job is None: - if status is not None and status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_OFFLINE: - status = ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE - else: - return - else: - status = ClientConstants.get_device_state_from_run_edge_state(current_job.status) - active_msg = {"ID": edge_id, "status": status} - MLOpsStatus.get_instance().set_client_agent_status(edge_id, status) - self.mqtt_mgr.send_message_json(active_topic, json.dumps(active_msg)) - logging.info(f"Send agent active msg {active_msg}") - - def recover_start_train_msg_after_upgrading(self): - try: - current_job = FedMLClientDataInterface.get_instance().get_current_job() - if current_job is not None and \ - current_job.status == ClientConstants.MSG_MLOPS_CLIENT_STATUS_UPGRADING: - logging.info("start training after upgrading.") - topic_start_train = "flserver_agent/" + str(self.edge_id) + "/start_train" - self.callback_start_train(topic_start_train, current_job.running_json) - except Exception as e: - logging.error(f"recover starting train message after upgrading failed with exception {e}, " - f"Traceback {traceback.format_exc()}") - - def on_agent_mqtt_connected(self, mqtt_client_object): - # The MQTT message topic format is as follows: // - - # Setup MQTT message listener for starting training - topic_start_train = "flserver_agent/" + str(self.edge_id) + "/start_train" - self.add_message_listener(topic_start_train, self.callback_start_train) - self.mqtt_mgr.add_message_listener(topic_start_train, self.listener_message_dispatch_center) - - # Setup MQTT message listener for stopping training - topic_stop_train = "flserver_agent/" + str(self.edge_id) + "/stop_train" - self.add_message_listener(topic_stop_train, self.callback_stop_train) - self.mqtt_mgr.add_message_listener(topic_stop_train, self.listener_message_dispatch_center) - - - # Setup MQTT message listener for client status switching - topic_client_status = "fl_client/flclient_agent_" + str(self.edge_id) + "/status" - self.add_message_listener(topic_client_status, self.callback_runner_id_status) - self.mqtt_mgr.add_message_listener(topic_client_status, self.listener_message_dispatch_center) - - # Setup MQTT message listener to report current device status. - topic_report_status = "mlops/report_device_status" - self.add_message_listener(topic_report_status, self.callback_report_current_status) - self.mqtt_mgr.add_message_listener(topic_report_status, self.listener_message_dispatch_center) - - # Setup MQTT message listener to OTA messages from the MLOps. - topic_ota_msg = "mlops/flclient_agent_" + str(self.edge_id) + "/ota" - self.add_message_listener(topic_ota_msg, self.callback_client_ota_msg) - self.mqtt_mgr.add_message_listener(topic_ota_msg, self.listener_message_dispatch_center) - - # Setup MQTT message listener to OTA messages from the MLOps. 
- topic_request_device_info = "server/client/request_device_info/" + str(self.edge_id) - self.add_message_listener(topic_request_device_info, self.callback_report_device_info) - self.mqtt_mgr.add_message_listener(topic_request_device_info, self.listener_message_dispatch_center) - - topic_request_edge_device_info_from_mlops = f"deploy/mlops/slave_agent/request_device_info/{self.edge_id}" - self.add_message_listener(topic_request_edge_device_info_from_mlops, self.response_device_info_to_mlops) - self.mqtt_mgr.add_message_listener(topic_request_edge_device_info_from_mlops, self.listener_message_dispatch_center) - - topic_request_deploy_master_device_info_from_mlops = None - if self.model_device_server_id is not None: - topic_request_deploy_master_device_info_from_mlops = f"deploy/mlops/master_agent/request_device_info/{self.model_device_server_id}" - self.add_message_listener(topic_request_deploy_master_device_info_from_mlops, self.response_device_info_to_mlops) - self.mqtt_mgr.add_message_listener(topic_request_deploy_master_device_info_from_mlops, self.listener_message_dispatch_center) - - topic_request_deploy_slave_device_info_from_mlops = None - if self.model_device_client_edge_id_list is not None and len(self.model_device_client_edge_id_list) > 0: - topic_request_deploy_slave_device_info_from_mlops = f"deploy/mlops/slave_agent/request_device_info/{self.model_device_client_edge_id_list[0]}" - self.add_message_listener(topic_request_deploy_slave_device_info_from_mlops, self.response_device_info_to_mlops) - self.mqtt_mgr.add_message_listener(topic_request_deploy_slave_device_info_from_mlops, self.listener_message_dispatch_center) - - # Setup MQTT message listener to logout from MLOps. - topic_client_logout = "mlops/client/logout/" + str(self.edge_id) - self.add_message_listener(topic_client_logout, self.callback_client_logout) - self.mqtt_mgr.add_message_listener(topic_client_logout, self.listener_message_dispatch_center) - - # Subscribe topics for starting train, stopping train and fetching client status. 
- mqtt_client_object.subscribe(topic_start_train, qos=2) - mqtt_client_object.subscribe(topic_stop_train, qos=2) - mqtt_client_object.subscribe(topic_client_status, qos=2) - mqtt_client_object.subscribe(topic_report_status, qos=2) - mqtt_client_object.subscribe(topic_ota_msg, qos=2) - mqtt_client_object.subscribe(topic_request_device_info, qos=2) - mqtt_client_object.subscribe(topic_request_edge_device_info_from_mlops, qos=2) - if topic_request_deploy_master_device_info_from_mlops is not None: - mqtt_client_object.subscribe(topic_request_deploy_master_device_info_from_mlops, qos=2) - if topic_request_deploy_slave_device_info_from_mlops is not None: - mqtt_client_object.subscribe(topic_request_deploy_slave_device_info_from_mlops, qos=2) - mqtt_client_object.subscribe(topic_client_logout, qos=2) - - self.subscribed_topics.clear() - self.subscribed_topics.append(topic_start_train) - self.subscribed_topics.append(topic_stop_train) - self.subscribed_topics.append(topic_client_status) - self.subscribed_topics.append(topic_report_status) - self.subscribed_topics.append(topic_ota_msg) - self.subscribed_topics.append(topic_request_device_info) - self.subscribed_topics.append(topic_request_edge_device_info_from_mlops) - if topic_request_deploy_master_device_info_from_mlops is not None: - self.subscribed_topics.append(topic_request_deploy_master_device_info_from_mlops) - if topic_request_deploy_slave_device_info_from_mlops is not None: - self.subscribed_topics.append(topic_request_deploy_slave_device_info_from_mlops) - self.subscribed_topics.append(topic_client_logout) - - # Subscribe the messages for federated learning. - self.subscribe_fl_msgs() - - # Broadcast the first active message. - self.send_agent_active_msg(self.edge_id) - if self.general_edge_id is not None: - self.send_agent_active_msg(self.general_edge_id) - - # Echo results - MLOpsRuntimeLog.get_instance(self.args).enable_show_log_to_stdout() - worker_deploy_id_list = [modeld_device_clint.edge_id for index, modeld_device_clint in - enumerate(self.model_device_client_list)] - print("\nCongratulations, your device is connected to the FedML MLOps platform successfully!") - print(f"Your FedML Edge ID is {str(self.edge_id)}, unique device ID is {str(self.unique_device_id)}, " - f"master deploy ID is {str(self.model_device_server.edge_id)}, " - f"worker deploy ID is {worker_deploy_id_list}" - ) - if self.edge_extra_url is not None and self.edge_extra_url != "": - print(f"You may visit the following url to fill in more information with your device.\n" - f"{self.edge_extra_url}") - MLOpsRuntimeLog.get_instance(self.args).enable_show_log_to_stdout(enable=False) - - from fedml.core.mlops import sync_deploy_id - sync_deploy_id( - self.edge_id, self.model_device_server.edge_id, worker_deploy_id_list) - - # Start the message center for listener - self.start_listener(sender_message_queue=self.message_center.get_message_queue(), - agent_config=self.agent_config) - - def subscribe_fl_msgs(self): - if self.general_edge_id is None: - return - - # Setup MQTT message listener for starting training - topic_start_train = "flserver_agent/" + str(self.general_edge_id) + "/start_train" - self.add_message_listener(topic_start_train, self.callback_start_train) - self.mqtt_mgr.add_message_listener(topic_start_train, self.listener_message_dispatch_center) - - # Setup MQTT message listener for stopping training - topic_stop_train = "flserver_agent/" + str(self.general_edge_id) + "/stop_train" - self.add_message_listener(topic_stop_train, self.callback_stop_train) - 
self.mqtt_mgr.add_message_listener(topic_stop_train, self.listener_message_dispatch_center) - - # Setup MQTT message listener for client status switching - topic_client_status = "fl_client/flclient_agent_" + str(self.general_edge_id) + "/status" - self.add_message_listener(topic_client_status, self.callback_runner_id_status) - self.mqtt_mgr.add_message_listener(topic_client_status, self.listener_message_dispatch_center) - - # Setup MQTT message listener to OTA messages from the MLOps. - topic_request_device_info = "server/client/request_device_info/" + str(self.general_edge_id) - self.add_message_listener(topic_request_device_info, self.callback_report_device_info) - self.mqtt_mgr.add_message_listener(topic_request_device_info, self.listener_message_dispatch_center) - - topic_request_device_info_from_mlops = f"deploy/mlops/client_agent/request_device_info/{self.general_edge_id}" - self.add_message_listener(topic_request_device_info_from_mlops, self.response_device_info_to_mlops) - self.mqtt_mgr.add_message_listener(topic_request_device_info_from_mlops, self.listener_message_dispatch_center) - - # Subscribe topics for starting train, stopping train and fetching client status. - self.mqtt_mgr.subscribe_msg(topic_start_train) - self.mqtt_mgr.subscribe_msg(topic_stop_train) - self.mqtt_mgr.subscribe_msg(topic_client_status) - self.mqtt_mgr.subscribe_msg(topic_request_device_info) - self.mqtt_mgr.subscribe_msg(topic_request_device_info_from_mlops) - - self.subscribed_topics.append(topic_start_train) - self.subscribed_topics.append(topic_stop_train) - self.subscribed_topics.append(topic_client_status) - self.subscribed_topics.append(topic_request_device_info) - self.subscribed_topics.append(topic_request_device_info_from_mlops) - - def on_agent_mqtt_disconnected(self, mqtt_client_object): - MLOpsStatus.get_instance().set_client_agent_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_OFFLINE - ) - pass - - def setup_agent_mqtt_connection(self, service_config): - # Setup MQTT connection - self.mqtt_mgr = MqttManager( - service_config["mqtt_config"]["BROKER_HOST"], - service_config["mqtt_config"]["BROKER_PORT"], - service_config["mqtt_config"]["MQTT_USER"], - service_config["mqtt_config"]["MQTT_PWD"], - service_config["mqtt_config"]["MQTT_KEEPALIVE"], - f"FedML_ClientAgent_Daemon_@{self.user_name}@_@{self.args.current_device_id}@_@{str(uuid.uuid4())}@", - "flclient_agent/last_will_msg", - json.dumps({"ID": self.edge_id, "status": ClientConstants.MSG_MLOPS_CLIENT_STATUS_OFFLINE}) - ) - self.agent_config = service_config - - # Init local database - FedMLClientDataInterface.get_instance().create_job_table() - - # Start the message center to process edge related messages. 
- self.setup_message_center() - - # Start local API services - client_api_cmd = "fedml.computing.scheduler.slave.client_api:api" - client_api_pids = RunProcessUtils.get_pid_from_cmd_line(client_api_cmd) - if client_api_pids is None or len(client_api_pids) <= 0: - python_program = get_python_program() - cur_dir = os.path.dirname(__file__) - fedml_base_dir = os.path.dirname(os.path.dirname(os.path.dirname(cur_dir))) - self.local_api_process = ClientConstants.exec_console_with_script( - "{} -m uvicorn {} --host 0.0.0.0 --port {} " - "--reload --reload-delay 3 --reload-dir {} --log-level critical".format( - python_program, client_api_cmd, ClientConstants.LOCAL_CLIENT_API_PORT, fedml_base_dir), - should_capture_stdout=False, - should_capture_stderr=False - ) - # if self.local_api_process is not None and self.local_api_process.pid is not None: - # print(f"Client local API process id {self.local_api_process.pid}") - - # Setup MQTT connected listener - self.mqtt_mgr.add_connected_listener(self.on_agent_mqtt_connected) - self.mqtt_mgr.add_disconnected_listener(self.on_agent_mqtt_disconnected) - self.mqtt_mgr.connect() - - # Report the IDLE status to MLOps - self.mlops_metrics.report_client_training_status( - self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE) - MLOpsStatus.get_instance().set_client_agent_status(self.edge_id, ClientConstants.MSG_MLOPS_CLIENT_STATUS_IDLE) - - # MLOpsRuntimeLogDaemon.get_instance(self.args).stop_all_log_processor() - self.recover_start_train_msg_after_upgrading() - - infer_host = os.getenv("FEDML_INFER_HOST", None) - infer_redis_addr = os.getenv("FEDML_INFER_REDIS_ADDR", None) - infer_redis_port = os.getenv("FEDML_INFER_REDIS_PORT", None) - infer_redis_password = os.getenv("FEDML_INFER_REDIS_PASSWORD", None) - model_client_num = os.getenv("FEDML_MODEL_WORKER_NUM", None) - os.environ["FEDML_CURRENT_EDGE_ID"] = str(self.edge_id) - - if not ComputeCacheManager.get_instance().set_redis_params(): - os.environ["FEDML_DISABLE_REDIS_CONNECTION"] = "1" - - if self.model_device_client_edge_id_list is None: - self.model_device_client_edge_id_list = list() - if self.model_device_client_list is None: - model_client_num = 1 if model_client_num is None else int(model_client_num) - self.model_device_client_list = list() - for client_index in range(model_client_num): - model_device_client = FedMLModelDeviceClientRunner( - self.args, f"{self.args.current_device_id}_{client_index + 1}", self.args.os_name, - self.args.is_from_docker, self.agent_config) - if infer_host is not None: - model_device_client.infer_host = infer_host - if infer_redis_addr is not None: - model_device_client.redis_addr = infer_redis_addr - if infer_redis_port is not None: - model_device_client.redis_port = infer_redis_port - if infer_redis_password is not None: - model_device_client.redis_password = infer_redis_password - model_device_client.start() - self.model_device_client_list.append(model_device_client) - self.model_device_client_edge_id_list.append(model_device_client.get_edge_id()) - - if self.model_device_server is None: - self.model_device_server = FedMLModelDeviceServerRunner(self.args, self.args.current_device_id, - self.args.os_name, self.args.is_from_docker, - self.agent_config) - if infer_host is not None: - self.model_device_server.infer_host = infer_host - if infer_redis_addr is not None: - self.model_device_server.redis_addr = infer_redis_addr - if infer_redis_port is not None: - self.model_device_server.redis_port = infer_redis_port - if infer_redis_password is not None: - 
self.model_device_server.redis_password = infer_redis_password - - self.model_device_server.start() - self.model_device_server_id = self.model_device_server.get_edge_id() - - JobCleanup.get_instance().sync_data_on_startup(self.edge_id) - - os.environ["FEDML_DEPLOY_MASTER_ID"] = str(self.model_device_server.get_edge_id()) - os.environ["FEDML_DEPLOY_WORKER_IDS"] = str([client.get_edge_id() for client in self.model_device_client_list]) - self.mlops_metrics.stop_device_realtime_perf() - self.mlops_metrics.report_device_realtime_perf(self.args, service_config["mqtt_config"]) - - def start_agent_mqtt_loop(self): - # Start MQTT message loop - try: - self.mqtt_mgr.loop_forever() - except Exception as e: - logging.error(f"Errors in the MQTT loop: Exception {e}, Traceback: {traceback.format_exc()}") - if str(e) == "Restarting after upgraded...": - logging.info("Restarting after upgraded...") - else: - logging.info("Client tracing: {}".format(traceback.format_exc())) - finally: - print("finally") - login_exit_file = os.path.join(ClientConstants.get_log_file_dir(), "exited.log") - with open(login_exit_file, "w") as f: - f.writelines(f"{os.getpid()}.") - - self.stop_agent() - - time.sleep(5) - sys_utils.cleanup_all_fedml_client_login_processes( - ClientConstants.CLIENT_LOGIN_PROGRAM, clean_process_group=False) - sys.exit(1) - - def stop_agent(self): - if self.run_process_event is not None: - self.run_process_event.set() - - if self.model_device_server is not None: - self.model_device_server.stop() - self.model_device_server = None - - if self.model_device_client_list is not None: - for model_client in self.model_device_client_list: - model_client.stop() - self.model_device_client_list.clear() - self.model_device_client_list = None - - if self.mqtt_mgr is not None: - try: - for topic in self.subscribed_topics: - self.mqtt_mgr.unsubscribe_msg(topic) - except Exception as e: - logging.error(f"Unsubscribe topics error: {e}, Traceback: {traceback.format_exc()}") - pass - - self.mqtt_mgr.loop_stop() - self.mqtt_mgr.disconnect() - - self.release_message_center() - - def get_runner(self): - runner = FedMLClientRunner( - self.args, edge_id=self.edge_id, request_json=self.request_json, - agent_config=self.agent_config, run_id=self.run_id, - cuda_visible_gpu_ids_str=self.cuda_visible_gpu_ids_str - ) - runner.edge_user_name = self.user_name - runner.edge_extra_url = self.edge_extra_url - runner.unique_device_id = self.unique_device_id - runner.user_name = self.user_name - runner.general_edge_id = self.general_edge_id - runner.model_device_client_edge_id_list = self.model_device_client_edge_id_list - runner.model_device_server_id = self.model_device_server_id - return runner diff --git a/python/fedml/computing/scheduler/slave/launch_job_runner.py b/python/fedml/computing/scheduler/slave/launch_job_runner.py new file mode 100755 index 0000000000..07533af399 --- /dev/null +++ b/python/fedml/computing/scheduler/slave/launch_job_runner.py @@ -0,0 +1,41 @@ +from abc import ABC + +from .base_slave_job_runner import FedMLBaseSlaveJobRunner +from .client_constants import ClientConstants + + +class FedMLLaunchSlaveJobRunner(FedMLBaseSlaveJobRunner, ABC): + + def __init__(self, args, edge_id=0, request_json=None, agent_config=None, run_id=0, + cuda_visible_gpu_ids_str=None): + FedMLBaseSlaveJobRunner.__init__( + self, args, edge_id=edge_id, request_json=request_json, agent_config=agent_config, run_id=run_id, + cuda_visible_gpu_ids_str=cuda_visible_gpu_ids_str, agent_data_dir=ClientConstants.get_data_dir(), + 
agent_package_download_dir=ClientConstants.get_package_download_dir(), + agent_package_unzip_dir=ClientConstants.get_package_unzip_dir(), + agent_log_file_dir=ClientConstants.get_log_file_dir() + ) + + # Override + def _generate_job_runner_instance(self, args, run_id=None, request_json=None, agent_config=None, edge_id=None): + return FedMLLaunchSlaveJobRunner( + args, run_id=run_id, request_json=request_json, agent_config=self.agent_config, edge_id=edge_id + ) + + # Override + def _generate_extend_queue_list(self): + return None + + # Override + def get_download_package_info(self, packages_config=None): + return super().get_download_package_info(packages_config) + + # Override + def run_impl( + self, run_extend_queue_list, sender_message_center, + listener_message_queue, status_center_queue + ): + super().run_impl( + run_extend_queue_list, sender_message_center, + listener_message_queue, status_center_queue) + diff --git a/python/fedml/computing/scheduler/slave/launch_job_runner_manager.py b/python/fedml/computing/scheduler/slave/launch_job_runner_manager.py new file mode 100755 index 0000000000..3f65438f9e --- /dev/null +++ b/python/fedml/computing/scheduler/slave/launch_job_runner_manager.py @@ -0,0 +1,22 @@ + +from fedml.core.common.singleton import Singleton +from .base_slave_job_runner_manager import FedMLBaseSlaveJobRunnerManager +from .launch_job_runner import FedMLLaunchSlaveJobRunner + + +class FedMLLaunchJobRunnerManager(FedMLBaseSlaveJobRunnerManager, Singleton): + def __init__(self): + FedMLBaseSlaveJobRunnerManager.__init__(self) + + @staticmethod + def get_instance(): + return FedMLLaunchJobRunnerManager() + + # Override + def _generate_job_runner_instance( + self, args, run_id=None, request_json=None, agent_config=None, edge_id=None + ): + return FedMLLaunchSlaveJobRunner( + args, run_id=run_id, request_json=request_json, agent_config=agent_config, edge_id=edge_id) + + diff --git a/python/fedml/computing/scheduler/slave/slave_agent.py b/python/fedml/computing/scheduler/slave/slave_agent.py new file mode 100755 index 0000000000..e9c8b2fc93 --- /dev/null +++ b/python/fedml/computing/scheduler/slave/slave_agent.py @@ -0,0 +1,26 @@ + +from .base_slave_agent import FedMLBaseSlaveAgent +from .client_constants import ClientConstants +from .client_data_interface import FedMLClientDataInterface +from .slave_protocol_manager import FedMLLaunchSlaveProtocolManager + + +class FedMLLaunchSlaveAgent(FedMLBaseSlaveAgent): + def __init__(self): + FedMLBaseSlaveAgent.__init__(self) + + # Override + def _get_log_file_dir(self): + return ClientConstants.get_log_file_dir() + + # Override + def _save_agent_info(self, unique_device_id, edge_id): + ClientConstants.save_runner_infos(unique_device_id, edge_id) + + # Override + def _init_database(self): + FedMLClientDataInterface.get_instance().create_job_table() + + # Override + def _generate_protocol_manager_instance(self, args, agent_config=None): + return FedMLLaunchSlaveProtocolManager(args, agent_config=agent_config) diff --git a/python/fedml/computing/scheduler/slave/slave_protocol_manager.py b/python/fedml/computing/scheduler/slave/slave_protocol_manager.py new file mode 100755 index 0000000000..a1067a0d96 --- /dev/null +++ b/python/fedml/computing/scheduler/slave/slave_protocol_manager.py @@ -0,0 +1,105 @@ +import copy +import json +import os +import fedml +from ..comm_utils.job_cleanup import JobCleanup +from .base_slave_protocol_manager import FedMLBaseSlaveProtocolManager +from .launch_job_runner_manager import FedMLLaunchJobRunnerManager 
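
Taken together, the new modules above split the old monolithic client runner into a layered stack: a slave agent builds a protocol manager, the protocol manager resolves a singleton job-runner manager, and that manager creates one job runner per run. The sketch below illustrates that call chain with simplified stand-in classes; it is not the real FedML implementation (the actual base classes carry MQTT, database, and GPU-scheduling logic omitted here), only a self-contained picture of how the pieces compose.

```python
# Simplified stand-ins mirroring the layering introduced in this diff:
# agent -> protocol manager -> runner manager (singleton) -> job runner.
# These are illustrative classes, not the real FedML implementations.

class StubJobRunner:                      # stands in for FedMLLaunchSlaveJobRunner
    def __init__(self, args, run_id=0, request_json=None, agent_config=None, edge_id=0):
        self.run_id, self.edge_id = run_id, edge_id

class StubJobRunnerManager:               # stands in for FedMLLaunchJobRunnerManager
    _instance = None

    @classmethod
    def get_instance(cls):                # singleton accessor, as in the real manager
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def _generate_job_runner_instance(self, args, run_id=None, request_json=None,
                                      agent_config=None, edge_id=None):
        return StubJobRunner(args, run_id=run_id, request_json=request_json,
                             agent_config=agent_config, edge_id=edge_id)

class StubProtocolManager:                # stands in for FedMLLaunchSlaveProtocolManager
    def _get_job_runner_manager(self):
        return StubJobRunnerManager.get_instance()

class StubSlaveAgent:                     # stands in for FedMLLaunchSlaveAgent
    def _generate_protocol_manager_instance(self, args, agent_config=None):
        return StubProtocolManager()

# The agent builds a protocol manager, which resolves job runners through the
# singleton runner manager -- the same call chain the new modules define.
agent = StubSlaveAgent()
manager = agent._generate_protocol_manager_instance(args=None)
runner = manager._get_job_runner_manager()._generate_job_runner_instance(
    args=None, run_id=1, edge_id=42)
print(type(runner).__name__, runner.run_id, runner.edge_id)
```
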
+from ..model_scheduler.model_device_server import FedMLModelDeviceServerRunner +from ..model_scheduler.model_device_client import FedMLModelDeviceClientRunner + + +class FedMLLaunchSlaveProtocolManager(FedMLBaseSlaveProtocolManager): + + def __init__(self, args, agent_config=None): + FedMLBaseSlaveProtocolManager.__init__(self, args, agent_config=agent_config) + + # Override + def generate_topics(self): + super().generate_topics() + + # Override + def add_protocol_handler(self): + super().add_protocol_handler() + + # Override + def _generate_protocol_manager_instance(self, args, agent_config=None): + return FedMLLaunchSlaveProtocolManager(args, agent_config=agent_config) + + # Override + def _get_job_runner_manager(self): + return FedMLLaunchJobRunnerManager.get_instance() + + # Override + def _process_connection_ready(self): + from fedml.core.mlops import sync_deploy_id + sync_deploy_id( + self.edge_id, self.model_device_server.edge_id, self.model_device_client_edge_id_list) + + # Override + def _process_connection_lost(self): + pass + + # Override + def _init_extra_items(self): + super()._init_extra_items() + + # Sync the data when startup + JobCleanup.get_instance().sync_data_on_startup(self.args.edge_id) + + # Get the environment variables + infer_host = os.getenv("FEDML_INFER_HOST", None) + infer_redis_addr = os.getenv("FEDML_INFER_REDIS_ADDR", None) + infer_redis_port = os.getenv("FEDML_INFER_REDIS_PORT", None) + infer_redis_password = os.getenv("FEDML_INFER_REDIS_PASSWORD", None) + model_client_num = os.getenv("FEDML_MODEL_WORKER_NUM", None) + + # Start deploy master agent and slave agent + in_args = copy.deepcopy(self.args) + if self.model_device_client_edge_id_list is None: + self.model_device_client_edge_id_list = list() + if self.model_device_client_list is None: + model_client_num = 1 if model_client_num is None else int(model_client_num) + self.model_device_client_list = list() + for client_index in range(model_client_num): + model_device_client = FedMLModelDeviceClientRunner( + in_args, f"{in_args.current_device_id}_{client_index + 1}", in_args.os_name, + in_args.is_from_docker, self.agent_config) + if infer_host is not None: + model_device_client.infer_host = infer_host + if infer_redis_addr is not None: + model_device_client.redis_addr = infer_redis_addr + if infer_redis_port is not None: + model_device_client.redis_port = infer_redis_port + if infer_redis_password is not None: + model_device_client.redis_password = infer_redis_password + model_device_client.start() + self.model_device_client_list.append(model_device_client) + self.model_device_client_edge_id_list.append(model_device_client.get_edge_id()) + + self.args = copy.deepcopy(in_args) + if self.model_device_server is None: + self.model_device_server = FedMLModelDeviceServerRunner(in_args, in_args.current_device_id, + in_args.os_name, in_args.is_from_docker, + self.agent_config) + if infer_host is not None: + self.model_device_server.infer_host = infer_host + if infer_redis_addr is not None: + self.model_device_server.redis_addr = infer_redis_addr + if infer_redis_port is not None: + self.model_device_server.redis_port = infer_redis_port + if infer_redis_password is not None: + self.model_device_server.redis_password = infer_redis_password + + self.model_device_server.start() + self.model_device_server_id = self.model_device_server.get_edge_id() + + # Save the deployed master and worker id list to the environment variable. 
+ os.environ["FEDML_DEPLOY_MASTER_ID"] = str(self.model_device_server_id) + os.environ["FEDML_DEPLOY_WORKER_IDS"] = str(self.model_device_client_edge_id_list) + + # Start the monitor process + self.args = copy.deepcopy(in_args) + self.mlops_metrics.stop_device_realtime_perf() + self.mlops_metrics.report_device_realtime_perf(self.args, self.args.agent_config["mqtt_config"]) + pass \ No newline at end of file diff --git a/python/fedml/core/distributed/communication/grpc/grpc_comm_manager.py b/python/fedml/core/distributed/communication/grpc/grpc_comm_manager.py index 6eb9fe613e..347f449937 100644 --- a/python/fedml/core/distributed/communication/grpc/grpc_comm_manager.py +++ b/python/fedml/core/distributed/communication/grpc/grpc_comm_manager.py @@ -1,52 +1,59 @@ +import csv +import grpc +import logging import os import pickle import threading -from concurrent import futures -from typing import List - -import grpc - -from ..grpc import grpc_comm_manager_pb2_grpc, grpc_comm_manager_pb2 +import time lock = threading.Lock() +from collections import namedtuple +from concurrent import futures +from fedml.core.mlops.mlops_profiler_event import MLOpsProfilerEvent +from typing import List + from ...communication.base_com_manager import BaseCommunicationManager from ...communication.message import Message from ...communication.observer import Observer from ..constants import CommunicationConstants - -from fedml.core.mlops.mlops_profiler_event import MLOpsProfilerEvent - -import time +from ..grpc import grpc_comm_manager_pb2_grpc, grpc_comm_manager_pb2 # Check Service or serve? from ...communication.grpc.grpc_server import GRPCCOMMServicer -import logging -import csv +GRPCMapping = \ + namedtuple("GRPCMapping", + ["eid", "rank", "grpc_server_ip", "grpc_server_port", "ingress_ip"]) class GRPCCommManager(BaseCommunicationManager): + MSG_ARG_KEY_SENDER_RANK = "sender_rank" + MSG_ARG_KEY_SENDER_IP = "sender_ip" + MSG_ARG_KEY_SENDER_PORT = "sender_port" + def __init__( - self, - host, - port, - ip_config_path, - topic="fedml", - client_id=0, - client_num=0, + self, + grpc_ipconfig_path, + topic="fedml", + client_rank=0, + client_num=0, + args=None ): - # host is the ip address of server - self.host = host - self.port = str(port) + self._topic = topic - self.client_id = client_id - self.client_num = client_num self._observers: List[Observer] = [] - self.rank = client_id + self.grpc_ipconfig_path = grpc_ipconfig_path + self.grpc_mappings = dict() + self.client_rank = client_rank + self.client_id = self.client_rank + self.client_num = client_num + self.args = args - if client_id == 0: + self._init_grpc_mappings() # Initialize self.grpc_mappings variable. + + if self.client_rank == 0: self.node_type = "server" logging.info("############# THIS IS FL SERVER ################") else: @@ -61,24 +68,48 @@ def __init__( futures.ThreadPoolExecutor(max_workers=client_num), options=self.opts, ) - self.grpc_servicer = GRPCCOMMServicer(host, port, client_num, client_id) + + if self.client_id not in self.grpc_mappings: + # if no record exists for the current client id, then + # default ip and rank to "0.0.0.0" and BASE + RANK. 
+ self.grpc_mappings[self.client_id] = GRPCMapping( + eid=self.client_id, + rank=self.client_id, + grpc_server_ip="0.0.0.0", + grpc_server_port=CommunicationConstants.GRPC_BASE_PORT + self.client_rank, + ingress_ip=None) + + self.grpc_servicer = GRPCCOMMServicer( + self.grpc_mappings[self.client_id].grpc_server_ip, + self.grpc_mappings[self.client_id].grpc_server_port, + self.client_num, + self.client_rank + ) grpc_comm_manager_pb2_grpc.add_gRPCCommManagerServicer_to_server( self.grpc_servicer, self.grpc_server ) logging.info(os.getcwd()) - self.ip_config = self._build_ip_table(ip_config_path) - # starts a grpc_server on local machine using ip address "0.0.0.0" - self.grpc_server.add_insecure_port("{}:{}".format("0.0.0.0", port)) + grpc_insecure_ip_port = "{}:{}".format(self.grpc_mappings[self.client_id].grpc_server_ip, + self.grpc_mappings[self.client_id].grpc_server_port) + self.grpc_server.add_insecure_port(grpc_insecure_ip_port) self.grpc_server.start() + # Wait for 100 milliseconds to make sure the grpc + # server has started before proceeding. + time.sleep(0.01) self.is_running = True - logging.info("grpc server started. Listening on port " + str(port)) + logging.info("Started gRPC server: {}.".format(grpc_insecure_ip_port)) def send_message(self, msg: Message): - logging.info("msg = {}".format(msg)) - # payload = msg.to_json() - + # Register the sender rank, ip and port attribute on the message. + msg.add_params(GRPCCommManager.MSG_ARG_KEY_SENDER_RANK, self.client_rank) + if self.grpc_mappings[self.client_id].ingress_ip: + msg.add_params(GRPCCommManager.MSG_ARG_KEY_SENDER_IP, self.grpc_mappings[self.client_id].ingress_ip) + else: + msg.add_params(GRPCCommManager.MSG_ARG_KEY_SENDER_IP, self.grpc_mappings[self.client_id].grpc_server_ip) + msg.add_params(GRPCCommManager.MSG_ARG_KEY_SENDER_PORT, self.grpc_mappings[self.client_id].grpc_server_port) + logging.info("sending msg = {}".format(msg.get_params_wout_model())) logging.info("pickle.dumps(msg) START") pickle_dump_start_time = time.time() msg_pkl = pickle.dumps(msg) @@ -86,10 +117,13 @@ def send_message(self, msg: Message): logging.info("pickle.dumps(msg) END") receiver_id = msg.get_receiver_id() - PORT_BASE = CommunicationConstants.GRPC_BASE_PORT - # lookup ip of receiver from self.ip_config table - receiver_ip = self.ip_config[str(receiver_id)] - channel_url = "{}:{}".format(receiver_ip, str(PORT_BASE + receiver_id)) + receiver_grpc_mappings = self.grpc_mappings[int(receiver_id)] + if receiver_grpc_mappings.ingress_ip: + channel_url = "{}:{}".format(receiver_grpc_mappings.ingress_ip, + receiver_grpc_mappings.grpc_server_port) + else: + channel_url = "{}:{}".format(receiver_grpc_mappings.grpc_server_ip, + receiver_grpc_mappings.grpc_server_port) channel = grpc.insecure_channel(channel_url, options=self.opts) stub = grpc_comm_manager_pb2_grpc.gRPCCommManagerStub(channel) @@ -98,7 +132,6 @@ def send_message(self, msg: Message): logging.info("sending message to {}".format(channel_url)) request.client_id = self.client_id - request.message = msg_pkl tick = time.time() @@ -116,10 +149,8 @@ def remove_observer(self, observer: Observer): def handle_receive_message(self): self._notify_connection_ready() self.message_handling_subroutine() - - # Cannont run message_handling_subroutine in new thread + # Cannot run message_handling_subroutine in new thread # Related https://stackoverflow.com/a/70705165 - # thread = threading.Thread(target=self.message_handling_subroutine) # thread.start() @@ -131,19 +162,33 @@ def 
message_handling_subroutine(self): lock.acquire() busy_time_start_time = time.time() msg_pkl = self.grpc_servicer.message_q.get() - logging.info("unpickle START") + logging.info("Unpickle START.") unpickle_start_time = time.time() msg = pickle.loads(msg_pkl) MLOpsProfilerEvent.log_to_wandb({"UnpickleTime": time.time() - unpickle_start_time}) - logging.info("unpickle END") + logging.info("Unpickle END.") msg_type = msg.get_type() + + sender_id = int(msg.get_sender_id()) + if sender_id not in self.grpc_mappings: + sender_rank = int(msg.get_params()[GRPCCommManager.MSG_ARG_KEY_SENDER_RANK]) + sender_ip = str(msg.get_params()[GRPCCommManager.MSG_ARG_KEY_SENDER_IP]) + sender_port = int(msg.get_params()[GRPCCommManager.MSG_ARG_KEY_SENDER_PORT]) + self.grpc_mappings[sender_id] = GRPCMapping( + eid=sender_id, + rank=sender_rank, + grpc_server_ip=sender_ip, + grpc_server_port=sender_port, + ingress_ip=sender_ip) + for observer in self._observers: _message_handler_start_time = time.time() observer.receive_message(msg_type, msg) MLOpsProfilerEvent.log_to_wandb({"MessageHandlerTime": time.time() - _message_handler_start_time}) MLOpsProfilerEvent.log_to_wandb({"BusyTime": time.time() - busy_time_start_time}) lock.release() - time.sleep(0.0001) + time.sleep(0.0001) + MLOpsProfilerEvent.log_to_wandb({"TotalTime": time.time() - start_listening_time}) return @@ -158,20 +203,26 @@ def notify(self, message: Message): def _notify_connection_ready(self): msg_params = Message() - msg_params.sender_id = self.rank - msg_params.receiver_id = self.rank + msg_params.sender_id = self.client_rank + msg_params.receiver_id = self.client_rank msg_type = CommunicationConstants.MSG_TYPE_CONNECTION_IS_READY for observer in self._observers: observer.receive_message(msg_type, msg_params) - def _build_ip_table(self, path): - ip_config = dict() - with open(path, newline="") as csv_file: - csv_reader = csv.reader(csv_file) - # skip header line - next(csv_reader) - - for row in csv_reader: - receiver_id, receiver_ip = row - ip_config[receiver_id] = receiver_ip - return ip_config + def _init_grpc_mappings(self): + csv_dict_reader = csv.DictReader(open(self.grpc_ipconfig_path, "r")) + data_dict = list(csv_dict_reader) + for row in data_dict: + eid = int(row["eid"]) + rank = int(row["rank"]) + grpc_server_ip = str(row["grpc_server_ip"]) + grpc_server_port = int(row["grpc_server_port"]) + ingress_ip = None + if "ingress_ip" in row: + ingress_ip = row["ingress_ip"] + self.grpc_mappings[int(eid)] = GRPCMapping( + eid=eid, + rank=rank, + grpc_server_ip=grpc_server_ip, + grpc_server_port=grpc_server_port, + ingress_ip=ingress_ip) diff --git a/python/fedml/core/distributed/communication/grpc/ip_config_utils.py b/python/fedml/core/distributed/communication/grpc/ip_config_utils.py deleted file mode 100644 index 1ebedfd73a..0000000000 --- a/python/fedml/core/distributed/communication/grpc/ip_config_utils.py +++ /dev/null @@ -1,14 +0,0 @@ -import csv - - -def build_ip_table(path): - ip_config = dict() - with open(path, newline="") as csv_file: - csv_reader = csv.reader(csv_file) - # skip header line - next(csv_reader) - - for row in csv_reader: - receiver_id, receiver_ip = row - ip_config[receiver_id] = receiver_ip - return ip_config diff --git a/python/fedml/core/distributed/communication/message.py b/python/fedml/core/distributed/communication/message.py index 7d465461e5..7b1bc63cec 100644 --- a/python/fedml/core/distributed/communication/message.py +++ b/python/fedml/core/distributed/communication/message.py @@ -4,16 +4,10 @@ class 
Message(object): - MSG_ARG_KEY_OPERATION = "operation" MSG_ARG_KEY_TYPE = "msg_type" MSG_ARG_KEY_SENDER = "sender" MSG_ARG_KEY_RECEIVER = "receiver" - MSG_OPERATION_SEND = "send" - MSG_OPERATION_RECEIVE = "receive" - MSG_OPERATION_BROADCAST = "broadcast" - MSG_OPERATION_REDUCE = "reduce" - MSG_ARG_KEY_MODEL_PARAMS = "model_params" MSG_ARG_KEY_MODEL_PARAMS_URL = "model_params_url" MSG_ARG_KEY_MODEL_PARAMS_KEY = "model_params_key" @@ -54,6 +48,15 @@ def add_params(self, key, value): def get_params(self): return self.msg_params + def get_params_wout_model(self): + # We explicitly return the message triple, because the msg params + # dictionary is populated at different stages during execution, + # e.g., Message.MSG_ARG_KEY_MODEL_PARAMS + return { + k: v for k, v in self.msg_params.items() + if k != Message.MSG_ARG_KEY_MODEL_PARAMS + } + def add(self, key, value): self.msg_params[key] = value @@ -65,7 +68,7 @@ def get(self, key): def get_type(self): return self.msg_params[Message.MSG_ARG_KEY_TYPE] - def to_string(self): + def to_string(self, include_model_params=True): return self.msg_params def to_json(self): diff --git a/python/fedml/core/distributed/communication/mqtt/mqtt_manager.py b/python/fedml/core/distributed/communication/mqtt/mqtt_manager.py index bdafe159c2..401f2e7521 100644 --- a/python/fedml/core/distributed/communication/mqtt/mqtt_manager.py +++ b/python/fedml/core/distributed/communication/mqtt/mqtt_manager.py @@ -11,10 +11,11 @@ import fedml + class MqttManager(object): def __init__(self, host, port, user, pwd, keepalive_time, client_id, last_will_topic=None, last_will_msg=None, - clean_session=True, retain_msg=False): + clean_session=True, retain_msg=True): self._client = None self.mqtt_connection_id = None self._host = host diff --git a/python/fedml/core/distributed/fedml_comm_manager.py b/python/fedml/core/distributed/fedml_comm_manager.py index d358b0b7cd..29cd498bdc 100644 --- a/python/fedml/core/distributed/fedml_comm_manager.py +++ b/python/fedml/core/distributed/fedml_comm_manager.py @@ -188,11 +188,11 @@ def _init_manager(self): ) elif self.backend == "GRPC": from .communication.grpc.grpc_comm_manager import GRPCCommManager - - HOST = "0.0.0.0" - PORT = CommunicationConstants.GRPC_BASE_PORT + self.rank self.com_manager = GRPCCommManager( - HOST, PORT, ip_config_path=self.args.grpc_ipconfig_path, client_id=self.rank, client_num=self.size, + grpc_ipconfig_path=self.args.grpc_ipconfig_path, + client_rank=self.rank, + client_num=self.size, + args=self.args, ) elif self.backend == "TRPC": from .communication.trpc.trpc_comm_manager import TRPCCommManager diff --git a/python/fedml/core/mlops/__init__.py b/python/fedml/core/mlops/__init__.py index 77ad06165e..4d60534547 100644 --- a/python/fedml/core/mlops/__init__.py +++ b/python/fedml/core/mlops/__init__.py @@ -9,7 +9,6 @@ import uuid from multiprocessing import Process -import click import requests import fedml @@ -17,6 +16,9 @@ from fedml.computing.scheduler.comm_utils import sys_utils from fedml.core.mlops.mlops_configs import MLOpsConfigs from .mlops_constants import MLOpsConstants + +from ...constants import FEDML_TRAINING_PLATFORM_SIMULATION, FEDML_TRAINING_PLATFORM_SIMULATION_TYPE + from .mlops_metrics import MLOpsMetrics from .mlops_profiler_event import MLOpsProfilerEvent from .mlops_runtime_log import MLOpsRuntimeLog @@ -28,11 +30,13 @@ from ..distributed.communication.mqtt.mqtt_manager import MqttManager from ..distributed.communication.s3.remote_storage import S3Storage from 
...computing.scheduler.master.server_constants import ServerConstants -from ...computing.scheduler.master.server_runner import FedMLServerRunner from ...computing.scheduler.slave.client_constants import ClientConstants from ...computing.scheduler.slave.client_data_interface import FedMLClientDataInterface -from ...computing.scheduler.slave.client_runner import FedMLClientRunner -from ...constants import FEDML_TRAINING_PLATFORM_SIMULATION, FEDML_TRAINING_PLATFORM_SIMULATION_TYPE +from .mlops_utils import MLOpsUtils +from .mlops_constants import MLOpsConstants +from ...computing.scheduler.master.master_protocol_manager import FedMLLaunchMasterProtocolManager +from ...computing.scheduler.scheduler_core.account_manager import FedMLAccountManager + FEDML_MLOPS_API_RESPONSE_SUCCESS_CODE = "SUCCESS" @@ -47,6 +51,8 @@ "log_aggregation_failed_status", "log_training_failed_status", "log_endpoint_status", + "MLOpsConfigs", + "sync_deploy_id" ] @@ -95,13 +101,13 @@ def init(args, should_init_logs=True): if not mlops_parrot_enabled(args): if not hasattr(args, "config_version"): args.config_version = "release" - fetch_config(args, args.config_version) if should_init_logs: MLOpsRuntimeLog.get_instance(args).init_logs() + fetch_config(args, args.config_version) return else: if hasattr(args, "simulator_daemon"): - # Bind local device as simulation device on FedML® Nexus AI Platform + # Bind local device as simulation device on TensorOpera® Nexus AI Platform setattr(args, "using_mlops", True) setattr(args, "rank", 1) MLOpsStore.mlops_bind_result = bind_simulation_device(args, args.user) @@ -119,7 +125,7 @@ def init(args, should_init_logs=True): if project_name is None or api_key is None: raise Exception("Please check mlops_project_name and mlops_api_key params.") - # Bind local device as simulation device on FedML® Nexus AI Platform + # Bind local device as simulation device on TensorOpera® Nexus AI Platform setattr(args, "using_mlops", True) setattr(args, "rank", 1) MLOpsStore.mlops_bind_result = bind_simulation_device(args, api_key, args.config_version) @@ -137,7 +143,7 @@ def init(args, should_init_logs=True): MLOpsStore.mlops_project_id = project_id MLOpsStore.mlops_run_id = run_id if result_project is False or result_run is False: - click.echo("Failed to init project and run.") + print("Failed to init project and run.") return # Init runtime logs @@ -747,7 +753,7 @@ def push_artifact_to_s3(artifact: fedml.mlops.Artifact, version="release", show_ show_progress=show_progress, out_progress_to_err=True, progress_desc="Submitting your artifact to " - "FedML® Nexus AI Platform") + "TensorOpera® Nexus AI Platform") artifact_storage_url = str(artifact_storage_url).split("?")[0] except Exception as e: pass @@ -973,10 +979,9 @@ def _generate_log_metrics(metrics: dict, step: int = None, customized_step_key: def log_mlops_running_logs(artifact: fedml.mlops.Artifact, version=None, run_id=None, edge_id=None, only_push_artifact=False): - fedml_args = get_fedml_args() artifact_archive_zip_file, artifact_storage_url = push_artifact_to_s3( - artifact, version=version if version is not None else fedml_args.config_version, show_progress=False) + artifact, version=version if version is not None else fedml.get_env_version(), show_progress=False) if only_push_artifact: return artifact_storage_url @@ -1242,12 +1247,13 @@ def bind_simulation_device(args, userid): setattr(args, "version", version) if args.rank == 0: setattr(args, "log_file_dir", ServerConstants.get_log_file_dir()) - setattr(args, "device_id", 
FedMLServerRunner.get_device_id()) - runner = FedMLServerRunner(args) + setattr(args, "device_id", + FedMLAccountManager.get_device_id(ServerConstants.get_data_dir())) + runner = FedMLLaunchMasterProtocolManager(args) else: setattr(args, "log_file_dir", ClientConstants.get_log_file_dir()) - setattr(args, "device_id", FedMLClientRunner.get_device_id()) - runner = FedMLClientRunner(args) + setattr(args, "device_id", FedMLAccountManager.get_device_id()) + runner = FedMLSlaveProtocolManager(args) setattr(args, "config_version", version) setattr(args, "cloud_region", "") @@ -1274,8 +1280,8 @@ def bind_simulation_device(args, userid): continue if config_try_count >= 5: - click.echo("\nNote: Internet is not connected. " - "Experimental tracking results will not be synchronized to the MLOps (open.fedml.ai).\n") + logging.info("\nNote: Internet is not connected. " + "Experimental tracking results will not be synchronized to the MLOps (open.fedml.ai).\n") return False # Build unique device id @@ -1283,7 +1289,7 @@ def bind_simulation_device(args, userid): device_role = "Edge.Simulator" unique_device_id = "{}@{}.{}".format(args.device_id, args.os_name, device_role) - # Bind account id to FedML® Nexus AI Platform + # Bind account id to TensorOpera® Nexus AI Platform register_try_count = 0 edge_id = -1 while register_try_count < 5: @@ -1301,8 +1307,8 @@ def bind_simulation_device(args, userid): continue if edge_id <= 0: - click.echo("Oops, you failed to login the FedML MLOps platform.") - click.echo("Please check whether your network is normal!") + print("Oops, you failed to login the FedML MLOps platform.") + print("Please check whether your network is normal!") return False MLOpsStore.mlops_edge_id = edge_id setattr(MLOpsStore.mlops_args, "client_id", edge_id) @@ -1324,10 +1330,10 @@ def fetch_config(args, version="release"): setattr(args, "version", version) if args.rank == 0: setattr(args, "log_file_dir", ServerConstants.get_log_file_dir()) - setattr(args, "device_id", FedMLServerRunner.get_device_id()) + setattr(args, "device_id", FedMLAccountManager.get_device_id(ServerConstants.get_data_dir())) else: setattr(args, "log_file_dir", ClientConstants.get_log_file_dir()) - setattr(args, "device_id", FedMLClientRunner.get_device_id()) + setattr(args, "device_id", FedMLAccountManager.get_device_id(ClientConstants.get_data_dir())) setattr(args, "config_version", version) setattr(args, "cloud_region", "") @@ -1353,8 +1359,8 @@ def fetch_config(args, version="release"): continue if config_try_count >= 5: - click.echo("\nNote: Internet is not connected. " - "Experimental tracking results will not be synchronized to the MLOps (open.fedml.ai).\n") + logging.info("\nNote: Internet is not connected. 
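# Illustrative sketch: bind_simulation_device() and fetch_config() above both
# retry the config fetch up to five times before giving up with a "no internet"
# notice. A minimal sketch of that bounded-retry pattern; fetch_once is a
# stand-in for the real MLOpsConfigs call and the delay is a placeholder:
import logging
import time

def fetch_with_retries(fetch_once, max_tries=5, delay_seconds=0.5):
    for _ in range(max_tries):
        try:
            return fetch_once()
        except Exception:
            time.sleep(delay_seconds)
    logging.info("Note: Internet is not connected. "
                 "Experimental tracking results will not be synchronized to the MLOps.")
    return None

print(fetch_with_retries(lambda: {"mqtt_config": {}}))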
" + "Experimental tracking results will not be synchronized to the MLOps (open.fedml.ai).\n") return False diff --git a/python/fedml/core/mlops/mlops_configs.py b/python/fedml/core/mlops/mlops_configs.py index c8c6422d6c..891f721c9d 100644 --- a/python/fedml/core/mlops/mlops_configs.py +++ b/python/fedml/core/mlops/mlops_configs.py @@ -4,9 +4,11 @@ import certifi import requests +import cachetools.func import fedml from fedml.core.mlops.mlops_utils import MLOpsUtils +from urllib.parse import urlparse class Configs(Enum): @@ -41,15 +43,22 @@ def __init__(self): pass @staticmethod + @cachetools.func.ttl_cache(ttl=600) def get_request_params(): url = fedml._get_backend_service() url = f"{url}/fedmlOpsServer/configs/fetch" cert_path = None if str(url).startswith("https://"): cur_source_dir = os.path.dirname(__file__) - cert_path = os.path.join( - cur_source_dir, "ssl", "open-" + fedml.get_env_version() + ".fedml.ai_bundle.crt" - ) + version = fedml.get_env_version() + if version == "local": + cert_path = os.path.join( + cur_source_dir, "ssl", f"{urlparse(url).hostname}.{version}.crt" + ) + else: + cert_path = os.path.join( + cur_source_dir, "ssl", "open-" + fedml.get_env_version() + ".fedml.ai_bundle.crt" + ) return url, cert_path @@ -86,17 +95,30 @@ def get_cert_path_with_version(): cert_path = None if str(url).startswith("https://"): cur_source_dir = os.path.dirname(__file__) - cert_path = os.path.join( - cur_source_dir, "ssl", "open-" + version + ".fedml.ai_bundle.crt" - ) + if version == "local": + cert_path = os.path.join( + cur_source_dir, "ssl", f"{urlparse(url).hostname}.{version}.crt" + ) + else: + cert_path = os.path.join( + cur_source_dir, "ssl", "open-" + version + ".fedml.ai_bundle.crt" + ) + return cert_path @staticmethod def get_root_ca_path(): cur_source_dir = os.path.dirname(__file__) - cert_path = os.path.join( - cur_source_dir, "ssl", "open-root-ca.crt" - ) + version = fedml.get_env_version() + if version == "local": + url = fedml._get_backend_service() + cert_path = os.path.join( + cur_source_dir, "ssl", f"{urlparse(url).hostname}.{version}.rootca.crt" + ) + else: + cert_path = os.path.join( + cur_source_dir, "ssl", "open-root-ca.crt" + ) return cert_path @staticmethod @@ -115,8 +137,20 @@ def _fetch_configs(configs) -> dict: request_configs = request_configs.union(configs) json_params = {"config_name": [config.value for config in request_configs], "device_send_time": int(time.time() * 1000)} - response = MLOpsConfigs._request(request_url=url, request_json=json_params, cert_path=cert_path) - status_code = response.json().get("code") + try: + response = MLOpsConfigs._request(request_url=url, request_json=json_params, cert_path=cert_path) + except Exception as e: + print(f"Fetch configs failed due to {e} " + f"please check the network connection and try again.") + return {} + + msg_str = "" + if response: + status_code = response.json().get("code") + msg_str = response.json() + else: + status_code = "FAILED" + result = {} if status_code == "SUCCESS": data = response.json().get("data") @@ -125,7 +159,8 @@ def _fetch_configs(configs) -> dict: mlops_config = data.get(Configs.ML_OPS_CONFIG.value) MLOpsUtils.calc_ntp_from_config(mlops_config) else: - raise Exception("failed to fetch device configurations!") + raise Exception(f"failed to fetch device configs from server, with status code: {status_code} " + f"and response: {msg_str}") return result @staticmethod @@ -152,6 +187,11 @@ def fetch_all_configs(): fetched_configs[Configs.ML_OPS_CONFIG], 
fetched_configs[Configs.DOCKER_CONFIG]) + @staticmethod + def fetch_mqtt_config(): + fetched_config = MLOpsConfigs._fetch_configs({Configs.MQTT_CONFIG}) + return fetched_config[Configs.MQTT_CONFIG] + if __name__ == "__main__": fedml.set_env_version("release") diff --git a/python/fedml/core/mlops/mlops_device_perfs.py b/python/fedml/core/mlops/mlops_device_perfs.py index 728652410c..0c2bde6785 100644 --- a/python/fedml/core/mlops/mlops_device_perfs.py +++ b/python/fedml/core/mlops/mlops_device_perfs.py @@ -6,7 +6,7 @@ import uuid from os.path import expanduser -import multiprocess as multiprocessing +import multiprocessing import psutil from fedml.computing.scheduler.comm_utils import sys_utils @@ -16,6 +16,8 @@ from ...computing.scheduler.comm_utils.job_monitor import JobMonitor from ...core.distributed.communication.mqtt.mqtt_manager import MqttManager + +ROLE_DEVICE_JOB_TOTAL_MONITOR = 0 ROLE_DEVICE_INFO_REPORTER = 1 ROLE_ENDPOINT_MASTER = 2 ROLE_ENDPOINT_SLAVE = 3 @@ -39,6 +41,8 @@ def __init__(self): self.monitor_auto_scaler_process = None self.monitor_replica_num_process = None self.monitor_replica_perf_process = None + self.job_total_monitor_process = None + self.enable_job_total_monitor = False # TODO(Raphael): Enable the healthiness check by this job total monitor self.args = None self.device_id = None self.run_id = None @@ -74,55 +78,55 @@ def setup_realtime_stats_process(self, sys_args): self.device_realtime_stats_process = multiprocessing.Process( target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_DEVICE_INFO_REPORTER)) + args=(self.device_realtime_stats_event, ROLE_DEVICE_INFO_REPORTER, self.is_client)) self.device_realtime_stats_process.start() - if self.is_client: - self.monitor_endpoint_slave_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_ENDPOINT_SLAVE)) - self.monitor_endpoint_slave_process.start() - - self.monitor_endpoint_master_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_ENDPOINT_MASTER)) - self.monitor_endpoint_master_process.start() - - self.monitor_run_slave_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_RUN_SLAVE)) - self.monitor_run_slave_process.start() - - self.monitor_endpoint_logs_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_ENDPOINT_LOGS)) - self.monitor_endpoint_logs_process.start() - - # Register auto-scaler process - self.monitor_auto_scaler_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_AUTO_SCALER)) - self.monitor_auto_scaler_process.start() - - # Register replica number report channel - self.monitor_replica_num_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_ENDPOINT_REPLICA_NUM)) - self.monitor_replica_num_process.start() - - # Register replica performance report channel - self.monitor_replica_perf_process = multiprocessing.Process( + if self.enable_job_total_monitor: + self.job_total_monitor_process = multiprocessing.Process( target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_ENDPOINT_REPLICA_PERF)) - 
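# Illustrative sketch: setup_realtime_stats_process() above forks one reporter
# process per role and hands it a shared Event used as the stop signal. A
# minimal self-contained sketch of that spawn/stop pattern; the role constant
# value and sleep interval are placeholders:
import multiprocessing
import time

ROLE_DEVICE_INFO_REPORTER = 1

def report_entry(stop_event, role, is_client=False):
    while not stop_event.is_set():
        # The real reporter publishes device stats over MQTT here.
        print(f"reporting for role={role}, is_client={is_client}")
        time.sleep(1)

if __name__ == "__main__":
    stop_event = multiprocessing.Event()
    proc = multiprocessing.Process(
        target=report_entry, args=(stop_event, ROLE_DEVICE_INFO_REPORTER, True))
    proc.start()
    time.sleep(3)
    stop_event.set()
    proc.join()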
self.monitor_replica_perf_process.start() - + args=(self.device_realtime_stats_event, ROLE_DEVICE_JOB_TOTAL_MONITOR, self.is_client)) + self.job_total_monitor_process.start() else: - self.monitor_run_master_process = multiprocessing.Process( - target=perf_stats.report_device_realtime_stats_entry, - args=(self.device_realtime_stats_event, ROLE_RUN_MASTER)) - self.monitor_run_master_process.start() + if self.is_client: + self.monitor_endpoint_master_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_ENDPOINT_MASTER)) + self.monitor_endpoint_master_process.start() + + self.monitor_run_slave_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_RUN_SLAVE)) + self.monitor_run_slave_process.start() + + self.monitor_endpoint_logs_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_ENDPOINT_LOGS)) + self.monitor_endpoint_logs_process.start() + + # Register auto-scaler process + self.monitor_auto_scaler_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_AUTO_SCALER)) + self.monitor_auto_scaler_process.start() + + # Register replica number report channel + self.monitor_replica_num_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_ENDPOINT_REPLICA_NUM)) + self.monitor_replica_num_process.start() + + # Register replica performance report channel + self.monitor_replica_perf_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_ENDPOINT_REPLICA_PERF)) + self.monitor_replica_perf_process.start() + else: + self.monitor_run_master_process = multiprocessing.Process( + target=perf_stats.report_device_realtime_stats_entry, + args=(self.device_realtime_stats_event, ROLE_RUN_MASTER)) + self.monitor_run_master_process.start() - def report_device_realtime_stats_entry(self, sys_event, role): + def report_device_realtime_stats_entry(self, sys_event, role, is_client=False): # print(f"Report device realtime stats, process id {os.getpid()}") self.device_realtime_stats_event = sys_event @@ -140,8 +144,7 @@ def report_device_realtime_stats_entry(self, sys_event, role): parent_pid = psutil.Process(os.getpid()).ppid() sys_stats_obj = SysStats(process_id=parent_pid) - if role == ROLE_RUN_MASTER: - device_info_reporter = FedMLDeviceInfoReportProtocol(run_id=self.run_id, mqtt_mgr=mqtt_mgr) + device_info_reporter = FedMLDeviceInfoReportProtocol(run_id=self.run_id, mqtt_mgr=mqtt_mgr) JobMonitor.get_instance().mqtt_config = self.args.mqtt_config_path @@ -154,14 +157,25 @@ def report_device_realtime_stats_entry(self, sys_event, role): } job_monitor_obj = None - if role == ROLE_AUTO_SCALER: + if role == ROLE_AUTO_SCALER or role == ROLE_DEVICE_JOB_TOTAL_MONITOR: # job_monitor Should be initialized once job_monitor_obj = JobMonitor.get_instance() + sleep_time_interval_for_device_info = 60 + sleep_time_interval_for_client_monitor = 30 + sleep_time_interval_for_server_monitor = 60 + while not self.should_stop_device_realtime_stats(): - try: + if self.enable_job_total_monitor: + if role == ROLE_DEVICE_INFO_REPORTER: + time.sleep(sleep_time_interval_for_device_info) + elif role == ROLE_DEVICE_JOB_TOTAL_MONITOR: + 
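# Illustrative sketch: inside the reporting loop above, the sleep interval now
# depends on the role. Roughly: the device-info reporter wakes every 60 s, the
# consolidated job-total monitor every 30 s on clients and 60 s on servers, and
# the remaining roles keep their per-role intervals. Constants and table values
# below are placeholders, not the real ones:
ROLE_DEVICE_JOB_TOTAL_MONITOR = 0
ROLE_DEVICE_INFO_REPORTER = 1
ROLE_RUN_SLAVE = 4
TIME_INTERVAL_MAP = {ROLE_RUN_SLAVE: 30}

def pick_sleep_interval(role, is_client, enable_job_total_monitor):
    if enable_job_total_monitor and role == ROLE_DEVICE_INFO_REPORTER:
        return 60
    if enable_job_total_monitor and role == ROLE_DEVICE_JOB_TOTAL_MONITOR:
        return 30 if is_client else 60
    return TIME_INTERVAL_MAP.get(role, 60)

print(pick_sleep_interval(ROLE_DEVICE_JOB_TOTAL_MONITOR, True, True))  # 30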
time.sleep(sleep_time_interval_for_client_monitor if is_client + else sleep_time_interval_for_server_monitor) + else: time.sleep(time_interval_map[role]) + try: if role == ROLE_DEVICE_INFO_REPORTER: MLOpsDevicePerfStats.report_gpu_device_info(self.edge_id, mqtt_mgr=mqtt_mgr) elif role == ROLE_RUN_SLAVE: @@ -181,13 +195,23 @@ def report_device_realtime_stats_entry(self, sys_event, role): JobMonitor.get_instance().monitor_replicas_perf(self.edge_id, mqtt_mgr=mqtt_mgr) elif role == ROLE_AUTO_SCALER: job_monitor_obj.autoscaler_reconcile_after_interval() + elif role == ROLE_DEVICE_JOB_TOTAL_MONITOR: + if is_client: + JobMonitor.get_instance().monitor_slave_run_process_status() + JobMonitor.get_instance().monitor_slave_endpoint_status() + JobMonitor.get_instance().monitor_master_endpoint_status() + JobMonitor.get_instance().monitor_endpoint_logs() + JobMonitor.get_instance().monitor_replicas_number() + JobMonitor.get_instance().monitor_replicas_perf(self.edge_id, mqtt_mgr=mqtt_mgr) + job_monitor_obj.autoscaler_reconcile_after_interval() + else: + JobMonitor.get_instance().monitor_master_run_process_status( + self.edge_id, device_info_reporter=device_info_reporter) except Exception as e: logging.error(f"exception {e} when reporting device pref: {traceback.format_exc()}.") pass - time.sleep(sleep_time_interval) - if role == ROLE_DEVICE_INFO_REPORTER: self.check_fedml_client_parent_process() @@ -209,7 +233,7 @@ def report_gpu_device_info(edge_id, mqtt_mgr=None): # Do not use the following two lines as the realtime available gpu ids. # gpu_available_ids = JobRunnerUtils.get_available_gpu_id_list(edge_id) # gpu_available_ids = JobRunnerUtils.trim_unavailable_gpu_ids(gpu_available_ids) - gpu_cores_available = len(gpu_available_ids) + gpu_cores_available = len(gpu_available_ids) if gpu_available_ids is not None else 0 deploy_worker_id_list = list() try: deploy_worker_id_list = json.loads(os.environ.get("FEDML_DEPLOY_WORKER_IDS", "[]")) @@ -241,6 +265,7 @@ def check_fedml_client_parent_process(self): if not self.is_client: return + # inspection PyBroadException try: home_dir = expanduser("~") fedml_ppids_dir = os.path.join(home_dir, ".fedml", "fedml-client", "fedml", "data", "ppids") @@ -262,13 +287,14 @@ def check_fedml_client_parent_process(self): print(f"Parent client process {file_list} has been killed, so fedml will exit.") logging.info(f"Parent client process {file_list} has been killed, so fedml will exit.") os.system("fedml logout") - except Exception as e: + except Exception: pass def check_fedml_server_parent_process(self): if self.is_client: return + # inspection PyBroadException try: home_dir = expanduser("~") fedml_ppids_dir = os.path.join(home_dir, ".fedml", "fedml-server", "fedml", "data", "ppids") @@ -290,5 +316,5 @@ def check_fedml_server_parent_process(self): print(f"Parent server process {file_list} has been killed, so fedml will exit.") logging.info(f"Parent server process {file_list} has been killed, so fedml will exit.") os.system("fedml logout -s") - except Exception as e: + except Exception: pass diff --git a/python/fedml/core/mlops/mlops_metrics.py b/python/fedml/core/mlops/mlops_metrics.py index b24025c8f9..04a1b4a207 100644 --- a/python/fedml/core/mlops/mlops_metrics.py +++ b/python/fedml/core/mlops/mlops_metrics.py @@ -7,7 +7,8 @@ import requests import fedml -from . 
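# Illustrative sketch: check_fedml_client_parent_process() above watches the
# directory of recorded parent PIDs and logs the client out once none of them is
# alive. The exact liveness check is not shown in the hunk, so the helper below
# is an assumption-level approximation of it:
import os
import psutil

def any_parent_alive(ppid_dir: str) -> bool:
    if not os.path.isdir(ppid_dir):
        return True  # nothing recorded yet; assume we were started directly
    pids = [int(name) for name in os.listdir(ppid_dir) if name.isdigit()]
    return any(psutil.pid_exists(pid) for pid in pids)

ppid_dir = os.path.expanduser("~/.fedml/fedml-client/fedml/data/ppids")
if not any_parent_alive(ppid_dir):
    print("parent process gone; the real code runs `fedml logout` here")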
import MLOpsConfigs +from .mlops_utils import MLOpsUtils +from .mlops_configs import MLOpsConfigs from .mlops_device_perfs import MLOpsDevicePerfStats from .mlops_job_perfs import MLOpsJobPerfStats from ...computing.scheduler.master.server_constants import ServerConstants @@ -16,18 +17,12 @@ class MLOpsMetrics(object): - def __new__(cls, *args, **kw): - if not hasattr(cls, "_instance"): - orig = super(MLOpsMetrics, cls) - cls._instance = orig.__new__(cls, *args, **kw) - cls._instance.init() - return cls._instance - def __init__(self): - pass + self.init() def init(self): self.messenger = None + self.send_message_func = None self.args = None self.run_id = None self.edge_id = None @@ -38,8 +33,9 @@ def init(self): self.job_perfs = MLOpsJobPerfStats() self.device_perfs = MLOpsDevicePerfStats() - def set_messenger(self, msg_messenger, args=None): + def set_messenger(self, msg_messenger, args=None, send_message_func=None): self.messenger = msg_messenger + self.send_message_func = send_message_func if args is not None: self.args = args self.run_id = args.run_id @@ -72,15 +68,17 @@ def comm_sanity_check(self): else: return True - def report_client_training_status(self, edge_id, status, running_json=None, is_from_model=False, run_id=0): + def report_client_training_status(self, edge_id, status, running_json=None, + is_from_model=False, run_id=0, update_db=True): self.common_report_client_training_status(edge_id, status, run_id=run_id) - if is_from_model: - from ...computing.scheduler.model_scheduler.device_client_data_interface import FedMLClientDataInterface - FedMLClientDataInterface.get_instance().save_job(run_id, edge_id, status, running_json) - else: - from ...computing.scheduler.slave.client_data_interface import FedMLClientDataInterface - FedMLClientDataInterface.get_instance().save_job(run_id, edge_id, status, running_json) + if update_db: + if is_from_model: + from ...computing.scheduler.model_scheduler.device_client_data_interface import FedMLClientDataInterface + FedMLClientDataInterface.get_instance().save_job(run_id, edge_id, status, running_json) + else: + from ...computing.scheduler.slave.client_data_interface import FedMLClientDataInterface + FedMLClientDataInterface.get_instance().save_job(run_id, edge_id, status, running_json) def report_client_device_status_to_web_ui(self, edge_id, status, run_id=0): """ @@ -94,7 +92,7 @@ def report_client_device_status_to_web_ui(self, edge_id, status, run_id=0): message_json = json.dumps(msg) logging.info("report_client_device_status. message_json = %s" % message_json) MLOpsStatus.get_instance().set_client_status(edge_id, status) - self.messenger.send_message_json(topic_name, message_json) + self.send_message(topic_name, message_json) def common_report_client_training_status(self, edge_id, status, run_id=0): # if not self.comm_sanity_check(): @@ -109,7 +107,7 @@ def common_report_client_training_status(self, edge_id, status, run_id=0): message_json = json.dumps(msg) logging.info("report_client_training_status. 
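# Illustrative sketch: MLOpsMetrics is no longer a singleton, and set_messenger()
# can now inject a plain send function; every report_* helper then routes through
# a single dispatch point that prefers the injected function and falls back to
# the messenger (see send_message() at the end of this file's diff). A minimal
# sketch of that pattern with a hypothetical sender class:
import json

class MetricsSender:
    def __init__(self):
        self.messenger = None
        self.send_message_func = None

    def set_messenger(self, messenger=None, send_message_func=None):
        self.messenger = messenger
        self.send_message_func = send_message_func

    def send_message(self, topic, payload):
        if self.send_message_func is not None:
            self.send_message_func(topic, payload)
        elif self.messenger is not None:
            self.messenger.send_message_json(topic, payload)

sender = MetricsSender()
sender.set_messenger(send_message_func=lambda t, p: print(f"{t}: {p}"))
sender.send_message("fl_client/mlops/status", json.dumps({"status": "RUNNING"}))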
message_json = %s" % message_json) MLOpsStatus.get_instance().set_client_status(edge_id, status) - self.messenger.send_message_json(topic_name, message_json) + self.send_message(topic_name, message_json) def broadcast_client_training_status(self, edge_id, status, is_from_model=False, run_id=0): # if not self.comm_sanity_check(): @@ -137,14 +135,14 @@ def common_broadcast_client_training_status(self, edge_id, status, run_id=0): msg = {"edge_id": edge_id, "run_id": run_id, "status": status} message_json = json.dumps(msg) logging.info("broadcast_client_training_status. message_json = %s" % message_json) - self.messenger.send_message_json(topic_name, message_json) + self.send_message(topic_name, message_json) def client_send_exit_train_msg(self, run_id, edge_id, status, msg=None): topic_exit_train_with_exception = "flserver_agent/" + str(run_id) + "/client_exit_train_with_exception" msg = {"run_id": run_id, "edge_id": edge_id, "status": status, "msg": msg if msg is not None else ""} message_json = json.dumps(msg) logging.info("client_send_exit_train_msg.") - self.messenger.send_message_json(topic_exit_train_with_exception, message_json) + self.send_message(topic_exit_train_with_exception, message_json) def report_client_id_status(self, edge_id, status, running_json=None, is_from_model=False, server_id="0", run_id=0, msg=""): @@ -172,19 +170,30 @@ def common_report_client_id_status(self, run_id, edge_id, status, server_id="0", msg = {"run_id": run_id, "edge_id": edge_id, "status": status, "server_id": server_id, "msg": msg} message_json = json.dumps(msg) # logging.info("report_client_id_status. message_json = %s" % message_json) - self.messenger.send_message_json(topic_name, message_json) + self.send_message(topic_name, message_json) - def report_server_training_status(self, run_id, status, edge_id=0, role=None, running_json=None, is_from_model=False): + def report_server_training_status(self, run_id, status, edge_id=0, role=None, + running_json=None, is_from_model=False, update_db=True): # if not self.comm_sanity_check(): # return self.common_report_server_training_status(run_id, status, role=role, edge_id=edge_id) - if is_from_model: - from ...computing.scheduler.model_scheduler.device_server_data_interface import FedMLServerDataInterface - FedMLServerDataInterface.get_instance().save_job(run_id, self.edge_id, status, running_json) - else: - from ...computing.scheduler.master.server_data_interface import FedMLServerDataInterface - FedMLServerDataInterface.get_instance().save_job(run_id, self.edge_id, status, running_json) + if update_db: + if is_from_model: + from ...computing.scheduler.model_scheduler.device_server_data_interface import FedMLServerDataInterface + FedMLServerDataInterface.get_instance().save_job(run_id, self.edge_id, status, running_json) + else: + from ...computing.scheduler.master.server_data_interface import FedMLServerDataInterface + FedMLServerDataInterface.get_instance().save_job(run_id, self.edge_id, status, running_json) + + def report_job_status(self, run_id, status, master_id=None): + topic_name = f"master_agent/slave_agent/job_status/{run_id}" + payload = {"run_id": run_id, "status": status, "fedml_version": fedml.__version__} + if master_id is not None: + payload["master_agent"] = master_id + + message_json = json.dumps(payload) + self.send_message(topic_name, message_json) def report_server_device_status_to_web_ui(self, run_id, status, edge_id=0, role=None): """ @@ -206,7 +215,7 @@ def report_server_device_status_to_web_ui(self, run_id, status, edge_id=0, role= # 
logging.info("report_server_device_status. msg = %s" % msg) message_json = json.dumps(msg) MLOpsStatus.get_instance().set_server_status(self.edge_id, status) - self.messenger.send_message_json(topic_name, message_json) + self.send_message(topic_name, message_json) def common_report_server_training_status(self, run_id, status, role=None, edge_id=0): # if not self.comm_sanity_check(): @@ -215,6 +224,7 @@ def common_report_server_training_status(self, run_id, status, role=None, edge_i if role is None: role = "normal" msg = { + "timestamp": MLOpsUtils.get_ntp_time(), "run_id": run_id, "edge_id": edge_id, "status": status, @@ -223,7 +233,7 @@ def common_report_server_training_status(self, run_id, status, role=None, edge_i # logging.info("report_server_training_status. msg = %s" % msg) message_json = json.dumps(msg) MLOpsStatus.get_instance().set_server_status(self.edge_id, status) - self.messenger.send_message_json(topic_name, message_json) + self.send_message(topic_name, message_json) def broadcast_server_training_status(self, run_id, status, role=None, is_from_model=False, edge_id=None): if self.messenger is None: @@ -239,7 +249,7 @@ def broadcast_server_training_status(self, run_id, status, role=None, is_from_mo } logging.info("broadcast_server_training_status. msg = %s" % msg) message_json = json.dumps(msg) - self.messenger.send_message_json(topic_name, message_json) + self.send_message(topic_name, message_json) if is_from_model: from ...computing.scheduler.model_scheduler.device_server_data_interface import FedMLServerDataInterface @@ -248,19 +258,30 @@ def broadcast_server_training_status(self, run_id, status, role=None, is_from_mo from ...computing.scheduler.master.server_data_interface import FedMLServerDataInterface FedMLServerDataInterface.get_instance().save_job(run_id, self.edge_id, status) - def report_server_id_status(self, run_id, status, edge_id=None, server_id=None, server_agent_id=None): + def report_server_id_status(self, run_id, status, edge_id=None, server_id=None, server_agent_id=None, + is_from_model=False, running_json=None, update_db=True): # if not self.comm_sanity_check(): # return topic_name = "fl_server/flserver_agent_" + str(server_agent_id if server_agent_id is not None else self.server_agent_id) + "/status" - msg = {"run_id": run_id, "edge_id": edge_id if edge_id is not None else self.edge_id, "status": status} + in_edge_id = edge_id if edge_id is not None else self.edge_id + msg = {"run_id": run_id, "edge_id": in_edge_id, + "status": status, "is_from_model": is_from_model} if server_id is not None: msg["server_id"] = server_id message_json = json.dumps(msg) logging.info(f"report_server_id_status; topic_name: {topic_name}, msg: {msg}") # logging.info("report_server_id_status server id {}".format(server_agent_id)) # logging.info("report_server_id_status. 
message_json = %s" % message_json) - self.messenger.send_message_json(topic_name, message_json) + self.send_message(topic_name, message_json) + + if update_db: + if is_from_model: + from ...computing.scheduler.model_scheduler.device_server_data_interface import FedMLServerDataInterface + FedMLServerDataInterface.get_instance().save_job(run_id, in_edge_id, status, running_json) + else: + from ...computing.scheduler.master.server_data_interface import FedMLServerDataInterface + FedMLServerDataInterface.get_instance().save_job(run_id, in_edge_id, status, running_json) def report_client_training_metric(self, metric_json): # if not self.comm_sanity_check(): @@ -268,7 +289,7 @@ def report_client_training_metric(self, metric_json): topic_name = "fl_client/mlops/training_metrics" logging.info("report_client_training_metric. message_json = %s" % metric_json) message_json = json.dumps(metric_json) - self.messenger.send_message_json(topic_name, message_json) + self.send_message(topic_name, message_json) def report_server_training_metric(self, metric_json, payload=None): # if not self.comm_sanity_check(): @@ -279,7 +300,7 @@ def report_server_training_metric(self, metric_json, payload=None): else: message_json = json.dumps(metric_json) # logging.info("report_server_training_metric. message_json = %s" % metric_json) - self.messenger.send_message_json(topic_name, message_json) + self.send_message(topic_name, message_json) def report_endpoint_metric(self, metric_json, payload=None): # if not self.comm_sanity_check(): @@ -290,7 +311,7 @@ def report_endpoint_metric(self, metric_json, payload=None): else: message_json = json.dumps(metric_json) # logging.info("report_endpoint_metric. message_json = %s" % metric_json) - self.messenger.send_message_json(topic_name, message_json) + self.send_message(topic_name, message_json) def report_fedml_train_metric(self, metric_json, run_id=0, is_endpoint=False): # if not self.comm_sanity_check(): @@ -299,42 +320,42 @@ def report_fedml_train_metric(self, metric_json, run_id=0, is_endpoint=False): logging.info("report_fedml_train_metric. 
message_json = %s" % metric_json) metric_json["is_endpoint"] = is_endpoint message_json = json.dumps(metric_json) - self.messenger.send_message_json(topic_name, message_json) + self.send_message(topic_name, message_json) def report_fedml_run_logs(self, logs_json, run_id=0): # if not self.comm_sanity_check(): # return topic_name = f"fedml_slave/fedml_master/logs/{run_id}" message_json = json.dumps(logs_json) - self.messenger.send_message_json(topic_name, message_json) + self.send_message(topic_name, message_json) def report_server_training_round_info(self, round_info): # if not self.comm_sanity_check(): # return topic_name = "fl_server/mlops/training_roundx" message_json = json.dumps(round_info) - self.messenger.send_message_json(topic_name, message_json) + self.send_message(topic_name, message_json) def report_client_model_info(self, model_info_json): # if not self.comm_sanity_check(): # return topic_name = "fl_server/mlops/client_model" message_json = json.dumps(model_info_json) - self.messenger.send_message_json(topic_name, message_json) + self.send_message(topic_name, message_json) def report_aggregated_model_info(self, model_info_json): # if not self.comm_sanity_check(): # return topic_name = "fl_server/mlops/global_aggregated_model" message_json = json.dumps(model_info_json) - self.messenger.send_message_json(topic_name, message_json) + self.send_message(topic_name, message_json) def report_training_model_net_info(self, model_net_info_json): # if not self.comm_sanity_check(): # return topic_name = "fl_server/mlops/training_model_net" message_json = json.dumps(model_net_info_json) - self.messenger.send_message_json(topic_name, message_json) + self.send_message(topic_name, message_json) def report_llm_record(self, metric_json): # if not self.comm_sanity_check(): @@ -342,7 +363,7 @@ def report_llm_record(self, metric_json): topic_name = "model_serving/mlops/llm_input_output_record" logging.info("report_llm_record. message_json = %s" % metric_json) message_json = json.dumps(metric_json) - self.messenger.send_message_json(topic_name, message_json) + self.send_message(topic_name, message_json) def report_edge_job_computing_cost(self, job_id, edge_id, computing_started_time, computing_ended_time, @@ -359,7 +380,7 @@ def report_edge_job_computing_cost(self, job_id, edge_id, "computing_ended_time": computing_ended_time, "duration": duration, "user_id": user_id, "api_key": api_key} message_json = json.dumps(msg) - self.messenger.send_message_json(topic_name, message_json) + self.send_message(topic_name, message_json) # logging.info("report_job_computing_cost. message_json = %s" % message_json) def report_logs_updated(self, run_id): @@ -369,7 +390,7 @@ def report_logs_updated(self, run_id): msg = {"time": time.time()} message_json = json.dumps(msg) logging.info("report_logs_updated. 
message_json = %s" % message_json) - self.messenger.send_message_json(topic_name, message_json) + self.send_message(topic_name, message_json) def report_artifact_info(self, job_id, edge_id, artifact_name, artifact_type, artifact_local_path, artifact_url, @@ -388,7 +409,7 @@ def report_artifact_info(self, job_id, edge_id, artifact_name, artifact_type, "timestamp": timestamp } message_json = json.dumps(artifact_info_json) - self.messenger.send_message_json(topic_name, message_json) + self.send_message(topic_name, message_json) def report_endpoint_status(self, end_point_id, model_status, timestamp=None, end_point_name="", model_name="", model_inference_url=""): @@ -401,8 +422,8 @@ def report_endpoint_status(self, end_point_id, model_status, timestamp=None, "model_status": model_status, "timestamp": int(format(time_param, '.0f'))} - self.messenger.send_message_json(deployment_status_topic, json.dumps(deployment_status_payload)) - self.messenger.send_message_json(deployment_status_topic_prefix, json.dumps(deployment_status_payload)) + self.send_message(deployment_status_topic, json.dumps(deployment_status_payload)) + self.send_message(deployment_status_topic_prefix, json.dumps(deployment_status_payload)) def report_run_log( self, run_id, device_id, log_list, log_source=None, use_mqtt=False @@ -480,4 +501,10 @@ def stop_device_realtime_perf(self): self.device_perfs.stop_device_realtime_stats() def report_json_message(self, topic, payload): - self.messenger.send_message_json(topic, payload) \ No newline at end of file + self.send_message(topic, payload) + + def send_message(self, topic, payload): + if self.send_message_func is not None: + self.send_message_func(topic, payload) + elif self.messenger is not None: + self.messenger.send_message_json(topic, payload) \ No newline at end of file diff --git a/python/fedml/core/mlops/mlops_runtime_log.py b/python/fedml/core/mlops/mlops_runtime_log.py index 6992c44555..0fc5db3d23 100644 --- a/python/fedml/core/mlops/mlops_runtime_log.py +++ b/python/fedml/core/mlops/mlops_runtime_log.py @@ -5,6 +5,7 @@ import sys import threading import time +import shutil from logging.handlers import TimedRotatingFileHandler from fedml import mlops @@ -12,19 +13,19 @@ LOG_LEVEL = logging.INFO ROTATION_FREQUENCY = 'D' +# when rollover is done, no more than backupCount files are kept - the oldest ones are deleted. BACKUP_COUNT = 100 class MLOpsFileHandler(TimedRotatingFileHandler): def __init__(self, run_id, edge_id, log_config_file, filepath): - super(MLOpsFileHandler, self).__init__(filename=filepath, when=ROTATION_FREQUENCY, backupCount=BACKUP_COUNT, - encoding='utf-8') + super().__init__(filename=filepath, when=ROTATION_FREQUENCY, + backupCount=BACKUP_COUNT,encoding='utf-8') self.run_id = run_id self.edge_id = edge_id self.file_path = filepath self.rotate_count = 0 - self.backupCount = BACKUP_COUNT self.rotator: callable = self.update_config_and_rotate self.log_config_file = log_config_file self.__initialize_config() @@ -32,17 +33,26 @@ def __init__(self, run_id, edge_id, log_config_file, filepath): def update_config_and_rotate(self, source, dest): # source = current log file name # dest = log file name (dated) - if os.path.exists(source): - os.rename(source, dest) MLOpsLoggingUtils.acquire_lock() - config_data = MLOpsLoggingUtils.load_log_config(self.run_id, self.edge_id, self.log_config_file) + + # Check if the source and destination files exist. 
If it does, return + if os.path.exists(source): + # Copy the contents of the source file to the destination file + shutil.copy(source, dest) + # Clear everything in the source file + with open(source, 'w') as src_file: + src_file.truncate(0) + src_file.close() + + config_data = MLOpsLoggingUtils.load_log_config(self.run_id, self.edge_id, + self.log_config_file) # Update file name of current log file config_data[self.rotate_count].file_path = dest self.rotate_count += 1 # Store the rotate count, and corresponding log file name in the config file - rotated_log_file = LogFile(file_path=source, uploaded_file_index=self.backupCount) + rotated_log_file = LogFile(file_path=source) config_data[self.rotate_count] = rotated_log_file MLOpsLoggingUtils.save_log_config(run_id=self.run_id, device_id=self.edge_id, log_config_file=self.log_config_file, @@ -133,6 +143,8 @@ def __init__(self, args): self.should_write_log_file = args.using_mlops else: self.should_write_log_file = False + if not hasattr(args, "log_file_dir"): + setattr(args, "log_file_dir", "./logs") self.log_file_dir = args.log_file_dir self.log_file = None self.run_id = args.run_id diff --git a/python/fedml/core/mlops/mlops_runtime_log_daemon.py b/python/fedml/core/mlops/mlops_runtime_log_daemon.py index 64bd982ae3..ff06dc91b3 100644 --- a/python/fedml/core/mlops/mlops_runtime_log_daemon.py +++ b/python/fedml/core/mlops/mlops_runtime_log_daemon.py @@ -223,23 +223,17 @@ def __upload(self, log_upload_request) -> bool: if cert_path is not None: try: requests.session().verify = cert_path - # logging.info(f"FedMLDebug POST log to server. run_id {run_id}, device_id {device_id}") response = requests.post( self.log_server_url, json=log_upload_request, verify=True, headers=log_headers ) - # logging.info(f"FedMLDebug POST log to server run_id {run_id}, device_id {device_id}. response.status_code: {response.status_code}") except requests.exceptions.SSLError as err: MLOpsConfigs.install_root_ca_file() - # logging.info(f"FedMLDebug POST log to server. run_id {run_id}, device_id {device_id}") response = requests.post( self.log_server_url, json=log_upload_request, verify=True, headers=log_headers ) - # logging.info(f"FedMLDebug POST log to server run_id {run_id}, device_id {device_id}. response.status_code: {response.status_code}") else: - # logging.info(f"FedMLDebug POST log to server. run_id {run_id}, device_id {device_id}") response = requests.post(self.log_server_url, headers=log_headers, json=log_upload_request) - # logging.info(f"FedMLDebug POST log to server. run_id {run_id}, device_id {device_id}. response.status_code: {response.status_code}") if response.status_code != 200: logging.error(f"Failed to upload log to server. run_id {self.run_id}, device_id {self.device_id}. 
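# Illustrative sketch: update_config_and_rotate() above now copies the live log
# file to the dated rotation target and truncates the original in place instead
# of renaming it, so the handler keeps writing to the same file. A minimal
# sketch of that copy-and-truncate step:
import shutil

def rotate_in_place(source: str, dest: str) -> None:
    # Copy the current contents to the rotated (dated) file ...
    shutil.copy(source, dest)
    # ... then empty the live file without replacing it.
    with open(source, "w") as src_file:
        src_file.truncate(0)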
" f"response.status_code: {response.status_code}") @@ -404,9 +398,9 @@ def __new__(cls, *args, **kwargs): def __init__(self, in_args): self.args = in_args self.edge_id = MLOpsLoggingUtils.get_edge_id_from_args(self.args) + url = fedml._get_backend_service() try: if self.args.log_server_url is None or self.args.log_server_url == "": - url = fedml._get_backend_service() self.log_server_url = f"{url}/fedmlLogsServer/logs/update" else: self.log_server_url = self.args.log_server_url diff --git a/python/fedml/core/mlops/mlops_utils.py b/python/fedml/core/mlops/mlops_utils.py index 7313141550..8bde9e4299 100644 --- a/python/fedml/core/mlops/mlops_utils.py +++ b/python/fedml/core/mlops/mlops_utils.py @@ -128,15 +128,17 @@ def get_program_prefix(args, edge_id): @staticmethod def get_edge_id_from_args(args): if args.role == "server": - if hasattr(args, "server_id"): + # Considering that 0 is a valid value, we need to ensure it is not None rather than solely checking + # for truthiness + if getattr(args, "server_id", None) is not None: edge_id = args.server_id else: - if hasattr(args, "edge_id"): + if getattr(args, "edge_id", None) is not None: edge_id = args.edge_id else: edge_id = 0 else: - if hasattr(args, "client_id"): + if getattr(args, "client_id", None) is not None: edge_id = args.client_id elif hasattr(args, "client_id_list"): if args.client_id_list is None: @@ -148,10 +150,11 @@ def get_edge_id_from_args(args): else: edge_id = 0 else: - if hasattr(args, "edge_id"): + if getattr(args, "edge_id", None) is not None: edge_id = args.edge_id else: edge_id = 0 + return edge_id @staticmethod diff --git a/python/fedml/core/mlops/ssl/open.chainopera.ai.local.crt b/python/fedml/core/mlops/ssl/open.chainopera.ai.local.crt new file mode 100644 index 0000000000..400c30aaa0 --- /dev/null +++ b/python/fedml/core/mlops/ssl/open.chainopera.ai.local.crt @@ -0,0 +1,63 @@ +-----BEGIN CERTIFICATE----- +MIIF8zCCBFugAwIBAgIQKyZVUxZMMiOwsUN8tL5DgjANBgkqhkiG9w0BAQwFADBZ +MQswCQYDVQQGEwJDTjElMCMGA1UEChMcVHJ1c3RBc2lhIFRlY2hub2xvZ2llcywg +SW5jLjEjMCEGA1UEAxMaVHJ1c3RBc2lhIFJTQSBEViBUTFMgQ0EgRzIwHhcNMjQw +OTA5MDAwMDAwWhcNMjQxMjA4MjM1OTU5WjAdMRswGQYDVQQDExJvcGVuLmNoYWlu +b3BlcmEuYWkwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDLX+3ww3v2 +rTn+rvoqkjKPRwhtHtgYjaSRzoovPiiS9QrC/sTHSVAIR/FrftD+MOWn4GmyMBjv +lYkLi5qT0e5jLQwVQeRHgGZmuBHuNbk4RDzRI1bM2HmO7Lgv6u1Ce0K3CE5U2A5r +4a40KJFqhJlV6TEOu0XkxMMZ+l1l/rr/1MZDioYP9Bvwq09hvVaokHhrbirhTSYF +JkvnjaXu1e2Lq7c+7vMphr5AK+H3lT6Ct7PBZw0Yby1AX6EzMbjpqlU+fRwUuOeg +NJAzSWw9U4lCwW3H3JptZyvjbbm+4V/TZfl3q8G2JUJFkEEE6M3IeL4DIWaf9xAs +dzFOHPAxuWzRAgMBAAGjggJxMIICbTAfBgNVHSMEGDAWgBRfOnwREH4MZ3Fh3Iuj +tQADZ/VXHDAdBgNVHQ4EFgQUl+bxD+UL1hxiISSDQA8N2RsEd24wDgYDVR0PAQH/ +BAQDAgWgMAwGA1UdEwEB/wQCMAAwHQYDVR0lBBYwFAYIKwYBBQUHAwEGCCsGAQUF +BwMCMEkGA1UdIARCMEAwNAYLKwYBBAGyMQECAjEwJTAjBggrBgEFBQcCARYXaHR0 +cHM6Ly9zZWN0aWdvLmNvbS9DUFMwCAYGZ4EMAQIBMH0GCCsGAQUFBwEBBHEwbzBC +BggrBgEFBQcwAoY2aHR0cDovL2NydC50cnVzdC1wcm92aWRlci5jbi9UcnVzdEFz +aWFSU0FEVlRMU0NBRzIuY3J0MCkGCCsGAQUFBzABhh1odHRwOi8vb2NzcC50cnVz +dC1wcm92aWRlci5jbjCCAQMGCisGAQQB1nkCBAIEgfQEgfEA7wB1AHb/iD8KtvuV +UcJhzPWHujS0pM27KdxoQgqf5mdMWjp0AAABkdUkleAAAAQDAEYwRAIgLnpFW+eZ +M0ueXvdQpXsBRoWQUrL+UdD9gFSoH140GR4CICq/zgGD7Nwwehb3BpdpVLr0sfA3 +9i39Bm11r0yipqvGAHYAPxdLT9ciR1iUHWUchL4NEu2QN38fhWrrwb8ohez4ZG4A +AAGR1SSVwgAABAMARzBFAiEAnR+qJq2xcMYoBG4J6xJwhX+a/WoBSN0AVs7EGEv2 +WxgCIF38/4vZJ6Hf+5R/j1SH/XRuFoiV9/dU1dqHn/2C78bqMB0GA1UdEQQWMBSC +Em9wZW4uY2hhaW5vcGVyYS5haTANBgkqhkiG9w0BAQwFAAOCAYEAjjqLYm/6hx4s 
+ZSbPvyCsQ2KbjjX50aKHhk+/rkcGBtwwfDqF926/pW04dQ7/YiA+8eQGcg8yORSB +YSPoJjKzj72dt0KfccVAvido/2OnFBzDGqSPYXEpsaC//zioztao5DxWHPM8BtMo +nkav7slLkCBAtH1B8P50usY3b2k4JoIzPSowMxyfHeCMyzW90X9AhegPl/3SVTaJ +ec8l/oLpmhYWKaX8QkDfdBL2ceOVj2QDVNmdvy4UNKD/ZFedL/rZAETSx6H2bgGZ +PukL0gXSCaEydi33cKi46ExogHkdqp9nmay9un20ZESbOH9o3fth7EtzlK8s98tG +uiqxm+Gq4rSU2mS1zxaHQsKANBN52LdA86TZPxE48jPtvTMXbZhHujJ3OIqwOwIU +yA0p72D0uXLlRtusun3xq3vAcG7L4n8qLu601oJPkOd2asFYWCXI3D33bpPhLW2g +Ds2c6MGRaNxrmYx90fzyudF7w40AX9PMgBLWKuuX+qiEpab9MHhI +-----END CERTIFICATE----- +-----BEGIN CERTIFICATE----- +MIIFBzCCA++gAwIBAgIRALIM7VUuMaC/NDp1KHQ76aswDQYJKoZIhvcNAQELBQAw +ezELMAkGA1UEBhMCR0IxGzAZBgNVBAgMEkdyZWF0ZXIgTWFuY2hlc3RlcjEQMA4G +A1UEBwwHU2FsZm9yZDEaMBgGA1UECgwRQ29tb2RvIENBIExpbWl0ZWQxITAfBgNV +BAMMGEFBQSBDZXJ0aWZpY2F0ZSBTZXJ2aWNlczAeFw0yMjAxMTAwMDAwMDBaFw0y +ODEyMzEyMzU5NTlaMFkxCzAJBgNVBAYTAkNOMSUwIwYDVQQKExxUcnVzdEFzaWEg +VGVjaG5vbG9naWVzLCBJbmMuMSMwIQYDVQQDExpUcnVzdEFzaWEgUlNBIERWIFRM +UyBDQSBHMjCCAaIwDQYJKoZIhvcNAQEBBQADggGPADCCAYoCggGBAKjGDe0GSaBs +Yl/VhMaTM6GhfR1TAt4mrhN8zfAMwEfLZth+N2ie5ULbW8YvSGzhqkDhGgSBlafm +qq05oeESrIJQyz24j7icGeGyIZ/jIChOOvjt4M8EVi3O0Se7E6RAgVYcX+QWVp5c +Sy+l7XrrtL/pDDL9Bngnq/DVfjCzm5ZYUb1PpyvYTP7trsV+yYOCNmmwQvB4yVjf +IIpHC1OcsPBntMUGeH1Eja4D+qJYhGOxX9kpa+2wTCW06L8T6OhkpJWYn5JYiht5 +8exjAR7b8Zi3DeG9oZO5o6Qvhl3f8uGU8lK1j9jCUN/18mI/5vZJ76i+hsgdlfZB +Rh5lmAQjD80M9TY+oD4MYUqB5XrigPfFAUwXFGehhlwCVw7y6+5kpbq/NpvM5Ba8 +SeQYUUuMA8RXpTtGlrrTPqJryfa55hTuX/ThhX4gcCVkbyujo0CYr+Uuc14IOyNY +1fD0/qORbllbgV41wiy/2ZUWZQUodqHWkjT1CwIMbQOY5jmrSYGBwwIDAQABo4IB +JjCCASIwHwYDVR0jBBgwFoAUoBEKIz6W8Qfs4q8p74Klf9AwpLQwHQYDVR0OBBYE +FF86fBEQfgxncWHci6O1AANn9VccMA4GA1UdDwEB/wQEAwIBhjASBgNVHRMBAf8E +CDAGAQH/AgEAMB0GA1UdJQQWMBQGCCsGAQUFBwMBBggrBgEFBQcDAjAiBgNVHSAE +GzAZMA0GCysGAQQBsjEBAgIxMAgGBmeBDAECATBDBgNVHR8EPDA6MDigNqA0hjJo +dHRwOi8vY3JsLmNvbW9kb2NhLmNvbS9BQUFDZXJ0aWZpY2F0ZVNlcnZpY2VzLmNy +bDA0BggrBgEFBQcBAQQoMCYwJAYIKwYBBQUHMAGGGGh0dHA6Ly9vY3NwLmNvbW9k +b2NhLmNvbTANBgkqhkiG9w0BAQsFAAOCAQEAHMUom5cxIje2IiFU7mOCsBr2F6CY +eU5cyfQ/Aep9kAXYUDuWsaT85721JxeXFYkf4D/cgNd9+hxT8ZeDOJrn+ysqR7NO +2K9AdqTdIY2uZPKmvgHOkvH2gQD6jc05eSPOwdY/10IPvmpgUKaGOa/tyygL8Og4 +3tYyoHipMMnS4OiYKakDJny0XVuchIP7ZMKiP07Q3FIuSS4omzR77kmc75/6Q9dP +v4wa90UCOn1j6r7WhMmX3eT3Gsdj3WMe9bYD0AFuqa6MDyjIeXq08mVGraXiw73s +Zale8OMckn/BU3O/3aFNLHLfET2H2hT6Wb3nwxjpLIfXmSVcVd8A58XH0g== +-----END CERTIFICATE----- \ No newline at end of file diff --git a/python/fedml/core/mlops/ssl/open.chainopera.ai.local.rootca.crt b/python/fedml/core/mlops/ssl/open.chainopera.ai.local.rootca.crt new file mode 100644 index 0000000000..e87d0c5783 --- /dev/null +++ b/python/fedml/core/mlops/ssl/open.chainopera.ai.local.rootca.crt @@ -0,0 +1,29 @@ +-----BEGIN CERTIFICATE----- +MIIFBzCCA++gAwIBAgIRALIM7VUuMaC/NDp1KHQ76aswDQYJKoZIhvcNAQELBQAw +ezELMAkGA1UEBhMCR0IxGzAZBgNVBAgMEkdyZWF0ZXIgTWFuY2hlc3RlcjEQMA4G +A1UEBwwHU2FsZm9yZDEaMBgGA1UECgwRQ29tb2RvIENBIExpbWl0ZWQxITAfBgNV +BAMMGEFBQSBDZXJ0aWZpY2F0ZSBTZXJ2aWNlczAeFw0yMjAxMTAwMDAwMDBaFw0y +ODEyMzEyMzU5NTlaMFkxCzAJBgNVBAYTAkNOMSUwIwYDVQQKExxUcnVzdEFzaWEg +VGVjaG5vbG9naWVzLCBJbmMuMSMwIQYDVQQDExpUcnVzdEFzaWEgUlNBIERWIFRM +UyBDQSBHMjCCAaIwDQYJKoZIhvcNAQEBBQADggGPADCCAYoCggGBAKjGDe0GSaBs +Yl/VhMaTM6GhfR1TAt4mrhN8zfAMwEfLZth+N2ie5ULbW8YvSGzhqkDhGgSBlafm +qq05oeESrIJQyz24j7icGeGyIZ/jIChOOvjt4M8EVi3O0Se7E6RAgVYcX+QWVp5c +Sy+l7XrrtL/pDDL9Bngnq/DVfjCzm5ZYUb1PpyvYTP7trsV+yYOCNmmwQvB4yVjf +IIpHC1OcsPBntMUGeH1Eja4D+qJYhGOxX9kpa+2wTCW06L8T6OhkpJWYn5JYiht5 +8exjAR7b8Zi3DeG9oZO5o6Qvhl3f8uGU8lK1j9jCUN/18mI/5vZJ76i+hsgdlfZB 
+Rh5lmAQjD80M9TY+oD4MYUqB5XrigPfFAUwXFGehhlwCVw7y6+5kpbq/NpvM5Ba8 +SeQYUUuMA8RXpTtGlrrTPqJryfa55hTuX/ThhX4gcCVkbyujo0CYr+Uuc14IOyNY +1fD0/qORbllbgV41wiy/2ZUWZQUodqHWkjT1CwIMbQOY5jmrSYGBwwIDAQABo4IB +JjCCASIwHwYDVR0jBBgwFoAUoBEKIz6W8Qfs4q8p74Klf9AwpLQwHQYDVR0OBBYE +FF86fBEQfgxncWHci6O1AANn9VccMA4GA1UdDwEB/wQEAwIBhjASBgNVHRMBAf8E +CDAGAQH/AgEAMB0GA1UdJQQWMBQGCCsGAQUFBwMBBggrBgEFBQcDAjAiBgNVHSAE +GzAZMA0GCysGAQQBsjEBAgIxMAgGBmeBDAECATBDBgNVHR8EPDA6MDigNqA0hjJo +dHRwOi8vY3JsLmNvbW9kb2NhLmNvbS9BQUFDZXJ0aWZpY2F0ZVNlcnZpY2VzLmNy +bDA0BggrBgEFBQcBAQQoMCYwJAYIKwYBBQUHMAGGGGh0dHA6Ly9vY3NwLmNvbW9k +b2NhLmNvbTANBgkqhkiG9w0BAQsFAAOCAQEAHMUom5cxIje2IiFU7mOCsBr2F6CY +eU5cyfQ/Aep9kAXYUDuWsaT85721JxeXFYkf4D/cgNd9+hxT8ZeDOJrn+ysqR7NO +2K9AdqTdIY2uZPKmvgHOkvH2gQD6jc05eSPOwdY/10IPvmpgUKaGOa/tyygL8Og4 +3tYyoHipMMnS4OiYKakDJny0XVuchIP7ZMKiP07Q3FIuSS4omzR77kmc75/6Q9dP +v4wa90UCOn1j6r7WhMmX3eT3Gsdj3WMe9bYD0AFuqa6MDyjIeXq08mVGraXiw73s +Zale8OMckn/BU3O/3aFNLHLfET2H2hT6Wb3nwxjpLIfXmSVcVd8A58XH0g== +-----END CERTIFICATE----- diff --git a/python/fedml/cross_silo/client/fedml_client_master_manager.py b/python/fedml/cross_silo/client/fedml_client_master_manager.py index f03f7f03c1..fa333af2cc 100644 --- a/python/fedml/cross_silo/client/fedml_client_master_manager.py +++ b/python/fedml/cross_silo/client/fedml_client_master_manager.py @@ -78,7 +78,10 @@ def register_message_receive_handlers(self): MyMessage.MSG_TYPE_S2C_CHECK_CLIENT_STATUS, self.handle_message_check_status ) - self.register_message_receive_handler(MyMessage.MSG_TYPE_S2C_INIT_CONFIG, self.handle_message_init) + self.register_message_receive_handler( + MyMessage.MSG_TYPE_S2C_INIT_CONFIG, self.handle_message_init + ) + self.register_message_receive_handler( MyMessage.MSG_TYPE_S2C_SYNC_MODEL_TO_CLIENT, self.handle_message_receive_model_from_server, ) diff --git a/python/fedml/cross_silo/client/message_define.py b/python/fedml/cross_silo/client/message_define.py index ef482e102b..0b694c68c0 100644 --- a/python/fedml/cross_silo/client/message_define.py +++ b/python/fedml/cross_silo/client/message_define.py @@ -1,3 +1,4 @@ + class MyMessage(object): """ message type definition diff --git a/python/fedml/serving/fedml_inference_runner.py b/python/fedml/serving/fedml_inference_runner.py index 30dd9a6fca..64a8a2d596 100644 --- a/python/fedml/serving/fedml_inference_runner.py +++ b/python/fedml/serving/fedml_inference_runner.py @@ -6,8 +6,10 @@ class FedMLInferenceRunner(ABC): - def __init__(self, client_predictor): + def __init__(self, client_predictor, host="0.0.0.0", port=2345): self.client_predictor = client_predictor + self.host = host + self.port = port def run(self): api = FastAPI() @@ -43,5 +45,4 @@ async def ready(): return Response(status_code=status.HTTP_202_ACCEPTED) import uvicorn - port = 2345 - uvicorn.run(api, host="0.0.0.0", port=port) + uvicorn.run(api, host=self.host, port=self.port) diff --git a/python/fedml/serving/templates/hf_template/config.yaml b/python/fedml/serving/templates/hf_template/config.yaml index da512f4a46..72551635d4 100644 --- a/python/fedml/serving/templates/hf_template/config.yaml +++ b/python/fedml/serving/templates/hf_template/config.yaml @@ -62,7 +62,7 @@ environment_variables: VERBOSE: "True" # If you do not have any GPU resource but want to serve the model -# Try fedml® launch platform, and uncomment the following lines. +# Try TensorOpera® launch platform, and uncomment the following lines. 
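# Illustrative sketch: FedMLInferenceRunner (diffed above) now accepts host and
# port instead of hard-coding 0.0.0.0:2345. A usage sketch; the predictor below
# is a bare stand-in exposing predict(), which is assumed to be enough for this
# sketch, while real services subclass the package's predictor base class:
from fedml.serving.fedml_inference_runner import FedMLInferenceRunner

class EchoPredictor:
    def predict(self, request):
        return {"echo": request}

if __name__ == "__main__":
    FedMLInferenceRunner(EchoPredictor(), host="127.0.0.1", port=8080).run()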
# ------------------------------------------------------------ computing: minimum_num_gpus: 1 # minimum # of GPUs to provision diff --git a/python/fedml/train/llm/hf_trainer.py b/python/fedml/train/llm/hf_trainer.py index aaf27340fe..e287a8518f 100644 --- a/python/fedml/train/llm/hf_trainer.py +++ b/python/fedml/train/llm/hf_trainer.py @@ -64,7 +64,7 @@ def __init__( if TYPE_CHECKING: self.args: ExperimentArguments = self.args # noqa - def log(self, logs: Dict[str, float]) -> None: + def log(self, logs: Dict[str, float], start_time: Optional[float] = None) -> None: # Adapted from https://github.com/huggingface/transformers/blob/b71f20a7c9f3716d30f6738501559acf863e2c5c/examples/pytorch/language-modeling/run_clm.py#L630-L634 # compute perplexity for key in tuple(logs.keys()): @@ -77,7 +77,7 @@ def log(self, logs: Dict[str, float]) -> None: perplexity = math.inf logs[f"{prefix}perplexity"] = perplexity - super().log(logs) + super().log(logs, start_time) def has_callback(self, callback: Union[Type[TrainerCallback], TrainerCallback]) -> bool: # Adapted from https://github.com/huggingface/transformers/blob/a7da2996a00c0ea083012ac86ab70f0bc4799f33/src/transformers/trainer_callback.py#L332 diff --git a/python/fedml/workflow/driver_example/customized_job_example/README.md b/python/fedml/workflow/driver_example/customized_job_example/README.md index 647cddc290..cd95ef5c75 100644 --- a/python/fedml/workflow/driver_example/customized_job_example/README.md +++ b/python/fedml/workflow/driver_example/customized_job_example/README.md @@ -1,6 +1,6 @@ # Make your own workflow with multiple jobs -## Define the job yaml based on the FEDML® Launch docs (https://doc.fedml.ai/launch) +## Define the job yaml based on the TensorOpera® Launch docs (https://doc.fedml.ai/launch) ``` working_directory = os.path.dirname(os.path.abspath(__file__)) deploy_image_job_yaml = os.path.join(working_directory, "deploy_image_job.yaml") @@ -119,7 +119,7 @@ ``` The output of the above deploy workflow is as follows. ``` -Submitting your job to FedML® Nexus AI Platform: 100%|██████████| 3.00k/3.00k [00:00<00:00, 3.10kB/s] +Submitting your job to TensorOpera® Nexus AI Platform: 100%|██████████| 3.00k/3.00k [00:00<00:00, 3.10kB/s] Final status of the workflow is as follows. JobStatus.FINISHED Output of the workflow is as follows. 
{'endpoint_id': 2131, 'endpoint_name': 'endpoint_test1', 'inference_url': 'https://open-test.fedml.ai/inference', 'request_body': {'arr': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0100005, -0.0100005, -0.0100005, -0.013973799, -0.0189315247, -0.023184301, -0.0360728861, -0.0392619154, -0.0380269994, -0.0390143887, -0.0346046778, -0.0257765396, -0.0209733754, -0.0217809993, -0.0144984527, -0.0118807892, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0178081425, -0.0232058779, -0.0298662898, -0.0414395151, -0.0586512813, -0.0812643979, -0.105997038, -0.121704878, -0.134457288, -0.139756261, -0.141562422, -0.135229133, -0.120246727, -0.104490087, -0.0870044931, -0.0716699334, -0.0485892545, -0.0324260775, -0.0216926329, -0.0100005, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0132956624, -0.0225936238, -0.0383702224, -0.0598206019, -0.0842014426, -0.118390816, -0.154266827, -0.188282524, -0.219803054, -0.242936317, -0.255020324, -0.259481423, -0.249404582, -0.226727106, -0.200418885, -0.16716117, -0.134317009, -0.0958717755, -0.0736565245, -0.0503983075, -0.0269783475, -0.0168919, -0.0100005, 0.0, 0.0, 0.0, 0.0, -0.0147795885, -0.025122101, -0.0381226487, -0.0786317321, -0.119593671, -0.165704529, -0.228814281, -0.288620224, -0.354491034, -0.421140618, -0.480243669, -0.527064646, -0.540807419, -0.521388017, -0.474446021, -0.403948632, -0.336571539, -0.271580657, -0.20666741, -0.154539645, -0.108856709, -0.0677589146, -0.0340327281, -0.0215091205, 0.0, 0.0, -0.0100005, -0.0107381289, -0.0260253876, -0.0570600482, -0.0914378767, -0.143000013, -0.199005834, -0.266034404, -0.353401549, -0.450251488, -0.551598332, -0.647939202, -0.743171364, -0.818162561, -0.851073275, -0.83112168, -0.763764496, -0.659992784, -0.547527626, -0.439376979, -0.33557659, -0.254856553, -0.183933732, -0.126755715, -0.0706477667, -0.0388818206, 0.0, 0.0, 0.0, -0.0134176155, -0.0390612132, -0.0873974922, -0.133107017, -0.194532142, -0.27478633, -0.369886454, -0.482920333, -0.605294063, -0.735621386, -0.869509827, -0.989564738, -1.09132506, -1.13182948, -1.09408349, -0.996373436, -0.868781173, -0.717778845, -0.570649327, -0.439021868, -0.326889344, -0.235934504, -0.167697996, -0.0995100269, -0.0479392976, -0.0187851186, 0.0, -0.0117322667, -0.0288274493, -0.0646532861, -0.118956716, -0.17783758, 1.53795878, 2.57176245, 1.53212043, 1.00392168, -0.179355647, -0.591732991, -1.05273662, -1.15378689, -1.22142979, -1.2388156, -1.21321586, -1.14302847, -1.02018313, -0.857098743, -0.676706697, -0.516203262, -0.379287244, -0.271402545, -0.189934521, -0.119940614, -0.0556340911, -0.0145752163, 0.0, -0.0206611389, -0.0437166621, -0.0808756237, -0.140488164, -0.207699245, 3.7747726, 3.14033146, 2.28939169, 1.76127332, 1.4318542, 1.1313135, 0.679164893, 0.665484747, 0.666043389, 0.680680095, 0.677305174, 0.665508286, 0.721340316, 0.883661589, 0.91751869, 0.0282541074, -0.401002939, -0.283099723, -0.194831338, -0.123075256, -0.066612686, -0.0161462821, -0.0112546885, -0.0293918605, -0.0484646663, -0.093178326, -0.146682925, -0.218121209, 0.830460131, 1.04725853, 0.147086928, 0.259684517, 0.495679969, 0.998953721, 1.29535061, 1.12204782, 1.41528197, 1.4259952, 1.36416372, 1.22805443, 1.03395727, 1.40874227, 1.73166837, 1.00260058, -0.401823716, -0.275049233, -0.181713744, -0.107567122, -0.0566041118, -0.0189159236, -0.0121427928, -0.0243168731, -0.050270377, -0.0887358114, -0.138806025, -0.212706019, 
-0.321729999, -0.462313723, -0.652442841, -0.845524923, -0.961258323, -0.793125052, -0.226359955, -0.640468216, -0.12372009, -0.167157468, -0.255843161, -0.441448335, -0.792766628, 1.30597044, 1.81460411, 0.691054579, -0.383665051, -0.26310513, -0.166473946, -0.0799663431, -0.0455007946, -0.0195541446, -0.0100005, -0.0186206584, -0.0414986832, -0.0722615997, -0.123238725, -0.212256343, -0.331309824, -0.491126078, -0.687704902, -0.86260267, -0.939124713, -0.869991467, -0.758168797, -0.722198511, -0.739826964, -0.809980626, -0.911188613, -1.00032001, -0.221550751, 1.53134484, 1.47605194, -0.273150738, -0.363157263, -0.252975575, -0.157152039, -0.0652009258, -0.0335283586, -0.0124209728, 0.0, -0.014849279, -0.0329699917, -0.0601451792, -0.118353377, -0.219271688, -0.354392407, -0.523006773, -0.71568287, -0.862626101, -0.90524289, -0.831592288, -0.751312636, -0.762948163, -0.825877849, -0.930232292, -1.04727288, -0.879016953, 1.11455708, 1.61660969, 0.264000765, -0.464282235, -0.354907482, -0.256014147, -0.158427696, -0.0620647188, -0.0242921899, 0.0, 0.0, -0.0117874599, -0.0252632841, -0.0502423656, -0.115068847, -0.235195531, -0.377531303, -0.547311188, -0.723069536, -0.848981953, -0.878897369, -0.826469482, -0.795496372, -0.883536617, -0.994814123, -1.13364619, -1.20871511, 5.60198157e-05, 1.28700658, 1.50082995, -0.122561277, -0.462110102, -0.360151562, -0.263898374, -0.166295096, -0.0568635009, -0.0105441394, 0.0, 0.0, 0.0, -0.016636779, -0.0423254862, -0.119931644, -0.252550583, -0.39191634, -0.556171069, -0.717849905, -0.829516019, -0.854549188, -0.84598967, -0.889246054, -1.03761315, -1.16457617, -1.30025654, -0.740699086, 1.05188993, 1.3036988, -0.163440609, -0.59058464, -0.474233049, -0.368789557, -0.274082099, -0.174264813, -0.0696188843, -0.018003151, 0.0, 0.0, 0.0, -0.0168610568, -0.0451688568, -0.131668459, -0.267838929, -0.398906806, -0.548202377, -0.690077015, -0.789823563, -0.831599129, -0.861314493, -0.95681566, -1.11036634, -1.22743073, -1.31006468, -0.02573686, 1.14239899, 0.761423491, -0.706825874, -0.608999426, -0.492457882, -0.380502867, -0.279282191, -0.173984018, -0.0767235054, -0.0195871373, -0.0100005, 0.0, -0.0100005, -0.024817808, -0.0552275065, -0.148243512, -0.283202341, -0.4022125, -0.534598048, -0.656007943, -0.738083794, -0.781657503, -0.824620535, -0.918824463, -1.04078449, -1.13391454, -1.09212795, 0.70592031, 1.17679031, -0.37378182, -0.758547572, -0.62868064, -0.501492113, -0.381043892, -0.270505206, -0.168251255, -0.0784168728, -0.022799968, -0.0157856413, 0.0, 0.0, -0.0269850288, -0.0676999793, -0.167498207, -0.298089736, -0.411096027, -0.522810883, -0.625838621, -0.693423683, -0.731704263, -0.767086709, -0.82998003, -0.921590434, -1.00562716, 0.0779492952, 1.22959017, 0.636500653, -0.901400043, -0.769630793, -0.635363773, -0.494618472, -0.369117095, -0.255794246, -0.156732083, -0.0783809414, -0.0267109338, -0.0148726634, 0.0, -0.0100005, -0.0348385687, -0.0869311199, -0.185622432, -0.311777198, -0.427690033, -0.530457702, -0.612837575, -0.669073252, -0.706628103, -0.737178903, -0.779583917, -0.866698428, -0.288157768, 1.2193059, 1.10500698, -0.50413989, -0.909137779, -0.774520432, -0.619405771, -0.472096102, -0.344822207, -0.235626373, -0.144455008, -0.0769092863, -0.0286146987, -0.0100005, 0.0, -0.0100005, -0.0342628198, -0.101174053, -0.195711272, -0.324606261, -0.442716711, -0.545960978, -0.637281741, -0.703742928, -0.753441795, -0.788772419, -0.829773267, -0.745526297, 0.949893727, 1.18293215, 0.385795002, -1.023299, -0.89872884, -0.736858006, 
-0.575258663, -0.430322485, -0.30912025, -0.209889823, -0.13189517, -0.0731506415, -0.0276674735, -0.0100005, 0.0, -0.0100005, -0.0400234981, -0.10709374, -0.194645695, -0.316981297, -0.440895564, -0.560086039, -0.667605659, -0.763806998, -0.843535003, -0.903604039, -0.938010529, 0.763887624, 1.12176928, 0.784111, -0.818046093, -0.991046672, -0.828340182, -0.652780006, -0.495325185, -0.364891317, -0.261772085, -0.17529887, -0.112966586, -0.0617374486, -0.0270715466, 0.0, 0.0, 0.0, -0.0406825662, -0.0978606438, -0.177848987, -0.287783481, -0.412614752, -0.543271605, -0.671018812, -0.798159188, -0.916686263, -1.02499517, -0.773682132, 1.09355574, 1.05041156, -0.498209852, -1.05256459, -0.870980804, -0.688431167, -0.523166414, -0.391308572, -0.282035183, -0.199071147, -0.13652517, -0.0893688913, -0.041317086, -0.016850831, 0.0, 0.0, 0.0, -0.0283386899, -0.0765120563, -0.141969555, -0.232658498, -0.341261378, -0.469723228, -0.606194512, -0.747366354, -0.880786554, -0.729389144, 0.895224865, 1.11943124, -0.105438374, -1.00783177, -0.859696548, -0.683890026, -0.531181637, -0.395889778, -0.289956123, -0.203267966, -0.14295145, -0.0963532989, -0.0643914026, -0.0337070214, -0.0111853003, 0.0, 0.0, -0.0100005, -0.0151722732, -0.0480051146, -0.0951161616, -0.160643556, -0.245453283, -0.353245922, -0.474265429, -0.598667391, -0.729305101, 0.389322873, 1.38694264, 1.37486731, -0.403963644, -0.77444593, -0.638730244, -0.502999283, -0.387339921, -0.279971294, -0.198381814, -0.135822721, -0.0965383286, -0.0633365644, -0.0427549534, -0.0257581657, -0.0100005, 0.0, 0.0, 0.0, 0.0, -0.0237543896, -0.0522032466, -0.0858749627, -0.140703979, -0.208515621, -0.290149335, -0.368567087, 0.334201602, 2.33307288, 2.27286258, 2.23777229, 0.0412218057, -0.494890333, -0.422342015, -0.339048837, -0.257069088, -0.185534152, -0.136577185, -0.0860242391, -0.0578259874, -0.033636416, -0.0181122384, -0.0100005, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0136274661, -0.0285803164, -0.0474793553, -0.0779785591, -0.118532172, -0.167201555, -0.214787719, 2.22171299, 4.30500754, 4.03125111, 3.36505818, 0.379953648, -0.284269948, -0.247694588, -0.205869945, -0.155925102, -0.116435448, -0.0857647974, -0.0546508166, -0.0401800073, -0.023758997, -0.0165780693, -0.0100005, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0115748833, -0.0284271584, -0.0506655656, -0.0740332846, -0.100455604, -0.124744578, 4.17363552, 7.81243004, 5.7896979, 0.322149281, -0.181506609, -0.160333393, -0.139182079, -0.118875455, -0.0873316648, -0.0700227708, -0.0540690537, -0.0384297037, -0.0265616274, -0.0161844507, -0.0119683967, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0132918601, -0.0159980455, -0.0207236291, -0.0266997366, -0.0284703819, -0.0343035092, -0.0410336906, -0.0488886427, -0.0548357917, -0.0551988782, -0.0469971082, -0.0388769026, -0.0316010302, -0.0285226846, -0.021736589, -0.0100005, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'model': 'fedml-qa-customer-0219/endpoint_test1'}, 'key_token': '5d427244128c45f58a74f3ecdb09b1e0'} Output of all jobs is as follows. 
{'deploy_image_job': {'endpoint_id': 2131, 'endpoint_name': 'endpoint_test1', 'inference_url': 'https://open-test.fedml.ai/inference', 'request_body': {'arr': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0100005, -0.0100005, -0.0100005, -0.013973799, -0.0189315247, -0.023184301, -0.0360728861, -0.0392619154, -0.0380269994, -0.0390143887, -0.0346046778, -0.0257765396, -0.0209733754, -0.0217809993, -0.0144984527, -0.0118807892, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0178081425, -0.0232058779, -0.0298662898, -0.0414395151, -0.0586512813, -0.0812643979, -0.105997038, -0.121704878, -0.134457288, -0.139756261, -0.141562422, -0.135229133, -0.120246727, -0.104490087, -0.0870044931, -0.0716699334, -0.0485892545, -0.0324260775, -0.0216926329, -0.0100005, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0132956624, -0.0225936238, -0.0383702224, -0.0598206019, -0.0842014426, -0.118390816, -0.154266827, -0.188282524, -0.219803054, -0.242936317, -0.255020324, -0.259481423, -0.249404582, -0.226727106, -0.200418885, -0.16716117, -0.134317009, -0.0958717755, -0.0736565245, -0.0503983075, -0.0269783475, -0.0168919, -0.0100005, 0.0, 0.0, 0.0, 0.0, -0.0147795885, -0.025122101, -0.0381226487, -0.0786317321, -0.119593671, -0.165704529, -0.228814281, -0.288620224, -0.354491034, -0.421140618, -0.480243669, -0.527064646, -0.540807419, -0.521388017, -0.474446021, -0.403948632, -0.336571539, -0.271580657, -0.20666741, -0.154539645, -0.108856709, -0.0677589146, -0.0340327281, -0.0215091205, 0.0, 0.0, -0.0100005, -0.0107381289, -0.0260253876, -0.0570600482, -0.0914378767, -0.143000013, -0.199005834, -0.266034404, -0.353401549, -0.450251488, -0.551598332, -0.647939202, -0.743171364, -0.818162561, -0.851073275, -0.83112168, -0.763764496, -0.659992784, -0.547527626, -0.439376979, -0.33557659, -0.254856553, -0.183933732, -0.126755715, -0.0706477667, -0.0388818206, 0.0, 0.0, 0.0, -0.0134176155, -0.0390612132, -0.0873974922, -0.133107017, -0.194532142, -0.27478633, -0.369886454, -0.482920333, -0.605294063, -0.735621386, -0.869509827, -0.989564738, -1.09132506, -1.13182948, -1.09408349, -0.996373436, -0.868781173, -0.717778845, -0.570649327, -0.439021868, -0.326889344, -0.235934504, -0.167697996, -0.0995100269, -0.0479392976, -0.0187851186, 0.0, -0.0117322667, -0.0288274493, -0.0646532861, -0.118956716, -0.17783758, 1.53795878, 2.57176245, 1.53212043, 1.00392168, -0.179355647, -0.591732991, -1.05273662, -1.15378689, -1.22142979, -1.2388156, -1.21321586, -1.14302847, -1.02018313, -0.857098743, -0.676706697, -0.516203262, -0.379287244, -0.271402545, -0.189934521, -0.119940614, -0.0556340911, -0.0145752163, 0.0, -0.0206611389, -0.0437166621, -0.0808756237, -0.140488164, -0.207699245, 3.7747726, 3.14033146, 2.28939169, 1.76127332, 1.4318542, 1.1313135, 0.679164893, 0.665484747, 0.666043389, 0.680680095, 0.677305174, 0.665508286, 0.721340316, 0.883661589, 0.91751869, 0.0282541074, -0.401002939, -0.283099723, -0.194831338, -0.123075256, -0.066612686, -0.0161462821, -0.0112546885, -0.0293918605, -0.0484646663, -0.093178326, -0.146682925, -0.218121209, 0.830460131, 1.04725853, 0.147086928, 0.259684517, 0.495679969, 0.998953721, 1.29535061, 1.12204782, 1.41528197, 1.4259952, 1.36416372, 1.22805443, 1.03395727, 1.40874227, 1.73166837, 1.00260058, -0.401823716, -0.275049233, -0.181713744, -0.107567122, -0.0566041118, -0.0189159236, -0.0121427928, -0.0243168731, -0.050270377, -0.0887358114, 
-0.138806025, -0.212706019, -0.321729999, -0.462313723, -0.652442841, -0.845524923, -0.961258323, -0.793125052, -0.226359955, -0.640468216, -0.12372009, -0.167157468, -0.255843161, -0.441448335, -0.792766628, 1.30597044, 1.81460411, 0.691054579, -0.383665051, -0.26310513, -0.166473946, -0.0799663431, -0.0455007946, -0.0195541446, -0.0100005, -0.0186206584, -0.0414986832, -0.0722615997, -0.123238725, -0.212256343, -0.331309824, -0.491126078, -0.687704902, -0.86260267, -0.939124713, -0.869991467, -0.758168797, -0.722198511, -0.739826964, -0.809980626, -0.911188613, -1.00032001, -0.221550751, 1.53134484, 1.47605194, -0.273150738, -0.363157263, -0.252975575, -0.157152039, -0.0652009258, -0.0335283586, -0.0124209728, 0.0, -0.014849279, -0.0329699917, -0.0601451792, -0.118353377, -0.219271688, -0.354392407, -0.523006773, -0.71568287, -0.862626101, -0.90524289, -0.831592288, -0.751312636, -0.762948163, -0.825877849, -0.930232292, -1.04727288, -0.879016953, 1.11455708, 1.61660969, 0.264000765, -0.464282235, -0.354907482, -0.256014147, -0.158427696, -0.0620647188, -0.0242921899, 0.0, 0.0, -0.0117874599, -0.0252632841, -0.0502423656, -0.115068847, -0.235195531, -0.377531303, -0.547311188, -0.723069536, -0.848981953, -0.878897369, -0.826469482, -0.795496372, -0.883536617, -0.994814123, -1.13364619, -1.20871511, 5.60198157e-05, 1.28700658, 1.50082995, -0.122561277, -0.462110102, -0.360151562, -0.263898374, -0.166295096, -0.0568635009, -0.0105441394, 0.0, 0.0, 0.0, -0.016636779, -0.0423254862, -0.119931644, -0.252550583, -0.39191634, -0.556171069, -0.717849905, -0.829516019, -0.854549188, -0.84598967, -0.889246054, -1.03761315, -1.16457617, -1.30025654, -0.740699086, 1.05188993, 1.3036988, -0.163440609, -0.59058464, -0.474233049, -0.368789557, -0.274082099, -0.174264813, -0.0696188843, -0.018003151, 0.0, 0.0, 0.0, -0.0168610568, -0.0451688568, -0.131668459, -0.267838929, -0.398906806, -0.548202377, -0.690077015, -0.789823563, -0.831599129, -0.861314493, -0.95681566, -1.11036634, -1.22743073, -1.31006468, -0.02573686, 1.14239899, 0.761423491, -0.706825874, -0.608999426, -0.492457882, -0.380502867, -0.279282191, -0.173984018, -0.0767235054, -0.0195871373, -0.0100005, 0.0, -0.0100005, -0.024817808, -0.0552275065, -0.148243512, -0.283202341, -0.4022125, -0.534598048, -0.656007943, -0.738083794, -0.781657503, -0.824620535, -0.918824463, -1.04078449, -1.13391454, -1.09212795, 0.70592031, 1.17679031, -0.37378182, -0.758547572, -0.62868064, -0.501492113, -0.381043892, -0.270505206, -0.168251255, -0.0784168728, -0.022799968, -0.0157856413, 0.0, 0.0, -0.0269850288, -0.0676999793, -0.167498207, -0.298089736, -0.411096027, -0.522810883, -0.625838621, -0.693423683, -0.731704263, -0.767086709, -0.82998003, -0.921590434, -1.00562716, 0.0779492952, 1.22959017, 0.636500653, -0.901400043, -0.769630793, -0.635363773, -0.494618472, -0.369117095, -0.255794246, -0.156732083, -0.0783809414, -0.0267109338, -0.0148726634, 0.0, -0.0100005, -0.0348385687, -0.0869311199, -0.185622432, -0.311777198, -0.427690033, -0.530457702, -0.612837575, -0.669073252, -0.706628103, -0.737178903, -0.779583917, -0.866698428, -0.288157768, 1.2193059, 1.10500698, -0.50413989, -0.909137779, -0.774520432, -0.619405771, -0.472096102, -0.344822207, -0.235626373, -0.144455008, -0.0769092863, -0.0286146987, -0.0100005, 0.0, -0.0100005, -0.0342628198, -0.101174053, -0.195711272, -0.324606261, -0.442716711, -0.545960978, -0.637281741, -0.703742928, -0.753441795, -0.788772419, -0.829773267, -0.745526297, 0.949893727, 1.18293215, 0.385795002, -1.023299, 
-0.89872884, -0.736858006, -0.575258663, -0.430322485, -0.30912025, -0.209889823, -0.13189517, -0.0731506415, -0.0276674735, -0.0100005, 0.0, -0.0100005, -0.0400234981, -0.10709374, -0.194645695, -0.316981297, -0.440895564, -0.560086039, -0.667605659, -0.763806998, -0.843535003, -0.903604039, -0.938010529, 0.763887624, 1.12176928, 0.784111, -0.818046093, -0.991046672, -0.828340182, -0.652780006, -0.495325185, -0.364891317, -0.261772085, -0.17529887, -0.112966586, -0.0617374486, -0.0270715466, 0.0, 0.0, 0.0, -0.0406825662, -0.0978606438, -0.177848987, -0.287783481, -0.412614752, -0.543271605, -0.671018812, -0.798159188, -0.916686263, -1.02499517, -0.773682132, 1.09355574, 1.05041156, -0.498209852, -1.05256459, -0.870980804, -0.688431167, -0.523166414, -0.391308572, -0.282035183, -0.199071147, -0.13652517, -0.0893688913, -0.041317086, -0.016850831, 0.0, 0.0, 0.0, -0.0283386899, -0.0765120563, -0.141969555, -0.232658498, -0.341261378, -0.469723228, -0.606194512, -0.747366354, -0.880786554, -0.729389144, 0.895224865, 1.11943124, -0.105438374, -1.00783177, -0.859696548, -0.683890026, -0.531181637, -0.395889778, -0.289956123, -0.203267966, -0.14295145, -0.0963532989, -0.0643914026, -0.0337070214, -0.0111853003, 0.0, 0.0, -0.0100005, -0.0151722732, -0.0480051146, -0.0951161616, -0.160643556, -0.245453283, -0.353245922, -0.474265429, -0.598667391, -0.729305101, 0.389322873, 1.38694264, 1.37486731, -0.403963644, -0.77444593, -0.638730244, -0.502999283, -0.387339921, -0.279971294, -0.198381814, -0.135822721, -0.0965383286, -0.0633365644, -0.0427549534, -0.0257581657, -0.0100005, 0.0, 0.0, 0.0, 0.0, -0.0237543896, -0.0522032466, -0.0858749627, -0.140703979, -0.208515621, -0.290149335, -0.368567087, 0.334201602, 2.33307288, 2.27286258, 2.23777229, 0.0412218057, -0.494890333, -0.422342015, -0.339048837, -0.257069088, -0.185534152, -0.136577185, -0.0860242391, -0.0578259874, -0.033636416, -0.0181122384, -0.0100005, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0136274661, -0.0285803164, -0.0474793553, -0.0779785591, -0.118532172, -0.167201555, -0.214787719, 2.22171299, 4.30500754, 4.03125111, 3.36505818, 0.379953648, -0.284269948, -0.247694588, -0.205869945, -0.155925102, -0.116435448, -0.0857647974, -0.0546508166, -0.0401800073, -0.023758997, -0.0165780693, -0.0100005, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0115748833, -0.0284271584, -0.0506655656, -0.0740332846, -0.100455604, -0.124744578, 4.17363552, 7.81243004, 5.7896979, 0.322149281, -0.181506609, -0.160333393, -0.139182079, -0.118875455, -0.0873316648, -0.0700227708, -0.0540690537, -0.0384297037, -0.0265616274, -0.0161844507, -0.0119683967, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0132918601, -0.0159980455, -0.0207236291, -0.0266997366, -0.0284703819, -0.0343035092, -0.0410336906, -0.0488886427, -0.0548357917, -0.0551988782, -0.0469971082, -0.0388769026, -0.0316010302, -0.0285226846, -0.021736589, -0.0100005, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'model': 'fedml-qa-customer-0219/endpoint_test1'}, 'key_token': '5d427244128c45f58a74f3ecdb09b1e0'}} diff --git a/python/fedml/workflow/driver_example/customized_job_example/deploy_image_job.yaml b/python/fedml/workflow/driver_example/customized_job_example/deploy_image_job.yaml index 1b6bab3582..917fa5b0d4 100755 --- a/python/fedml/workflow/driver_example/customized_job_example/deploy_image_job.yaml +++ b/python/fedml/workflow/driver_example/customized_job_example/deploy_image_job.yaml @@ -6,12 +6,12 @@ workspace: deploy_image_job # Running entry commands which will be executed as the job entry point. 
# Support multiple lines, which can not be empty. job: | - echo "current job id: $FEDML_CURRENT_RUN_ID" - echo "current edge id: $FEDML_CURRENT_EDGE_ID" - echo "Hello, Here is the FedML Nexus AI platform." - echo "Current directory is as follows." - pwd - sleep 3 + echo "current job id: $FEDML_CURRENT_RUN_ID" + echo "current edge id: $FEDML_CURRENT_EDGE_ID" + echo "Hello, Here is the FedML Nexus AI platform." + echo "Current directory is as follows." + pwd + sleep 3 job_type: deploy # options: train, deploy, federate diff --git a/python/fedml/workflow/driver_example/customized_job_example/deploy_image_job/fedml_model_config.yaml b/python/fedml/workflow/driver_example/customized_job_example/deploy_image_job/fedml_model_config.yaml index fdd6e3b95f..6d6a1cd0ee 100644 --- a/python/fedml/workflow/driver_example/customized_job_example/deploy_image_job/fedml_model_config.yaml +++ b/python/fedml/workflow/driver_example/customized_job_example/deploy_image_job/fedml_model_config.yaml @@ -1,7 +1,8 @@ -workspace: "." -entry_point: "main_entry.py" +workspace: "./" +entry_point: "mnist_serve_main.py" data_cache_dir: "" + bootstrap: | echo "Bootstrap start..." pip install fedml==0.8.29 diff --git a/python/fedml/workflow/driver_example/customized_job_example/deploy_image_job/mnist_serve_main.py b/python/fedml/workflow/driver_example/customized_job_example/deploy_image_job/mnist_serve_main.py new file mode 100644 index 0000000000..6367ea487f --- /dev/null +++ b/python/fedml/workflow/driver_example/customized_job_example/deploy_image_job/mnist_serve_main.py @@ -0,0 +1,37 @@ +from fedml.serving import FedMLPredictor +from fedml.serving import FedMLInferenceRunner +from model.minist_model import LogisticRegression + +# This is the model file that will be uploaded to MLOps +MODEL_PARMS_DIR = "./model/model_parms_from_mlops" +# If you do not want to upload the model file to MLOps, +# (i.e., you want to use the model file in the local DATA_CACHE_DIR) +# Please use the DATA_CACHE_DIR and specify it +# in the fedml_model_config.yaml +# DATA_CACHE_DIR = "" + +class MnistPredictor(FedMLPredictor): + def __init__(self): + import pickle + import torch + + with open(MODEL_PARMS_DIR, 'rb') as model_file_obj: + model_params = pickle.load(model_file_obj) + + output_dim = 10 + + self.model = LogisticRegression(28 * 28, output_dim) + + self.model.load_state_dict(model_params) + + self.list_to_tensor_func = torch.tensor + + def predict(self, request): + arr = request["arr"] + input_tensor = self.list_to_tensor_func(arr) + return self.model(input_tensor) + +if __name__ == "__main__": + predictor = MnistPredictor() + fedml_inference_runner = FedMLInferenceRunner(predictor) + fedml_inference_runner.run() \ No newline at end of file diff --git a/python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job.yaml b/python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job.yaml new file mode 100755 index 0000000000..8ac9300165 --- /dev/null +++ b/python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job.yaml @@ -0,0 +1,29 @@ +# Local directory where your source code resides. +# It should be the relative path to this job yaml file or the absolute path. +# If your job doesn't contain any source code, it can be empty. +workspace: deploy_llm_job + +# Running entry commands which will be executed as the job entry point. +# Support multiple lines, which can not be empty.
+job: | + echo "current job id: $FEDML_CURRENT_RUN_ID" + echo "current edge id: $FEDML_CURRENT_EDGE_ID" + echo "Hello, Here is the FedML Nexus AI platform." + echo "Current directory is as follows." + pwd + sleep 3 + +job_type: deploy # options: train, deploy, federate + +# Bootstrap shell commands which will be executed before running entry commands. +# Support multiple lines, which can be empty. +bootstrap: | + pip install -r requirements.txt + echo "Bootstrap finished." + +computing: + #resource_type: RTX-3090 # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type + resource_type: A100-80GB-SXM + minimum_num_gpus: 1 # minimum # of GPUs to provision + maximum_cost_per_hour: $10 # max cost per hour of all machines for your job + # device_type: GPU # GPU or CPU diff --git a/python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job/.gitignore b/python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job/.gitignore new file mode 100644 index 0000000000..0d20b6487c --- /dev/null +++ b/python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job/.gitignore @@ -0,0 +1 @@ +*.pyc diff --git a/python/examples/deploy/quick_start/src/app/pipe/__init__.py b/python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job/__init__.py similarity index 100% rename from python/examples/deploy/quick_start/src/app/pipe/__init__.py rename to python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job/__init__.py diff --git a/python/fedml/workflow/driver_example/customized_job_example/deploy_image_job/__init__.py b/python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job/app/__init__.py similarity index 100% rename from python/fedml/workflow/driver_example/customized_job_example/deploy_image_job/__init__.py rename to python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job/app/__init__.py diff --git a/python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job/app/pipe/__init__.py b/python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job/app/pipe/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/examples/deploy/quick_start/src/app/pipe/constants.py b/python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job/app/pipe/constants.py similarity index 100% rename from python/examples/deploy/quick_start/src/app/pipe/constants.py rename to python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job/app/pipe/constants.py diff --git a/python/examples/deploy/quick_start/src/app/pipe/instruct_pipeline.py b/python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job/app/pipe/instruct_pipeline.py similarity index 100% rename from python/examples/deploy/quick_start/src/app/pipe/instruct_pipeline.py rename to python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job/app/pipe/instruct_pipeline.py diff --git a/python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job/config/__init__.py b/python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job/config/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job/fedml_model_config.yaml b/python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job/fedml_model_config.yaml new file mode 100644 index 
0000000000..bff517ef6d --- /dev/null +++ b/python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job/fedml_model_config.yaml @@ -0,0 +1,12 @@ +workspace: "." +entry_point: "main_entry.py" + +auto_detect_public_ip: true +server_external_port: 20203 +server_internal_port: 2203 + +bootstrap: | + echo "Bootstrap start..." + pip install -U fedml + sh ./config/bootstrap.sh + echo "Bootstrap finished" diff --git a/python/fedml/workflow/driver_example/customized_job_example/deploy_image_job/main_entry.py b/python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job/main_entry.py similarity index 100% rename from python/fedml/workflow/driver_example/customized_job_example/deploy_image_job/main_entry.py rename to python/fedml/workflow/driver_example/customized_job_example/deploy_llm_job/main_entry.py diff --git a/python/fedml/workflow/driver_example/customized_job_example/train_job.yaml b/python/fedml/workflow/driver_example/customized_job_example/train_job.yaml index ab6ab29c0e..86c9df6594 100755 --- a/python/fedml/workflow/driver_example/customized_job_example/train_job.yaml +++ b/python/fedml/workflow/driver_example/customized_job_example/train_job.yaml @@ -7,7 +7,7 @@ workspace: train_job # It should be the full name of the image with tag. # If you want to use the default image, it can be empty. docker: - image: fedml/fedml-default-launch:cu12.1-u22.04 + image: fedml/fedml-launch-job:cu12.1-u22.04 # Running entry commands which will be executed as the job entry point. # Support multiple lines, which can not be empty. @@ -30,8 +30,7 @@ bootstrap: | echo "Bootstrap finished." computing: - #resource_type: RTX-4090 # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type - resource_type: A100-80GB-SXM + resource_type: A100-80GB-SXM # e.g., A100-80G, please check the resource type list by "fedml show-resource-type" or visiting URL: https://open.fedml.ai/accelerator_resource_type minimum_num_gpus: 1 # minimum # of GPUs to provision maximum_cost_per_hour: $10 # max cost per hour of all machines for your job # device_type: GPU # GPU or CPU diff --git a/python/fedml/workflow/driver_example/hello_world_job.yaml b/python/fedml/workflow/driver_example/hello_world_job.yaml index e1dcb02f7e..e63712f99a 100755 --- a/python/fedml/workflow/driver_example/hello_world_job.yaml +++ b/python/fedml/workflow/driver_example/hello_world_job.yaml @@ -10,7 +10,7 @@ workspace: hello_world # It should be the full name of the image with tag. # If you want to use the default image, it can be empty. #docker: -# image: fedml/fedml-default-launch:cu12.1-u22.04 +# image: fedml/fedml-launch-job:cu12.1-u22.04 # Running entry commands which will be executed as the job entry point. # Support multiple lines, which can not be empty. 
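The workflow output shown earlier in this section exposes an `inference_url`, a `request_body` (a flattened 28x28 MNIST-style `arr` plus a `model` name), and a `key_token` for `endpoint_test1`. The snippet below is a minimal client sketch for posting such a payload with Python `requests`; passing the token as a Bearer `Authorization` header is an assumption about the auth scheme (the output does not show it), and the 784-value array is replaced by a zero placeholder for brevity.

```python
import requests

# Values copied from the workflow output above; the auth header is an assumption.
INFERENCE_URL = "https://open-test.fedml.ai/inference"
KEY_TOKEN = "5d427244128c45f58a74f3ecdb09b1e0"

payload = {
    "arr": [0.0] * 784,  # placeholder; use the real flattened 28x28 input here
    "model": "fedml-qa-customer-0219/endpoint_test1",
}

response = requests.post(
    INFERENCE_URL,
    json=payload,
    headers={"Authorization": f"Bearer {KEY_TOKEN}"},
    timeout=30,
)
print(response.status_code, response.json())
```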
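The new `deploy_llm_job` workspace points its `fedml_model_config.yaml` at `main_entry.py`, which is renamed from `deploy_image_job` and therefore not shown in this diff. Purely as an illustration of the same `FedMLPredictor` / `FedMLInferenceRunner` pattern used by `mnist_serve_main.py` above, a hypothetical entry point could look like the following sketch; the echo-style `predict` body and the `"text"` request key are assumptions, not the real implementation.

```python
from fedml.serving import FedMLPredictor
from fedml.serving import FedMLInferenceRunner


class EchoLLMPredictor(FedMLPredictor):
    # A real entry point would load the LLM here (for example, the instruct
    # pipeline under app/pipe/ in this workspace) inside __init__,
    # as mnist_serve_main.py does for the MNIST model.

    def predict(self, request):
        prompt = request.get("text", "")  # "text" key is assumed for illustration
        return {"generated_text": f"echo: {prompt}"}


if __name__ == "__main__":
    predictor = EchoLLMPredictor()
    fedml_inference_runner = FedMLInferenceRunner(predictor)
    fedml_inference_runner.run()
```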
diff --git a/python/setup.py b/python/setup.py index ae1efc0dff..032bdb4eed 100644 --- a/python/setup.py +++ b/python/setup.py @@ -40,21 +40,19 @@ def finalize_options(self): 'multiprocess', 'networkx<3.0', 'ntplib', - 'numpy>=1.21', + 'numpy<2.0.0', 'onnx', 'paho-mqtt<2.0.0', 'pandas', 'prettytable', 'py-machineid', 'pydantic', - 'pydantic-settings', 'pytest', 'pytest-mock', 'python-rapidjson>=0.9.1', 'redis', 'scikit-learn', 'smart-open==6.3.0', - 'spacy', 'sqlalchemy', 'toposort', 'torch>=1.13.1', @@ -66,6 +64,12 @@ def finalize_options(self): 'uvicorn', 'wandb==0.13.2', 'wget', + # Need to pin this version due to breaking change released in python docker sdk + 'requests<2.32', + 'python-dotenv', + 'protobuf>=3.20.2,<4.0dev', + 'typer<0.10.0,>=0.3.0', + 'fastapi-cli==0.0.1' ] requirements_extra_mpi = [ @@ -113,12 +117,16 @@ def finalize_options(self): "deepspeed>=0.10.2", ] +requirements_extra_nlp = [ + 'spacy>=3.2.0,<3.3.0', +] + # if platform.machine() == "x86_64": # requirements.append("MNN==1.1.6") setup( name="fedml", - version="0.8.30", + version="0.9.2", author="FedML Team", author_email="ch@fedml.ai", description="A research and production integrated edge-cloud library for " @@ -165,6 +173,8 @@ def finalize_options(self): "fedml/core/mlops/ssl/open-test.fedml.ai_bundle.crt", "fedml/core/mlops/ssl/open-release.fedml.ai_bundle.crt", "fedml/core/mlops/ssl/open-root-ca.crt", + "fedml/core/mlops/ssl/open.chainopera.ai.local.crt", + "fedml/core/mlops/ssl/open.chainopera.ai.local.rootca.crt", ], ) ], @@ -178,6 +188,7 @@ def finalize_options(self): "llm": requirements_extra_llm, "mxnet": requirements_extra_mxnet, "tensorflow": requirements_extra_tf, + "nlp": requirements_extra_nlp, }, package_data={"": ["py.typed"]}, license="Apache 2.0", diff --git a/python/spotlight_prj/unitedllm/README.md b/python/spotlight_prj/unitedllm/README.md index 5d5972cef1..5a300f2cfc 100644 --- a/python/spotlight_prj/unitedllm/README.md +++ b/python/spotlight_prj/unitedllm/README.md @@ -4,7 +4,7 @@ # UnitedLLM: Training and Serving LLM Collaboratively on Decentralized GPU Clouds -[FEDML® UnitedLLM](https://blog.fedml.ai/releasing-fedllm-build-your-own-large-language-models-on-proprietary-data-using-the-fedml-platform/) +[TensorOpera® UnitedLLM](https://blog.fedml.ai/releasing-fedllm-build-your-own-large-language-models-on-proprietary-data-using-the-fedml-platform/) is an MLOps-supported training pipeline for decentralized pretraining and finetuning of large language models. ## Getting Started
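Note that the `setup.py` changes above move `spacy` out of the core requirements and into a new optional `nlp` extra, so NLP users would install it with something like `pip install "fedml[nlp]"`. A quick, illustrative smoke test of that extra (assuming it has been installed):

```python
# Illustrative check that the optional "nlp" extra (spacy>=3.2.0,<3.3.0) is usable.
import spacy

nlp = spacy.blank("en")  # blank pipeline; no model download required
doc = nlp("TensorOpera federated NLP example")
print([token.text for token in doc])
```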