From a912c5738a90992fd06fd2394e068dcde9139aa0 Mon Sep 17 00:00:00 2001 From: charlieyl Date: Tue, 11 Feb 2025 12:53:28 +0800 Subject: [PATCH] [bugfix] Handle deployment failure by deleting deployed replicas and releasing GPU --- .../computing/scheduler/model_scheduler/master_job_runner.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index 00b08acfb..bc943307f 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -11,6 +11,7 @@ import fedml from fedml.core.mlops import MLOpsRuntimeLog, MLOpsConfigs from fedml.core.mlops.mlops_runtime_log import MLOpsFormatter +from .device_model_msg_object import FedMLModelMsgObject from .device_client_constants import ClientConstants from .device_model_cache import FedMLModelCache from .device_server_constants import ServerConstants @@ -278,6 +279,9 @@ def process_deployment_result_message(self, topic=None, payload=None): end_point_id, end_point_name, payload_json["model_name"], "", ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED, message_center=self.message_center) + # When reporting deployment failure to MLOps, delete any replicas that were successfully deployed and release their GPUs + model_msg_object = FedMLModelMsgObject(topic, payload) + self.send_deployment_delete_request_to_edges(payload, model_msg_object, message_center=self.message_center) return # Failure handler, send the rollback message to the worker devices only if it has not been rollback