FedML-AI
diff --git a/‎python/examples/launch/dump.rdb‎
215 KB b/‎python/examples/launch/dump.rdb‎
215 KB
diff --git a/‎python/examples/launch/hello_job_with_container.yaml‎
Lines changed: 1 addition & 1 deletion b/‎python/examples/launch/hello_job_with_container.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎python/fedml/__init__.py‎
Lines changed: 7 additions & 3 deletions b/‎python/fedml/__init__.py‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎python/fedml/api/api_test.py‎
Lines changed: 12 additions & 9 deletions b/‎python/fedml/api/api_test.py‎
Lines changed: 12 additions & 9 deletions
diff --git a/‎python/fedml/computing/scheduler/comm_utils/constants.py‎
Lines changed: 1 addition & 1 deletion b/‎python/fedml/computing/scheduler/comm_utils/constants.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎python/fedml/computing/scheduler/comm_utils/container_utils.py‎
Lines changed: 2 additions & 2 deletions b/‎python/fedml/computing/scheduler/comm_utils/container_utils.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎python/fedml/computing/scheduler/comm_utils/job_cleanup.py‎
Lines changed: 1 addition & 0 deletions b/‎python/fedml/computing/scheduler/comm_utils/job_cleanup.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎python/fedml/computing/scheduler/comm_utils/job_monitor.py‎
Lines changed: 15 additions & 10 deletions b/‎python/fedml/computing/scheduler/comm_utils/job_monitor.py‎
Lines changed: 15 additions & 10 deletions
diff --git a/‎python/fedml/computing/scheduler/comm_utils/job_utils.py‎
Lines changed: 6 additions & 2 deletions b/‎python/fedml/computing/scheduler/comm_utils/job_utils.py‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎python/fedml/computing/scheduler/master/base_master_agent.py‎
Lines changed: 126 additions & 0 deletions b/‎python/fedml/computing/scheduler/master/base_master_agent.py‎
Lines changed: 126 additions & 0 deletions
@@ -43,7 +43,7 @@ job_type: train              # options: train, deploy, federate
 job_subtype: generate_training
 
 docker:
-  image: fedml/fedml-default-launch:cu12.1-u22.04
+  image: fedml/fedml-launch-job:cu12.1-u22.04
   #registry: docker.io
   #username: my_hub_user
   #password: my_hub_password
 
@@ -90,9 +90,11 @@ def init(args=None, check_env=True, should_init_logs=True):
     # Windows/Linux/MacOS compatability issues on multi-processing
     # https://github.com/pytorch/pytorch/issues/3492
     """
-    if multiprocessing.get_start_method() != "spawn":
-        # force all platforms (Windows/Linux/MacOS) to use the same way (spawn) for multiprocessing
-        multiprocessing.set_start_method("spawn", force=True)
+    # Prefer "fork", but it is only offered on POSIX platforms; forcing it where it is
+    # unavailable (e.g. Windows) raises ValueError, so fall back to "spawn" there.
+    start_method = "fork" if "fork" in multiprocessing.get_all_start_methods() else "spawn"
+    if multiprocessing.get_start_method() != start_method:
+        multiprocessing.set_start_method(start_method, force=True)
 
     """
     # https://stackoverflow.com/questions/53014306/error-15-initializing-libiomp5-dylib-but-found-libiomp5-dylib-already-initial
 
@@ -4,9 +4,9 @@
 import fedml
 
 # Login
-fedml.set_env_version("local")
+fedml.set_env_version("test")
 fedml.set_local_on_premise_platform_port(18080)
-error_code, error_msg = fedml.api.fedml_login(api_key="1316b93c82da40ce90113a2ed12f0b14")
+error_code, error_msg = fedml.api.fedml_login(api_key="")
 if error_code != 0:
     print("API Key is invalid!")
     exit(1)
@@ -18,20 +18,25 @@
 yaml_file = os.path.join(python_dir, "examples", "launch", "hello_job.yaml")
 
 # Launch job
+launch_result_list = []
 for i in range(0, 10):
     launch_result = fedml.api.launch_job(yaml_file)
+    launch_result_list.append(launch_result)
     # launch_result = fedml.api.launch_job_on_cluster(yaml_file, "alex-cluster")
     if launch_result.result_code != 0:
         print(f"Failed to launch job. Reason: {launch_result.result_message}")
 
-exit(1)
-
 # Get job status
-log_result = fedml.api.run_logs(launch_result.run_id, 1, 100)
-if log_result is None or log_result.run_status is None:
-    print(f"Failed to get job status.")
-    exit(1)
-print(f"Run status {log_result.run_status}")
+# NOTE(review): nothing removes entries from launch_result_list, so this loop polls forever and
+# the "Get job logs" section below is never reached - confirm whether finished runs should be dropped.
+while len(launch_result_list) > 0:
+    for launch_result in launch_result_list:
+        log_result = fedml.api.run_logs(launch_result.run_id, 1, 5)
+        if log_result is None or log_result.run_status is None:
+            print(f"Failed to get job status of run {launch_result.run_id}.")
+        else:
+            print(f"Run {launch_result.run_id}, status {log_result.run_status}")
+        time.sleep(0.5)
 
 # Get job logs
 time.sleep(30)
 
@@ -103,7 +103,7 @@ class SchedulerConstants:
     RUN_PROCESS_TYPE_BOOTSTRAP_PROCESS = "bootstrap-process"
 
     FEDML_DEFAULT_LAUNCH_CONTAINER_PREFIX = "fedml_default_launch_container"
-    FEDML_DEFAULT_LAUNCH_IMAGE = "fedml/fedml-default-launch:cu12.1-u22.04"
+    FEDML_DEFAULT_LAUNCH_IMAGE = "fedml/fedml-launch-job:cu12.1-u22.04"
     FEDML_DEFAULT_LOG_DIR = ".fedml/fedml-client/fedml/logs"
     FEDML_DEFAULT_DATA_DIR = ".fedml/fedml-client/fedml/data"
 
 
@@ -26,7 +26,7 @@ def get_instance():
 
     def get_docker_client(self):
         try:
-            client = docker.from_env()
+            client = docker.from_env(timeout=5, version="auto")
         except Exception:
             logging.error("Failed to connect to the docker daemon, please ensure that you have "
                           "installed Docker Desktop or Docker Engine, and the docker is running")
@@ -180,7 +180,7 @@ def get_container_rank_same_model(prefix: str):
         running_model_name = hash("model_endpoint_id_{}_name_{}_model_id_{}_name_{}_ver_{}")
         """
         try:
-            client = docker.from_env()
+            client = docker.from_env(timeout=5, version="auto")
         except Exception:
             logging.error("Failed to connect to the docker daemon, please ensure that you have "
                           "installed Docker Desktop or Docker Engine, and the docker is running")
 
@@ -44,6 +44,7 @@ def sync_run_process_gpu(self):
                     ComputeCacheManager.get_instance().get_gpu_cache().get_run_info_sync_lock_key("")
             ):
                 count = 0
+                client_data_interface.FedMLClientDataInterface.get_instance().create_job_table()
                 job_list = client_data_interface.FedMLClientDataInterface.get_instance().get_jobs_from_db()
                 for job in job_list.job_list:
                     count += 1
 
@@ -208,6 +208,8 @@ def monitor_replicas_number():
             endpoint_replicas_details = {}
             if isinstance(endpoint_detail, str):
                 endpoint_replicas_details = json.loads(endpoint_detail)
+                if isinstance(endpoint_replicas_details, str):
+                    endpoint_replicas_details = json.loads(endpoint_replicas_details)
 
             if "result" in endpoint_replicas_details:
                 endpoint_replica_details = {}
@@ -220,13 +222,7 @@ def monitor_replicas_number():
         for endpoint_id, num_replica in res_to_mlops.items():
             curr_version = fedml.get_env_version()
             num_replica_url_path = "fedmlModelServer/api/v1/endpoint/replica-info"
-            if curr_version == "release":
-                mlops_prefix = "https://open.fedml.ai/"
-            elif curr_version == "test":
-                mlops_prefix = "https://open-test.fedml.ai/"
-            else:
-                logging.error(f"Do not support the version {curr_version}.")
-                return
+            mlops_prefix = fedml._get_backend_service()
             url = f"{mlops_prefix}{num_replica_url_path}"
 
             cached_token = FedMLModelCache.get_instance().get_end_point_token_with_eid(endpoint_id)
@@ -348,7 +344,7 @@ def monitor_slave_run_process_status(self):
                     break
 
                 # Calc the timeout
-                started_time = int(float(job.started_time))
+                started_time = JobMonitor.get_started_time(job)
                 timeout = time.time() - started_time
 
                 job_type = JobRunnerUtils.parse_job_type(job.running_json)
@@ -436,6 +432,24 @@ def monitor_slave_run_process_status(self):
             logging.error(f"Exception when monitoring endpoint process on the slave agent.{traceback.format_exc()}")
             pass
 
+    @staticmethod
+    def get_started_time(job):
+        """Return the start timestamp of a job as whole seconds.
+
+        Tries job.started_time, then job.updated_time, and finally the current time;
+        a candidate is skipped when it is missing, cannot be parsed or is not positive.
+        """
+        for raw_time in (getattr(job, "started_time", None), getattr(job, "updated_time", None)):
+            try:
+                parsed_time = int(float(raw_time))
+            except (TypeError, ValueError, OverflowError):
+                # None, empty/garbled strings, NaN or infinity: try the next candidate.
+                continue
+            if parsed_time > 0:
+                return parsed_time
+        # Keep the return type an int like the parsed branches.
+        return int(time.time())
+
     def monitor_master_run_process_status(self, server_id, device_info_reporter=None):
         try:
             ComputeCacheManager.get_instance().set_redis_params()
@@ -447,7 +452,7 @@ def monitor_master_run_process_status(self, server_id, device_info_reporter=None
                     break
 
                 # Calc the timeout
-                started_time = int(float(job.started_time))
+                started_time = JobMonitor.get_started_time(job)
                 timeout = time.time() - started_time
 
                 # Get the timeout threshold
@@ -704,7 +709,7 @@ def monitor_slave_endpoint_status(self):
                         endpoint_name = endpoint_json.get("end_point_name", None)
                         device_ids = endpoint_json.get("device_ids", [])
 
-                        started_time = int(float(job.started_time))
+                        started_time = JobMonitor.get_started_time(job)
                         timeout = time.time() - started_time
                         if timeout > SchedulerConstants.ENDPOINT_DEPLOYMENT_DEPLOYING_TIMEOUT:
                             print(f"[Worker][{job.job_id}:{job.edge_id}] Due to timeout, "
 
@@ -570,8 +570,9 @@ def get_run_container_name(run_id: int) -> str:
     @staticmethod
     def get_docker_client(docker_args: DockerArgs) -> DockerClient:
         try:
-            client = docker.from_env()
-            client.login(username=docker_args.username, password=docker_args.password, registry=docker_args.registry)
+            client = docker.from_env(timeout=5, version="auto")
+            if docker_args.username != "" and docker_args.registry != "":
+                client.login(username=docker_args.username, password=docker_args.password, registry=docker_args.registry)
         except Exception as e:
             raise Exception(f"Failed to connect to the docker daemon, please ensure that you have "
                             f"installed Docker Desktop or Docker Engine, and the docker is running. Exception {e}")
@@ -711,6 +712,9 @@ def parse_job_type(running_json):
         job_type = job_yaml.get("job_type", None)
         job_type = job_yaml.get("task_type",
                                 SchedulerConstants.JOB_TASK_TYPE_TRAIN) if job_type is None else job_type
+        model_config = running_json_obj.get("model_config", None)
+        if model_config is not None:
+            job_type = SchedulerConstants.JOB_TASK_TYPE_DEPLOY
         return job_type
 
     @staticmethod
 
@@ -0,0 +1,146 @@
+
+from multiprocessing import Process
+from ..comm_utils import sys_utils
+from ..comm_utils.job_cleanup import JobCleanup
+from ....core.mlops import MLOpsRuntimeLog, MLOpsMetrics
+from ..scheduler_core.master_api_daemon import MasterApiDaemon
+from ..scheduler_core.account_manager import FedMLAccountManager
+from ..scheduler_core.general_constants import GeneralConstants
+from abc import ABC, abstractmethod
+
+
+class FedMLBaseMasterAgent(ABC):
+    """Base class for master agents: account login plus protocol manager lifecycle.
+
+    Subclasses provide the log directory, agent-info persistence, database setup and
+    the concrete protocol manager through the abstract hooks at the bottom.
+    """
+
+    def __init__(self):
+        self.agent_args = None  # login result, assigned by login()
+        self.master_api_daemon = None
+        self.master_api_process = None
+        self.mlops_metrics = MLOpsMetrics()
+        self.status_reporter = None
+        self.enable_simulation_cloud_agent = True
+        self.use_local_process_as_cloud_server = False
+        self.protocol_mgr = None
+
+    def login(
+            self, user_id, api_key=None, device_id=None,
+            os_name=None, role=None
+    ):
+        """Log in via FedMLAccountManager, then create, initialize and start the protocol manager.
+
+        Returns None early when the account login produces no result. If initializing the
+        protocol manager raises, the login-failed file is written, the protocol manager is
+        stopped and the exception is re-raised.
+        """
+        # Login account
+        login_result = FedMLAccountManager.get_instance().login(
+            user_id, api_key=api_key, device_id=device_id,
+            os_name=os_name, role=role
+        )
+        if login_result is None:
+            return None
+        self.agent_args = login_result
+
+        # Save the bound info
+        self._save_agent_info(
+            login_result.current_device_id + "." + login_result.os_name, login_result.edge_id)
+
+        # Init the logs for protocol manager
+        self._init_logs(login_result, login_result.edge_id)
+
+        # Create the protocol manager to communicate with the slave agents and MLOps.
+        self._create_protocol_manager(role, login_result)
+
+        # Initialize the protocol manager
+        # noinspection PyBroadException
+        try:
+            self._initialize_protocol_manager()
+        except Exception:
+            FedMLAccountManager.write_login_failed_file(is_client=False)
+            # Guard the cleanup so that a missing protocol manager cannot mask the original error.
+            if self.protocol_mgr is not None:
+                self.protocol_mgr.stop()
+            raise
+
+        # Start the protocol manager to process the messages from MLOps and slave agents.
+        self.protocol_mgr.start()
+
+    @staticmethod
+    def logout():
+        """Clean up the master run processes and all FedML server API processes."""
+        GeneralConstants.cleanup_run_process(None, is_master=True)
+        sys_utils.cleanup_all_fedml_server_api_processes()
+
+    def _create_protocol_manager(self, role, login_result):
+        """Create the protocol manager once and copy the login info and role flags onto it."""
+        if self.protocol_mgr is not None:
+            return
+        self.protocol_mgr = self._generate_protocol_manager_instance(
+            login_result, agent_config=login_result.agent_config)
+        # The login role decides which of the run_as_* flags is switched on.
+        self.protocol_mgr.run_as_edge_server_and_agent = (role == FedMLAccountManager.ROLE_EDGE_SERVER)
+        self.protocol_mgr.run_as_cloud_agent = (role == FedMLAccountManager.ROLE_CLOUD_AGENT)
+        self.protocol_mgr.run_as_cloud_server = (role == FedMLAccountManager.ROLE_CLOUD_SERVER)
+        self.protocol_mgr.args = login_result
+        self.protocol_mgr.edge_id = login_result.edge_id
+        self.protocol_mgr.unique_device_id = login_result.unique_device_id
+        self.protocol_mgr.user_name = login_result.user_name
+        self.protocol_mgr.agent_config = login_result.agent_config
+        self.protocol_mgr.enable_simulation_cloud_agent = self.enable_simulation_cloud_agent
+        self.protocol_mgr.use_local_process_as_cloud_server = self.use_local_process_as_cloud_server
+
+    def _initialize_protocol_manager(self):
+        """Set up the database and protocol, report IDLE, sync job data and start the API process."""
+        # Init local database
+        self._init_database()
+
+        # Initialize the master protocol
+        self.protocol_mgr.initialize()
+
+        # Report the IDLE status to MLOps
+        self.mlops_metrics.report_server_training_status(
+            None, GeneralConstants.MSG_MLOPS_SERVER_STATUS_IDLE, edge_id=self.agent_args.edge_id)
+
+        # Cleanup data when startup
+        JobCleanup.get_instance().sync_data_on_startup(self.agent_args.edge_id, is_client=False)
+
+        # Start the API server on master agent
+        self.master_api_daemon = MasterApiDaemon()
+        self.master_api_process = Process(target=self.master_api_daemon.run)
+        self.master_api_process.start()
+
+    def _init_logs(self, agent_args, edge_id):
+        """Stamp the logging fields onto agent_args (mutated in place) and init the runtime log."""
+        # Init runtime logs
+        in_args = agent_args
+        in_args.log_file_dir = self._get_log_file_dir()
+        in_args.run_id = 0
+        in_args.role = "server"
+        in_args.edge_id = edge_id
+        in_args.using_mlops = True
+        in_args.server_agent_id = edge_id
+        MLOpsRuntimeLog.get_instance(in_args).init_logs()
+
+    @abstractmethod
+    def _get_log_file_dir(self):
+        """Return the value stored as log_file_dir on the logging arguments."""
+        pass
+
+    @abstractmethod
+    def _save_agent_info(self, unique_device_id, edge_id):
+        """Persist the bound device id and edge id after a successful login."""
+        pass
+
+    @abstractmethod
+    def _init_database(self):
+        """Initialize the local database before the protocol manager is initialized."""
+        pass
+
+    @abstractmethod
+    def _generate_protocol_manager_instance(self, args, agent_config=None):
+        """Build the protocol manager instance that login() configures and starts."""
+        return None