Skip to content

Commit 2bbe2f9

Browse files
authored
Merge pull request #2095 from FedML-AI/alexleung/dev_v070_for_refactor
Alexleung/dev v070 for refactor
2 parents 3792255 + b19eaf1 commit 2bbe2f9

File tree

80 files changed

+8943
-1174
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

80 files changed

+8943
-1174
lines changed

python/examples/launch/dump.rdb

215 KB
Binary file not shown.

python/examples/launch/hello_job_with_container.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ job_type: train # options: train, deploy, federate
4343
job_subtype: generate_training
4444

4545
docker:
46-
image: fedml/fedml-default-launch:cu12.1-u22.04
46+
image: fedml/fedml-launch-job:cu12.1-u22.04
4747
#registry: docker.io
4848
#username: my_hub_user
4949
#password: my_hub_password

python/fedml/__init__.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -90,9 +90,13 @@ def init(args=None, check_env=True, should_init_logs=True):
9090
# Windows/Linux/MacOS compatability issues on multi-processing
9191
# https://github.com/pytorch/pytorch/issues/3492
9292
"""
93-
if multiprocessing.get_start_method() != "spawn":
94-
# force all platforms (Windows/Linux/MacOS) to use the same way (spawn) for multiprocessing
95-
multiprocessing.set_start_method("spawn", force=True)
93+
if multiprocessing.get_start_method() != "fork":
94+
# force all platforms (Windows/Linux/macOS) to use the same way (fork) for multiprocessing
95+
multiprocessing.set_start_method("fork", force=True)
96+
97+
# if multiprocessing.get_start_method() != "spawn":
98+
# # force all platforms (Windows/Linux/MacOS) to use the same way (spawn) for multiprocessing
99+
# multiprocessing.set_start_method("spawn", force=True)
96100

97101
"""
98102
# https://stackoverflow.com/questions/53014306/error-15-initializing-libiomp5-dylib-but-found-libiomp5-dylib-already-initial

python/fedml/api/api_test.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44
import fedml
55

66
# Login
7-
fedml.set_env_version("local")
7+
fedml.set_env_version("test")
88
fedml.set_local_on_premise_platform_port(18080)
9-
error_code, error_msg = fedml.api.fedml_login(api_key="1316b93c82da40ce90113a2ed12f0b14")
9+
error_code, error_msg = fedml.api.fedml_login(api_key="")
1010
if error_code != 0:
1111
print("API Key is invalid!")
1212
exit(1)
@@ -18,20 +18,23 @@
1818
yaml_file = os.path.join(python_dir, "examples", "launch", "hello_job.yaml")
1919

2020
# Launch job
21+
launch_result_list = list()
2122
for i in range(0, 10):
2223
launch_result = fedml.api.launch_job(yaml_file)
24+
launch_result_list.append(launch_result)
2325
# launch_result = fedml.api.launch_job_on_cluster(yaml_file, "alex-cluster")
2426
if launch_result.result_code != 0:
2527
print(f"Failed to launch job. Reason: {launch_result.result_message}")
2628

27-
exit(1)
28-
2929
# Get job status
30-
log_result = fedml.api.run_logs(launch_result.run_id, 1, 100)
31-
if log_result is None or log_result.run_status is None:
32-
print(f"Failed to get job status.")
33-
exit(1)
34-
print(f"Run status {log_result.run_status}")
30+
while len(launch_result_list) > 0:
31+
for launch_result in launch_result_list:
32+
log_result = fedml.api.run_logs(launch_result.run_id, 1, 5)
33+
if log_result is None or log_result.run_status is None:
34+
print(f"Failed to get job status.")
35+
#exit(1)
36+
print(f"Run {launch_result.run_id}, status {log_result.run_status}")
37+
time.sleep(0.5)
3538

3639
# Get job logs
3740
time.sleep(30)

python/fedml/computing/scheduler/comm_utils/constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ class SchedulerConstants:
103103
RUN_PROCESS_TYPE_BOOTSTRAP_PROCESS = "bootstrap-process"
104104

105105
FEDML_DEFAULT_LAUNCH_CONTAINER_PREFIX = "fedml_default_launch_container"
106-
FEDML_DEFAULT_LAUNCH_IMAGE = "fedml/fedml-default-launch:cu12.1-u22.04"
106+
FEDML_DEFAULT_LAUNCH_IMAGE = "fedml/fedml-launch-job:cu12.1-u22.04"
107107
FEDML_DEFAULT_LOG_DIR = ".fedml/fedml-client/fedml/logs"
108108
FEDML_DEFAULT_DATA_DIR = ".fedml/fedml-client/fedml/data"
109109

python/fedml/computing/scheduler/comm_utils/container_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def get_instance():
2626

2727
def get_docker_client(self):
2828
try:
29-
client = docker.from_env()
29+
client = docker.from_env(timeout=5, version="auto")
3030
except Exception:
3131
logging.error("Failed to connect to the docker daemon, please ensure that you have "
3232
"installed Docker Desktop or Docker Engine, and the docker is running")
@@ -180,7 +180,7 @@ def get_container_rank_same_model(prefix: str):
180180
running_model_name = hash("model_endpoint_id_{}_name_{}_model_id_{}_name_{}_ver_{}")
181181
"""
182182
try:
183-
client = docker.from_env()
183+
client = docker.from_env(timeout=5, version="auto")
184184
except Exception:
185185
logging.error("Failed to connect to the docker daemon, please ensure that you have "
186186
"installed Docker Desktop or Docker Engine, and the docker is running")

python/fedml/computing/scheduler/comm_utils/job_cleanup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ def sync_run_process_gpu(self):
4444
ComputeCacheManager.get_instance().get_gpu_cache().get_run_info_sync_lock_key("")
4545
):
4646
count = 0
47+
client_data_interface.FedMLClientDataInterface.get_instance().create_job_table()
4748
job_list = client_data_interface.FedMLClientDataInterface.get_instance().get_jobs_from_db()
4849
for job in job_list.job_list:
4950
count += 1

python/fedml/computing/scheduler/comm_utils/job_monitor.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,8 @@ def monitor_replicas_number():
208208
endpoint_replicas_details = {}
209209
if isinstance(endpoint_detail, str):
210210
endpoint_replicas_details = json.loads(endpoint_detail)
211+
if isinstance(endpoint_replicas_details, str):
212+
endpoint_replicas_details = json.loads(endpoint_replicas_details)
211213

212214
if "result" in endpoint_replicas_details:
213215
endpoint_replica_details = {}
@@ -220,13 +222,7 @@ def monitor_replicas_number():
220222
for endpoint_id, num_replica in res_to_mlops.items():
221223
curr_version = fedml.get_env_version()
222224
num_replica_url_path = "fedmlModelServer/api/v1/endpoint/replica-info"
223-
if curr_version == "release":
224-
mlops_prefix = "https://open.fedml.ai/"
225-
elif curr_version == "test":
226-
mlops_prefix = "https://open-test.fedml.ai/"
227-
else:
228-
logging.error(f"Do not support the version {curr_version}.")
229-
return
225+
mlops_prefix = fedml._get_backend_service()
230226
url = f"{mlops_prefix}{num_replica_url_path}"
231227

232228
cached_token = FedMLModelCache.get_instance().get_end_point_token_with_eid(endpoint_id)
@@ -348,7 +344,7 @@ def monitor_slave_run_process_status(self):
348344
break
349345

350346
# Calc the timeout
351-
started_time = int(float(job.started_time))
347+
started_time = JobMonitor.get_started_time(job)
352348
timeout = time.time() - started_time
353349

354350
job_type = JobRunnerUtils.parse_job_type(job.running_json)
@@ -436,6 +432,15 @@ def monitor_slave_run_process_status(self):
436432
logging.error(f"Exception when monitoring endpoint process on the slave agent.{traceback.format_exc()}")
437433
pass
438434

435+
@staticmethod
436+
def get_started_time(job):
437+
started_time = int(float(job.started_time))
438+
if started_time <= 0:
439+
started_time = int(float(job.updated_time))
440+
if started_time <= 0:
441+
started_time = time.time()
442+
return started_time
443+
439444
def monitor_master_run_process_status(self, server_id, device_info_reporter=None):
440445
try:
441446
ComputeCacheManager.get_instance().set_redis_params()
@@ -447,7 +452,7 @@ def monitor_master_run_process_status(self, server_id, device_info_reporter=None
447452
break
448453

449454
# Calc the timeout
450-
started_time = int(float(job.started_time))
455+
started_time = JobMonitor.get_started_time(job)
451456
timeout = time.time() - started_time
452457

453458
# Get the timeout threshold
@@ -704,7 +709,7 @@ def monitor_slave_endpoint_status(self):
704709
endpoint_name = endpoint_json.get("end_point_name", None)
705710
device_ids = endpoint_json.get("device_ids", [])
706711

707-
started_time = int(float(job.started_time))
712+
started_time = JobMonitor.get_started_time(job)
708713
timeout = time.time() - started_time
709714
if timeout > SchedulerConstants.ENDPOINT_DEPLOYMENT_DEPLOYING_TIMEOUT:
710715
print(f"[Worker][{job.job_id}:{job.edge_id}] Due to timeout, "

python/fedml/computing/scheduler/comm_utils/job_utils.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -570,8 +570,9 @@ def get_run_container_name(run_id: int) -> str:
570570
@staticmethod
571571
def get_docker_client(docker_args: DockerArgs) -> DockerClient:
572572
try:
573-
client = docker.from_env()
574-
client.login(username=docker_args.username, password=docker_args.password, registry=docker_args.registry)
573+
client = docker.from_env(timeout=5, version="auto")
574+
if docker_args.username != "" and docker_args.registry != "":
575+
client.login(username=docker_args.username, password=docker_args.password, registry=docker_args.registry)
575576
except Exception as e:
576577
raise Exception(f"Failed to connect to the docker daemon, please ensure that you have "
577578
f"installed Docker Desktop or Docker Engine, and the docker is running. Exception {e}")
@@ -711,6 +712,9 @@ def parse_job_type(running_json):
711712
job_type = job_yaml.get("job_type", None)
712713
job_type = job_yaml.get("task_type",
713714
SchedulerConstants.JOB_TASK_TYPE_TRAIN) if job_type is None else job_type
715+
model_config = running_json_obj.get("model_config", None)
716+
if model_config is not None:
717+
job_type = SchedulerConstants.JOB_TASK_TYPE_DEPLOY
714718
return job_type
715719

716720
@staticmethod
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
2+
from multiprocessing import Process
3+
from ..comm_utils import sys_utils
4+
from ..comm_utils.job_cleanup import JobCleanup
5+
from ....core.mlops import MLOpsRuntimeLog, MLOpsMetrics
6+
from ..scheduler_core.master_api_daemon import MasterApiDaemon
7+
from ..scheduler_core.account_manager import FedMLAccountManager
8+
from ..scheduler_core.general_constants import GeneralConstants
9+
from abc import ABC, abstractmethod
10+
11+
12+
class FedMLBaseMasterAgent(ABC):
13+
14+
def __init__(self):
15+
self.agent_args = None
16+
self.master_api_daemon = None
17+
self.master_api_process = None
18+
self.mlops_metrics = MLOpsMetrics()
19+
self.status_reporter = None
20+
self.enable_simulation_cloud_agent = True
21+
self.use_local_process_as_cloud_server = False
22+
self.protocol_mgr = None
23+
24+
def login(
25+
self, user_id, api_key=None, device_id=None,
26+
os_name=None, role=None
27+
):
28+
# Login account
29+
login_result = FedMLAccountManager.get_instance().login(
30+
user_id, api_key=api_key, device_id=device_id,
31+
os_name=os_name, role=role
32+
)
33+
if login_result is not None:
34+
self.agent_args = login_result
35+
else:
36+
return None
37+
38+
# Save the bound info
39+
self._save_agent_info(
40+
login_result.current_device_id + "." + login_result.os_name, login_result.edge_id)
41+
42+
# Init the logs for protocol manager
43+
self._init_logs(login_result, login_result.edge_id)
44+
45+
# Create the protocol manager to communicate with the slave agents and MLOps.
46+
self._create_protocol_manager(role, login_result)
47+
48+
# Initialize the protocol manager
49+
# noinspection PyBoardException
50+
try:
51+
self._initialize_protocol_manager()
52+
except Exception as e:
53+
FedMLAccountManager.write_login_failed_file(is_client=False)
54+
self.protocol_mgr.stop()
55+
raise e
56+
57+
# Start the protocol manager to process the messages from MLOps and slave agents.
58+
self.protocol_mgr.start()
59+
60+
@staticmethod
61+
def logout():
62+
GeneralConstants.cleanup_run_process(None, is_master=True)
63+
sys_utils.cleanup_all_fedml_server_api_processes()
64+
65+
def _create_protocol_manager(self, role, login_result):
66+
if self.protocol_mgr is not None:
67+
return
68+
self.protocol_mgr = self._generate_protocol_manager_instance(
69+
login_result, agent_config=login_result.agent_config)
70+
self.protocol_mgr.run_as_edge_server_and_agent = True \
71+
if role == FedMLAccountManager.ROLE_EDGE_SERVER else False
72+
self.protocol_mgr.run_as_cloud_agent = True if role == FedMLAccountManager.ROLE_CLOUD_AGENT else False
73+
self.protocol_mgr.run_as_cloud_server = True if role == FedMLAccountManager.ROLE_CLOUD_SERVER else False
74+
self.protocol_mgr.args = login_result
75+
self.protocol_mgr.edge_id = login_result.edge_id
76+
self.protocol_mgr.unique_device_id = login_result.unique_device_id
77+
self.protocol_mgr.user_name = login_result.user_name
78+
self.protocol_mgr.agent_config = login_result.agent_config
79+
self.protocol_mgr.enable_simulation_cloud_agent = self.enable_simulation_cloud_agent
80+
self.protocol_mgr.use_local_process_as_cloud_server = self.use_local_process_as_cloud_server
81+
82+
def _initialize_protocol_manager(self):
83+
# Init local database
84+
self._init_database()
85+
86+
# Initialize the master protocol
87+
self.protocol_mgr.initialize()
88+
89+
# Report the IDLE status to MLOps
90+
self.mlops_metrics.report_server_training_status(
91+
None, GeneralConstants.MSG_MLOPS_SERVER_STATUS_IDLE, edge_id=self.agent_args.edge_id)
92+
93+
# Cleanup data when startup
94+
JobCleanup.get_instance().sync_data_on_startup(self.agent_args.edge_id, is_client=False)
95+
96+
# Start the API server on master agent
97+
self.master_api_daemon = MasterApiDaemon()
98+
self.master_api_process = Process(target=self.master_api_daemon.run)
99+
self.master_api_process.start()
100+
101+
def _init_logs(self, agent_args, edge_id):
102+
# Init runtime logs
103+
in_args = agent_args
104+
in_args.log_file_dir = self._get_log_file_dir()
105+
in_args.run_id = 0
106+
in_args.role = "server"
107+
in_args.edge_id = edge_id
108+
in_args.using_mlops = True
109+
in_args.server_agent_id = edge_id
110+
MLOpsRuntimeLog.get_instance(in_args).init_logs()
111+
112+
@abstractmethod
113+
def _get_log_file_dir(self):
114+
pass
115+
116+
@abstractmethod
117+
def _save_agent_info(self, unique_device_id, edge_id):
118+
pass
119+
120+
@abstractmethod
121+
def _init_database(self):
122+
pass
123+
124+
@abstractmethod
125+
def _generate_protocol_manager_instance(self, args, agent_config=None):
126+
return None

0 commit comments

Comments
 (0)