[bug] distinguish the run_id to avoid incompatible messages across ML frameworks
chaoyanghe committed Aug 27, 2022
1 parent a71ccf1 commit 5d025f8
Showing 9 changed files with 43 additions and 52 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pylint.yml
@@ -48,4 +48,4 @@ jobs:
pip install mxnet
pip install jax
pip install ptflops
pylint --rcfile=build_tools/lint/.pylintrc --disable=C,R,W,I ./fedml
pylint --rcfile=build_tools/lint/.pylintrc --disable=C,R,W,I ./
4 changes: 2 additions & 2 deletions doc/en/starter/installation.md
@@ -19,8 +19,8 @@ pip install "fedml[tensorflow]"
pip install "fedml[jax]"
pip install "fedml[mxnet]"
```
Note that the commands above only install the CPU version.
If you need GPU/TPU version, please follow TensorFlow/Jax/MXNet official guidance.
The above commands work properly in a Linux environment.
For Windows, macOS (Intel), and macOS (M1), you may need to follow the official TensorFlow/Jax/MXNet guidance to fix related installation issues.

## Installing FedML with Anaconda

@@ -1,3 +1,3 @@
#!/usr/bin/env bash
RANK=$1
python3 jax_haiku_client.py --cf config/fedml_config.yaml --rank $RANK --role client
python3 jax_haiku_client.py --cf config/fedml_config.yaml --rank $RANK --role client --run_id jax_run_example
@@ -1,3 +1,3 @@
#!/usr/bin/env bash

python3 jax_haiku_server.py --cf config/fedml_config.yaml --rank 0 --role server
python3 jax_haiku_server.py --cf config/fedml_config.yaml --rank 0 --role server --run_id jax_run_example
@@ -1,3 +1,3 @@
#!/usr/bin/env bash
RANK=$1
python3 mxnet_client.py --cf config/fedml_config.yaml --rank $RANK --role client
python3 mxnet_client.py --cf config/fedml_config.yaml --rank $RANK --role client --run_id mxnet_run_example
@@ -1,3 +1,3 @@
#!/usr/bin/env bash

python3 mxnet_server.py --cf config/fedml_config.yaml --rank 0 --role server
python3 mxnet_server.py --cf config/fedml_config.yaml --rank 0 --role server --run_id mxnet_run_example
@@ -1,3 +1,3 @@
#!/usr/bin/env bash
RANK=$1
python3 tf_client.py --cf config/fedml_config.yaml --rank $RANK --role client
python3 tf_client.py --cf config/fedml_config.yaml --rank $RANK --role client --run_id tf_run_example
@@ -1,3 +1,3 @@
#!/usr/bin/env bash

python tf_server.py --cf config/fedml_config.yaml --rank 0 --role server
python tf_server.py --cf config/fedml_config.yaml --rank 0 --role server --run_id tf_run_example
77 changes: 34 additions & 43 deletions python/fedml/__init__.py
@@ -1,12 +1,11 @@
import logging
import multiprocessing
import os
import random

import multiprocessing
import numpy as np
import torch


import fedml
from .cli.env.collect_env import collect_env
from .constants import (
@@ -17,6 +16,7 @@
FEDML_TRAINING_PLATFORM_CROSS_SILO,
FEDML_TRAINING_PLATFORM_CROSS_DEVICE,
)
from .core.common.ml_engine_backend import MLEngineBackend

_global_training_type = None
_global_comm_backend = None
@@ -45,7 +45,7 @@ def init(args=None):
"""
# https://stackoverflow.com/questions/53014306/error-15-initializing-libiomp5-dylib-but-found-libiomp5-dylib-already-initial
"""
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

seed = args.random_seed
random.seed(seed)
@@ -56,18 +56,10 @@ def init(args=None):

mlops.pre_setup(args)

if (
args.training_type == FEDML_TRAINING_PLATFORM_SIMULATION
and hasattr(args, "backend")
and args.backend == "MPI"
):
if args.training_type == FEDML_TRAINING_PLATFORM_SIMULATION and hasattr(args, "backend") and args.backend == "MPI":
args = init_simulation_mpi(args)

elif (
args.training_type == FEDML_TRAINING_PLATFORM_SIMULATION
and hasattr(args, "backend")
and args.backend == "sp"
):
elif args.training_type == FEDML_TRAINING_PLATFORM_SIMULATION and hasattr(args, "backend") and args.backend == "sp":
args = init_simulation_sp(args)
elif (
args.training_type == FEDML_TRAINING_PLATFORM_SIMULATION
@@ -93,6 +85,7 @@

manage_profiling_args(args)

update_run_id(args)
update_client_id_list(args)

mlops.init(args)
@@ -158,8 +151,9 @@ def manage_profiling_args(args):
wandb_args["group"] = "Test1"
wandb_args["name"] = f"Client {args.rank}"
wandb_args["job_type"] = str(args.rank)

import wandb

wandb.init(**wandb_args)

from .core.mlops.mlops_profiler_event import MLOpsProfilerEvent
@@ -174,9 +168,7 @@ def manage_cuda_rpc_args(args):

if args.enable_cuda_rpc and args.backend != "TRPC":
args.enable_cuda_rpc = False
print(
"Argument enable_cuda_rpc is ignored. Cuda RPC only works with TRPC backend."
)
print("Argument enable_cuda_rpc is ignored. Cuda RPC only works with TRPC backend.")

# When Cuda RPC is not used, tensors should be moved to cpu before transfer with TRPC
if (not args.enable_cuda_rpc) and args.backend == "TRPC":
@@ -188,9 +180,7 @@ def manage_cuda_rpc_args(args):
if args.enable_cuda_rpc:
if not hasattr(args, "cuda_rpc_gpu_mapping"):
raise Exception("Invalid config. cuda_rpc_gpu_mapping is required when enable_cuda_rpc=True")
assert (
type(args.cuda_rpc_gpu_mapping) is dict
), "Invalid cuda_rpc_gpu_mapping type. Expected dict"
assert type(args.cuda_rpc_gpu_mapping) is dict, "Invalid cuda_rpc_gpu_mapping type. Expected dict"
assert (
len(args.cuda_rpc_gpu_mapping) == args.worker_num + 1
), f"Invalid cuda_rpc_gpu_mapping. Expected list of size {args.worker_num + 1}"
@@ -211,9 +201,7 @@ def manage_mpi_args(args):
if process_id == 0:
args.role = "server"
# args.worker_num = worker_num
assert (
args.worker_num + 1 == world_size
), f"Invalid number of mpi processes. Expected {args.worker_num + 1}"
assert args.worker_num + 1 == world_size, f"Invalid number of mpi processes. Expected {args.worker_num + 1}"
else:
args.comm = None

@@ -281,15 +269,9 @@ def update_client_id_list(args):
Generate args.client_id_list for CLI mode, where args.client_id_list is set to None.
In MLOps mode, args.client_id_list is set to the real-time client id list selected by the UI (not starting from 1).
"""
if not hasattr(args, "using_mlops") or (
hasattr(args, "using_mlops") and not args.using_mlops
):
if not hasattr(args, "using_mlops") or (hasattr(args, "using_mlops") and not args.using_mlops):
print("args.client_id_list = {}".format(print(args.client_id_list)))
if (
args.client_id_list is None
or args.client_id_list == "None"
or args.client_id_list == "[]"
):
if args.client_id_list is None or args.client_id_list == "None" or args.client_id_list == "[]":
if (
args.training_type == FEDML_TRAINING_PLATFORM_CROSS_DEVICE
or args.training_type == FEDML_TRAINING_PLATFORM_CROSS_SILO
@@ -299,21 +281,13 @@
for client_idx in range(args.client_num_per_round):
client_id_list.append(client_idx + 1)
args.client_id_list = str(client_id_list)
print(
"------------------server client_id_list = {}-------------------".format(
args.client_id_list
)
)
print("------------------server client_id_list = {}-------------------".format(args.client_id_list))
else:
# For the client, we only include its own client id in the list, not the others.
client_id_list = []
client_id_list.append(args.rank)
args.client_id_list = str(client_id_list)
print(
"------------------client client_id_list = {}-------------------".format(
args.client_id_list
)
)
print("------------------client client_id_list = {}-------------------".format(args.client_id_list))
else:
print(
"training_type != FEDML_TRAINING_PLATFORM_CROSS_DEVICE and training_type != FEDML_TRAINING_PLATFORM_CROSS_SILO"
@@ -324,6 +298,24 @@ def update_client_id_list(args):
print("using_mlops = true")


def update_run_id(args):
"""
For different ML frameworks (e.g., TF, PyTorch, Jax, MXNet), we need to distinguish the run_id to
avoid incompatible messages across ML frameworks.
"""
if hasattr(args, MLEngineBackend.ml_engine_args_flag):
if args.ml_engine == MLEngineBackend.ml_engine_backend_tf:
args.run_id += "tf"
elif args.ml_engine == MLEngineBackend.ml_engine_backend_jax:
args.run_id += "jax"
elif args.ml_engine == MLEngineBackend.ml_engine_backend_mxnet:
args.run_id += "mxnet"
else:
args.run_id += "torch"
else:
args.run_id += "torch"
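
Combined with the launch scripts above, the effect is that a user-supplied id such as jax_run_example is suffixed with the engine name before mlops.init(args) runs, so runs from different ML frameworks can never exchange messages under the same run_id. A minimal sketch of the behavior, re-implemented outside fedml for illustration; it assumes args.run_id is a string and uses plain strings in place of the MLEngineBackend constants, whose actual values this diff does not show:

```python
from types import SimpleNamespace

def update_run_id_sketch(args):
    # Illustrative re-implementation of update_run_id above; assumes the
    # MLEngineBackend constants are the plain strings "tf", "jax", "mxnet".
    suffix = {"tf": "tf", "jax": "jax", "mxnet": "mxnet"}.get(
        getattr(args, "ml_engine", None), "torch"  # fallback branch: "torch"
    )
    args.run_id += suffix
    return args

print(update_run_id_sketch(SimpleNamespace(run_id="jax_run_example", ml_engine="jax")).run_id)
# -> jax_run_examplejax
print(update_run_id_sketch(SimpleNamespace(run_id="jax_run_example")).run_id)
# -> jax_run_exampletorch  (no ml_engine flag set)
```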


def init_cross_device(args):
args.rank = 0 # only server runs on Python package
return args
@@ -351,7 +343,6 @@ def run_distributed():
from .launch_cross_device import run_mnn_server



from .runner import FedMLRunner

__all__ = [
@@ -366,4 +357,4 @@ def run_distributed():
"run_hierarchical_cross_silo_server",
"run_hierarchical_cross_silo_client",
"run_mnn_server",
]
]
