[bug] distinguish the run_id to avoid incompatible messages across ML frameworks
chaoyanghe committed Aug 27, 2022
1 parent a71ccf1 commit 5d025f8
Showing 9 changed files with 43 additions and 52 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pylint.yml
@@ -48,4 +48,4 @@ jobs:
pip install mxnet
pip install jax
pip install ptflops
pylint --rcfile=build_tools/lint/.pylintrc --disable=C,R,W,I ./fedml
pylint --rcfile=build_tools/lint/.pylintrc --disable=C,R,W,I ./
4 changes: 2 additions & 2 deletions doc/en/starter/installation.md
@@ -19,8 +19,8 @@ pip install "fedml[tensorflow]"
pip install "fedml[jax]"
pip install "fedml[mxnet]"
```
Note that the commands above only install the CPU version.
If you need GPU/TPU version, please follow TensorFlow/Jax/MXNet official guidance.
The above commands work properly in a Linux environment.
For Windows, macOS (Intel), and macOS (M1), you may need to follow the official TensorFlow/Jax/MXNet guidance to fix related installation issues.

## Installing FedML with Anaconda

@@ -1,3 +1,3 @@
#!/usr/bin/env bash
RANK=$1
python3 jax_haiku_client.py --cf config/fedml_config.yaml --rank $RANK --role client
python3 jax_haiku_client.py --cf config/fedml_config.yaml --rank $RANK --role client --run_id jax_run_example
@@ -1,3 +1,3 @@
#!/usr/bin/env bash

python3 jax_haiku_server.py --cf config/fedml_config.yaml --rank 0 --role server
python3 jax_haiku_server.py --cf config/fedml_config.yaml --rank 0 --role server --run_id jax_run_example
@@ -1,3 +1,3 @@
#!/usr/bin/env bash
RANK=$1
python3 mxnet_client.py --cf config/fedml_config.yaml --rank $RANK --role client
python3 mxnet_client.py --cf config/fedml_config.yaml --rank $RANK --role client --run_id mxnet_run_example
@@ -1,3 +1,3 @@
#!/usr/bin/env bash

python3 mxnet_server.py --cf config/fedml_config.yaml --rank 0 --role server
python3 mxnet_server.py --cf config/fedml_config.yaml --rank 0 --role server --run_id mxnet_run_example
@@ -1,3 +1,3 @@
#!/usr/bin/env bash
RANK=$1
python3 tf_client.py --cf config/fedml_config.yaml --rank $RANK --role client
python3 tf_client.py --cf config/fedml_config.yaml --rank $RANK --role client --run_id tf_run_example
@@ -1,3 +1,3 @@
#!/usr/bin/env bash

python tf_server.py --cf config/fedml_config.yaml --rank 0 --role server
python tf_server.py --cf config/fedml_config.yaml --rank 0 --role server --run_id tf_run_example
77 changes: 34 additions & 43 deletions python/fedml/__init__.py
@@ -1,12 +1,11 @@
import logging
import multiprocessing
import os
import random

import multiprocessing
import numpy as np
import torch


import fedml
from .cli.env.collect_env import collect_env
from .constants import (
@@ -17,6 +16,7 @@
FEDML_TRAINING_PLATFORM_CROSS_SILO,
FEDML_TRAINING_PLATFORM_CROSS_DEVICE,
)
from .core.common.ml_engine_backend import MLEngineBackend

_global_training_type = None
_global_comm_backend = None
@@ -45,7 +45,7 @@ def init(args=None):
"""
# https://stackoverflow.com/questions/53014306/error-15-initializing-libiomp5-dylib-but-found-libiomp5-dylib-already-initial
"""
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

seed = args.random_seed
random.seed(seed)
@@ -56,18 +56,10 @@ def init(args=None):

mlops.pre_setup(args)

if (
args.training_type == FEDML_TRAINING_PLATFORM_SIMULATION
and hasattr(args, "backend")
and args.backend == "MPI"
):
if args.training_type == FEDML_TRAINING_PLATFORM_SIMULATION and hasattr(args, "backend") and args.backend == "MPI":
args = init_simulation_mpi(args)

elif (
args.training_type == FEDML_TRAINING_PLATFORM_SIMULATION
and hasattr(args, "backend")
and args.backend == "sp"
):
elif args.training_type == FEDML_TRAINING_PLATFORM_SIMULATION and hasattr(args, "backend") and args.backend == "sp":
args = init_simulation_sp(args)
elif (
args.training_type == FEDML_TRAINING_PLATFORM_SIMULATION
@@ -93,6 +85,7 @@

manage_profiling_args(args)

update_run_id(args)
update_client_id_list(args)

mlops.init(args)
@@ -158,8 +151,9 @@ def manage_profiling_args(args):
wandb_args["group"] = "Test1"
wandb_args["name"] = f"Client {args.rank}"
wandb_args["job_type"] = str(args.rank)

import wandb

wandb.init(**wandb_args)

from .core.mlops.mlops_profiler_event import MLOpsProfilerEvent
@@ -174,9 +168,7 @@ def manage_cuda_rpc_args(args):

if args.enable_cuda_rpc and args.backend != "TRPC":
args.enable_cuda_rpc = False
print(
"Argument enable_cuda_rpc is ignored. Cuda RPC only works with TRPC backend."
)
print("Argument enable_cuda_rpc is ignored. Cuda RPC only works with TRPC backend.")

# When Cuda RPC is not used, tensors should be moved to cpu before transfer with TRPC
if (not args.enable_cuda_rpc) and args.backend == "TRPC":
@@ -188,9 +180,7 @@ def manage_cuda_rpc_args(args):
if args.enable_cuda_rpc:
if not hasattr(args, "cuda_rpc_gpu_mapping"):
raise Exception("Invalid config. cuda_rpc_gpu_mapping is required when enable_cuda_rpc=True")
assert (
type(args.cuda_rpc_gpu_mapping) is dict
), "Invalid cuda_rpc_gpu_mapping type. Expected dict"
assert type(args.cuda_rpc_gpu_mapping) is dict, "Invalid cuda_rpc_gpu_mapping type. Expected dict"
assert (
len(args.cuda_rpc_gpu_mapping) == args.worker_num + 1
), f"Invalid cuda_rpc_gpu_mapping. Expected list of size {args.worker_num + 1}"
@@ -211,9 +201,7 @@ def manage_mpi_args(args):
if process_id == 0:
args.role = "server"
# args.worker_num = worker_num
assert (
args.worker_num + 1 == world_size
), f"Invalid number of mpi processes. Expected {args.worker_num + 1}"
assert args.worker_num + 1 == world_size, f"Invalid number of mpi processes. Expected {args.worker_num + 1}"
else:
args.comm = None

@@ -281,15 +269,9 @@ def update_client_id_list(args):
Generate args.client_id_list for CLI mode, where args.client_id_list is set to None.
In MLOps mode, args.client_id_list is set to the real-time client id list selected by the UI (not starting from 1).
"""
if not hasattr(args, "using_mlops") or (
hasattr(args, "using_mlops") and not args.using_mlops
):
if not hasattr(args, "using_mlops") or (hasattr(args, "using_mlops") and not args.using_mlops):
print("args.client_id_list = {}".format(print(args.client_id_list)))
if (
args.client_id_list is None
or args.client_id_list == "None"
or args.client_id_list == "[]"
):
if args.client_id_list is None or args.client_id_list == "None" or args.client_id_list == "[]":
if (
args.training_type == FEDML_TRAINING_PLATFORM_CROSS_DEVICE
or args.training_type == FEDML_TRAINING_PLATFORM_CROSS_SILO
@@ -299,21 +281,13 @@
for client_idx in range(args.client_num_per_round):
client_id_list.append(client_idx + 1)
args.client_id_list = str(client_id_list)
print(
"------------------server client_id_list = {}-------------------".format(
args.client_id_list
)
)
print("------------------server client_id_list = {}-------------------".format(args.client_id_list))
else:
# For the client, we only include its own client id in the list, not the others.
client_id_list = []
client_id_list.append(args.rank)
args.client_id_list = str(client_id_list)
print(
"------------------client client_id_list = {}-------------------".format(
args.client_id_list
)
)
print("------------------client client_id_list = {}-------------------".format(args.client_id_list))
else:
print(
"training_type != FEDML_TRAINING_PLATFORM_CROSS_DEVICE and training_type != FEDML_TRAINING_PLATFORM_CROSS_SILO"
@@ -324,6 +298,24 @@ def update_client_id_list(args):
print("using_mlops = true")


def update_run_id(args):
"""
For different ML frameworks (e.g., TF, PyTorch, Jax, MXNet), we need to distinguish the run_id to
avoid incompatible messages across ML frameworks.
"""
if hasattr(args, MLEngineBackend.ml_engine_args_flag):
if args.ml_engine == MLEngineBackend.ml_engine_backend_tf:
args.run_id += "tf"
elif args.ml_engine == MLEngineBackend.ml_engine_backend_jax:
args.run_id += "jax"
elif args.ml_engine == MLEngineBackend.ml_engine_backend_mxnet:
args.run_id += "mxnet"
else:
args.run_id += "torch"
else:
args.run_id += "torch"
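
Combined with the launch scripts above, the effect is that a user-supplied id such as jax_run_example is suffixed with the engine name before mlops.init(args) runs, so runs from different ML frameworks can never exchange messages under the same run_id. A minimal sketch of the behavior, re-implemented outside fedml for illustration; it assumes args.run_id is a string and uses plain strings in place of the MLEngineBackend constants, whose actual values this diff does not show:

```python
from types import SimpleNamespace

def update_run_id_sketch(args):
    # Illustrative re-implementation of update_run_id above; assumes the
    # MLEngineBackend constants are the plain strings "tf", "jax", "mxnet".
    suffix = {"tf": "tf", "jax": "jax", "mxnet": "mxnet"}.get(
        getattr(args, "ml_engine", None), "torch"  # fallback branch: "torch"
    )
    args.run_id += suffix
    return args

print(update_run_id_sketch(SimpleNamespace(run_id="jax_run_example", ml_engine="jax")).run_id)
# -> jax_run_examplejax
print(update_run_id_sketch(SimpleNamespace(run_id="jax_run_example")).run_id)
# -> jax_run_exampletorch  (no ml_engine flag set)
```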


def init_cross_device(args):
args.rank = 0 # only server runs on Python package
return args
@@ -351,7 +343,6 @@ def run_distributed():
from .launch_cross_device import run_mnn_server



from .runner import FedMLRunner

__all__ = [
@@ -366,4 +357,4 @@ def run_distributed():
"run_hierarchical_cross_silo_server",
"run_hierarchical_cross_silo_client",
"run_mnn_server",
]
]
