9 changes: 4 additions & 5 deletions examples/cim/rl/README.md
@@ -1,10 +1,9 @@
# Container Inventory Management

This example demonstrates the use of MARO's RL toolkit to optimize container inventory management. The scenario consists of a set of ports, each acting as a learning agent, and vessels that transfer empty containers among them. Each port must decide 1) whether to load or discharge containers when a vessel arrives and 2) how many containers to load or discharge. The objective is to minimize the overall container shortage over a certain period of time. In this folder you can find:
* ``__init__.py``, the entrance of this example. You must expose a `rl_component_bundle_cls` interface in `__init__.py` (see the example file for details);
* ``config.py``, which contains general configurations for the scenario;
* ``algorithms``, which contains configurations for the Actor-Critic, DQN and discrete-MADDPG algorithms, including network configurations;
* ``env_sampler.py``, which defines state, action and reward shaping in the ``CIMEnvSampler`` class;
* ``policy_trainer.py``, which contains a registry for the policies and algorithms defined in ``algorithms``;
* ``callbacks.py``, which defines routines to be invoked at the end of training or evaluation episodes.
* ``algorithms/``, which contains configurations for the PPO, Actor-Critic, DQN and discrete-MADDPG algorithms, including network configurations;
* ``rl_component_bundle.py``, which defines all the components needed to run an RL job. You can go through the docstring of `RLComponentBundle` for a detailed explanation, or just read `CIMBundle` to learn its basic usage.

See ``README.md`` under ``examples/rl`` for details about running the single-threaded learning workflow. We recommend that you follow this example to write your own scenarios.
We recommend that you follow this example to write your own scenarios.
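A minimal sketch of the ``__init__.py`` interface mentioned above, mirroring the `examples/cim/rl/__init__.py` change later in this PR (the workflow only needs the `rl_component_bundle_cls` name to be importable from the scenario package):

```python
# examples/cim/rl/__init__.py -- minimal sketch of the required interface.
# The RL workflow imports `rl_component_bundle_cls` from the scenario package.
from .rl_component_bundle import CIMBundle as rl_component_bundle_cls

__all__ = ["rl_component_bundle_cls"]
```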
12 changes: 2 additions & 10 deletions examples/cim/rl/__init__.py
@@ -1,16 +1,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from .callbacks import post_collect, post_evaluate
from .env_sampler import agent2policy, env_sampler_creator
from .policy_trainer import device_mapping, policy_creator, trainer_creator
from .rl_component_bundle import CIMBundle as rl_component_bundle_cls

__all__ = [
"agent2policy",
"device_mapping",
"env_sampler_creator",
"policy_creator",
"post_collect",
"post_evaluate",
"trainer_creator",
"rl_component_bundle_cls",
]
2 changes: 1 addition & 1 deletion examples/cim/rl/algorithms/ac.py
@@ -49,7 +49,7 @@ def _get_v_values(self, states: torch.Tensor) -> torch.Tensor:
return self._critic(states).squeeze(-1)


def get_policy(state_dim: int, action_num: int, name: str) -> DiscretePolicyGradient:
def get_ac_policy(state_dim: int, action_num: int, name: str) -> DiscretePolicyGradient:
return DiscretePolicyGradient(name=name, policy_net=MyActorNet(state_dim, action_num))


2 changes: 1 addition & 1 deletion examples/cim/rl/algorithms/dqn.py
@@ -33,7 +33,7 @@ def _get_q_values_for_all_actions(self, states: torch.Tensor) -> torch.Tensor:
return self._fc(states)


def get_policy(state_dim: int, action_num: int, name: str) -> ValueBasedPolicy:
def get_dqn_policy(state_dim: int, action_num: int, name: str) -> ValueBasedPolicy:
return ValueBasedPolicy(
name=name,
q_net=MyQNet(state_dim, action_num),
2 changes: 1 addition & 1 deletion examples/cim/rl/algorithms/maddpg.py
@@ -56,7 +56,7 @@ def get_multi_critic_net(state_dim: int, action_dims: List[int]) -> MyMultiCriti
return MyMultiCriticNet(state_dim, action_dims)


def get_policy(state_dim: int, action_num: int, name: str) -> DiscretePolicyGradient:
def get_maddpg_policy(state_dim: int, action_num: int, name: str) -> DiscretePolicyGradient:
return DiscretePolicyGradient(name=name, policy_net=MyActorNet(state_dim, action_num))


2 changes: 1 addition & 1 deletion examples/cim/rl/algorithms/ppo.py
@@ -6,7 +6,7 @@
from .ac import MyActorNet, MyCriticNet


def get_policy(state_dim: int, action_num: int, name: str) -> DiscretePolicyGradient:
def get_ppo_policy(state_dim: int, action_num: int, name: str) -> DiscretePolicyGradient:
return DiscretePolicyGradient(name=name, policy_net=MyActorNet(state_dim, action_num))


25 changes: 0 additions & 25 deletions examples/cim/rl/callbacks.py

This file was deleted.

2 changes: 2 additions & 0 deletions examples/cim/rl/config.py
@@ -39,4 +39,6 @@
+ len(vessel_attributes)
)

action_num = len(action_shaping_conf["action_space"])

algorithm = "ppo" # ac, ppo, dqn or discrete_maddpg
27 changes: 14 additions & 13 deletions examples/cim/rl/env_sampler.py
@@ -1,18 +1,16 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Any, Dict, List, Tuple, Union

import numpy as np

from maro.rl.policy import RLPolicy
from maro.rl.rollout import AbsEnvSampler, CacheElement
from maro.simulator import Env
from maro.simulator.scenarios.cim.common import Action, ActionType, DecisionEvent

from .config import (
action_shaping_conf, algorithm, env_conf, port_attributes, reward_shaping_conf, state_shaping_conf,
vessel_attributes
action_shaping_conf, port_attributes, reward_shaping_conf, state_shaping_conf,
vessel_attributes,
)


@@ -82,13 +80,16 @@ def _post_step(self, cache_element: CacheElement, reward: Dict[Any, float]) -> N
def _post_eval_step(self, cache_element: CacheElement, reward: Dict[Any, float]) -> None:
self._post_step(cache_element, reward)

def post_collect(self, info_list: list, ep: int) -> None:
# print the env metric from each rollout worker
for info in info_list:
print(f"env summary (episode {ep}): {info['env_metric']}")

agent2policy = {agent: f"{algorithm}_{agent}.policy" for agent in Env(**env_conf).agent_idx_list}
# print the average env metric
if len(info_list) > 1:
metric_keys, num_envs = info_list[0]["env_metric"].keys(), len(info_list)
avg_metric = {key: sum(info["env_metric"][key] for info in info_list) / num_envs for key in metric_keys}
print(f"average env summary (episode {ep}): {avg_metric}")


def env_sampler_creator(policy_creator: Dict[str, Callable[[str], RLPolicy]]) -> CIMEnvSampler:
return CIMEnvSampler(
get_env=lambda: Env(**env_conf),
policy_creator=policy_creator,
agent2policy=agent2policy,
)
def post_evaluate(self, info_list: list, ep: int) -> None:
self.post_collect(info_list, ep)
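Since the new `post_collect` / `post_evaluate` hooks are interleaved with removed lines in the diff above, here is a consolidated sketch of how the two methods read inside ``CIMEnvSampler`` once the change is applied (reassembled from the added lines; the surrounding class body and indentation are assumed):

```python
class CIMEnvSampler(AbsEnvSampler):
    ...  # state, action and reward shaping methods omitted

    def post_collect(self, info_list: list, ep: int) -> None:
        # Print the env metric reported by each rollout worker.
        for info in info_list:
            print(f"env summary (episode {ep}): {info['env_metric']}")

        # Print the average env metric when more than one worker reported.
        if len(info_list) > 1:
            metric_keys, num_envs = info_list[0]["env_metric"].keys(), len(info_list)
            avg_metric = {
                key: sum(info["env_metric"][key] for info in info_list) / num_envs
                for key in metric_keys
            }
            print(f"average env summary (episode {ep}): {avg_metric}")

    def post_evaluate(self, info_list: list, ep: int) -> None:
        self.post_collect(info_list, ep)
```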
33 changes: 0 additions & 33 deletions examples/cim/rl/policy_trainer.py

This file was deleted.

84 changes: 84 additions & 0 deletions examples/cim/rl/rl_component_bundle.py
@@ -0,0 +1,84 @@
from functools import partial
from typing import Any, Callable, Dict, Optional

from examples.cim.rl.config import action_num, algorithm, env_conf, num_agents, state_dim
from examples.cim.rl.env_sampler import CIMEnvSampler
from maro.rl.policy import AbsPolicy
from maro.rl.rl_component.rl_component_bundle import RLComponentBundle
from maro.rl.rollout import AbsEnvSampler
from maro.rl.training import AbsTrainer

from .algorithms.ac import get_ac, get_ac_policy
from .algorithms.dqn import get_dqn, get_dqn_policy
from .algorithms.maddpg import get_maddpg, get_maddpg_policy
from .algorithms.ppo import get_ppo, get_ppo_policy


class CIMBundle(RLComponentBundle):
def get_env_config(self) -> dict:
return env_conf

def get_test_env_config(self) -> Optional[dict]:
return None

def get_env_sampler(self) -> AbsEnvSampler:
return CIMEnvSampler(self.env, self.test_env)

def get_agent2policy(self) -> Dict[Any, str]:
return {agent: f"{algorithm}_{agent}.policy" for agent in self.env.agent_idx_list}

def get_policy_creator(self) -> Dict[str, Callable[[], AbsPolicy]]:
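# Policy names follow the "{trainer_name}.policy" pattern: each "{algorithm}_{i}.policy" created here is
# paired with the trainer "{algorithm}_{i}" built in get_trainer_creator below (assumption: MARO associates
# policies with trainers through this shared name prefix).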
if algorithm == "ac":
policy_creator = {
f"{algorithm}_{i}.policy": partial(get_ac_policy, state_dim, action_num, f"{algorithm}_{i}.policy")
for i in range(num_agents)
}
elif algorithm == "ppo":
policy_creator = {
f"{algorithm}_{i}.policy": partial(get_ppo_policy, state_dim, action_num, f"{algorithm}_{i}.policy")
for i in range(num_agents)
}
elif algorithm == "dqn":
policy_creator = {
f"{algorithm}_{i}.policy": partial(get_dqn_policy, state_dim, action_num, f"{algorithm}_{i}.policy")
for i in range(num_agents)
}
elif algorithm == "discrete_maddpg":
policy_creator = {
f"{algorithm}_{i}.policy": partial(get_maddpg_policy, state_dim, action_num, f"{algorithm}_{i}.policy")
for i in range(num_agents)
}
else:
raise ValueError(f"Unsupported algorithm: {algorithm}")

return policy_creator

def get_trainer_creator(self) -> Dict[str, Callable[[], AbsTrainer]]:
if algorithm == "ac":
trainer_creator = {
f"{algorithm}_{i}": partial(get_ac, state_dim, f"{algorithm}_{i}")
for i in range(num_agents)
}
elif algorithm == "ppo":
trainer_creator = {
f"{algorithm}_{i}": partial(get_ppo, state_dim, f"{algorithm}_{i}")
for i in range(num_agents)
}
elif algorithm == "dqn":
trainer_creator = {
f"{algorithm}_{i}": partial(get_dqn, f"{algorithm}_{i}")
for i in range(num_agents)
}
elif algorithm == "discrete_maddpg":
trainer_creator = {
f"{algorithm}_{i}": partial(get_maddpg, state_dim, [1], f"{algorithm}_{i}")
for i in range(num_agents)
}
else:
raise ValueError(f"Unsupported algorithm: {algorithm}")

return trainer_creator
12 changes: 5 additions & 7 deletions examples/rl/README.md
@@ -12,10 +12,8 @@ There are two ways to start the RL job:

## Create Your Own Scenarios

You can create your own scenarios by supplying the necessary ingredients without worrying about putting them together in a workflow. It is necessary to create an ``__init__.py`` under your scenario folder (so that it can be treated as a package) and expose all ingredients in it. The ingredients include:
* Definitions of policies and agent-to-policy mappings. These definitions should be provided as a dictionary named ``policy_creator`` that maps a name to a function that takes the name and returns a policy instance with that name. Optionally, you may specify which policies you intend to train by providing ``trainable_policies``, which is a list of policy names. The experiences generated by these policies will be recorded by the environment sampler and used for training. The agent-to-policy mapping should be provided as a dictionary named ``agent2policy``.
* Definitions of training algorithms. These definitions should be provided as a dictionary named ``trainer_creator`` that maps a name to a function that takes the name and returns a trainer instance with that name.
* Definitions of state, action and reward shaping logic pertinent to your simulator and policies.
These definitions should be encapsulated in ``env_sampler_creator``, which is a function that takes ``policy_creator`` and returns an environment sampler;
It is possible to have customized routines invoked at the end of a roll-out episode or episode segment. These routines usually involve processing and / or rendering information collected during roll-out. To do this, first implement the ``post_step`` method in your environment sampler class to record whatever information you wish to keep track of during roll-out. Then create functions named ``post_collect`` and ``post_evaluate`` to process the information and expose them in the scenario folder's ``__init__.py``. These functions are used as callbacks in the main learning loop and executed at the end of each training or evaluation episode. See ``cim/callbacks.py`` for a simple example of how to create these functions.
* An optional dictionary named ``device_mapping`` that specifies the compute device (CPU or GPU) for each policy. If not provided, all computations will be performed on the CPU.
You can create your own scenarios by supplying the necessary ingredients without worrying about putting them together in a workflow. It is necessary to create an ``__init__.py`` under your scenario folder (so that it can be treated as a package) and expose a `rl_component_bundle_cls` interface. MARO's RL workflow uses this interface to create an `RLComponentBundle` instance and starts the RL job based on it. An `RLComponentBundle` instance defines all the components needed to run an RL job. You can go through the docstring of `RLComponentBundle` for a detailed explanation, or just read one of the examples to learn its basic usage.

## Example

For a complete example, please check `examples/cim/rl`.
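To make the bundle requirement concrete, below is a hedged skeleton wired for a single algorithm (PPO). It is assembled from the `CIMBundle` pieces shown earlier in this PR; `MyEnvSampler` and the local `config` / `algorithms` modules are placeholders for your own scenario code.

```python
# my_scenario/rl_component_bundle.py -- sketch only, assembled from the CIM example in this PR.
from functools import partial
from typing import Any, Callable, Dict, Optional

from maro.rl.policy import AbsPolicy
from maro.rl.rl_component.rl_component_bundle import RLComponentBundle
from maro.rl.rollout import AbsEnvSampler
from maro.rl.training import AbsTrainer

from .algorithms.ppo import get_ppo, get_ppo_policy  # factories as in examples/cim/rl
from .config import action_num, env_conf, num_agents, state_dim  # scenario-specific settings
from .env_sampler import MyEnvSampler  # your AbsEnvSampler subclass


class MyBundle(RLComponentBundle):
    def get_env_config(self) -> dict:
        return env_conf

    def get_test_env_config(self) -> Optional[dict]:
        return None  # reuse the training env configuration for evaluation

    def get_env_sampler(self) -> AbsEnvSampler:
        return MyEnvSampler(self.env, self.test_env)

    def get_agent2policy(self) -> Dict[Any, str]:
        # One policy per agent, named "ppo_<agent>.policy".
        return {agent: f"ppo_{agent}.policy" for agent in self.env.agent_idx_list}

    def get_policy_creator(self) -> Dict[str, Callable[[], AbsPolicy]]:
        return {
            f"ppo_{i}.policy": partial(get_ppo_policy, state_dim, action_num, f"ppo_{i}.policy")
            for i in range(num_agents)
        }

    def get_trainer_creator(self) -> Dict[str, Callable[[], AbsTrainer]]:
        return {f"ppo_{i}": partial(get_ppo, state_dim, f"ppo_{i}") for i in range(num_agents)}
```

Each creator dictionary maps a name to a zero-argument factory (`functools.partial` here), matching the `Callable[[], AbsPolicy]` and `Callable[[], AbsTrainer]` signatures shown in the diff.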
9 changes: 4 additions & 5 deletions examples/vm_scheduling/rl/README.md
@@ -2,13 +2,12 @@

A virtual machine (VM) scheduler is a cloud computing service component responsible for providing compute resources to satisfy user demands. A good resource allocation policy should aim to optimize several metrics at the same time, such as user wait time, profit, energy consumption and physical machine (PM) overload. Many commercial cloud providers use rule-based policies. Alternatively, the policy can also be optimized using reinforcement learning (RL) techniques, which involves simulating with historical data. This example demonstrates how DQN and Actor-Critic algorithms can be applied to this scenario. In this folder, you can find:

* ``__init__.py``, the entrance of this example. You must expose a `rl_component_bundle_cls` interface in `__init__.py` (see the example file for details);
* ``config.py``, which contains general configurations for the scenario;
* ``algorithms``, which contains configurations for the Actor-Critic, DQN algorithms, including network configurations;
* ``env_sampler.py``, which defines state, action and reward shaping in the ``VMEnvSampler`` class;
* ``policy_trainer.py``, which contains a registry for the policies and algorithms defined in ``algorithms``;
* ``callbacks.py``, which defines routines to be invoked at the end of training or evaluation episodes.
* ``algorithms/``, which contains configurations for the algorithms, including network configurations;
* ``rl_component_bundle.py``, which defines all the components needed to run an RL job. You can go through the docstring of `RLComponentBundle` for a detailed explanation, or just read `VMBundle` to learn its basic usage.

See ``README.md`` under ``examples/rl`` for details about running the single-threaded learning workflow. We recommend that you follow this example to write your own scenarios.
We recommend that you follow this example to write your own scenarios.


# Some Comments About the Results
11 changes: 2 additions & 9 deletions examples/vm_scheduling/rl/__init__.py
@@ -1,15 +1,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from .callbacks import post_collect, post_evaluate
from .env_sampler import agent2policy, env_sampler_creator
from .policy_trainer import policy_creator, trainer_creator
from .rl_component_bundle import VMBundle as rl_component_bundle_cls

__all__ = [
"agent2policy",
"env_sampler_creator",
"policy_creator",
"post_collect",
"post_evaluate",
"trainer_creator",
"rl_component_bundle_cls",
]
2 changes: 1 addition & 1 deletion examples/vm_scheduling/rl/algorithms/ac.py
@@ -57,7 +57,7 @@ def _get_v_values(self, states: torch.Tensor) -> torch.Tensor:
return self._critic(features).squeeze(-1)


def get_policy(state_dim: int, action_num: int, num_features: int, name: str) -> DiscretePolicyGradient:
def get_ac_policy(state_dim: int, action_num: int, num_features: int, name: str) -> DiscretePolicyGradient:
return DiscretePolicyGradient(name=name, policy_net=MyActorNet(state_dim, action_num, num_features))


7 changes: 2 additions & 5 deletions examples/vm_scheduling/rl/algorithms/dqn.py
@@ -1,8 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from typing import Dict

import numpy as np
import torch
from torch.optim import SGD
@@ -11,8 +9,7 @@
from maro.rl.exploration import MultiLinearExplorationScheduler
from maro.rl.model import DiscreteQNet, FullyConnected
from maro.rl.policy import ValueBasedPolicy
from maro.rl.training.algorithms import DQNTrainer, DQNParams

from maro.rl.training.algorithms import DQNParams, DQNTrainer

q_net_conf = {
"hidden_dims": [64, 128, 256],
@@ -54,7 +51,7 @@ def __call__(self, states, actions, num_actions, *, epsilon):
])


def get_policy(state_dim: int, action_num: int, num_features: int, name: str) -> ValueBasedPolicy:
def get_dqn_policy(state_dim: int, action_num: int, num_features: int, name: str) -> ValueBasedPolicy:
return ValueBasedPolicy(
name=name,
q_net=MyQNet(state_dim, action_num, num_features),