9 changes: 4 additions & 5 deletions examples/cim/rl/README.md
@@ -1,10 +1,9 @@
# Container Inventory Management

This example demonstrates the use of MARO's RL toolkit to optimize container inventory management. The scenario consists of a set of ports, each acting as a learning agent, and vessels that transfer empty containers among them. Each port must decide 1) whether to load or discharge containers when a vessel arrives and 2) how many containers to load or discharge. The objective is to minimize the overall container shortage over a certain period of time. In this folder you can find:
* ``__init__.py``, the entrance of this example. You must expose a `rl_component_bundle_cls` interface in `__init__.py` (see the example file for details);
* ``config.py``, which contains general configurations for the scenario;
* ``algorithms``, which contains configurations for the Actor-Critic, DQN and discrete-MADDPG algorithms, including network configurations;
* ``env_sampler.py``, which defines state, action and reward shaping in the ``CIMEnvSampler`` class;
* ``policy_trainer.py``, which contains a registry for the policies and algorithms defined in ``algorithms``;
* ``callbacks.py``, which defines routines to be invoked at the end of training or evaluation episodes.
* ``algorithms/``, which contains configurations for the PPO, Actor-Critic, DQN and discrete-MADDPG algorithms, including network configurations;
* ``rl_component_bundle.py``, which defines all the components needed to run an RL job. You can go through the docstring of `RLComponentBundle` for a detailed explanation, or just read `CIMBundle` to learn its basic usage.

See ``README.md`` under ``examples/rl`` for details about running the single-threaded learning workflow. We recommend that you follow this example to write your own scenarios.
We recommend that you follow this example to write your own scenarios.
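A minimal sketch of the ``__init__.py`` interface mentioned above, mirroring the `examples/cim/rl/__init__.py` change later in this PR (the workflow only needs the `rl_component_bundle_cls` name to be importable from the scenario package):

```python
# examples/cim/rl/__init__.py -- minimal sketch of the required interface.
# The RL workflow imports `rl_component_bundle_cls` from the scenario package.
from .rl_component_bundle import CIMBundle as rl_component_bundle_cls

__all__ = ["rl_component_bundle_cls"]
```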
12 changes: 2 additions & 10 deletions examples/cim/rl/__init__.py
@@ -1,16 +1,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from .callbacks import post_collect, post_evaluate
from .env_sampler import agent2policy, env_sampler_creator
from .policy_trainer import device_mapping, policy_creator, trainer_creator
from .rl_component_bundle import CIMBundle as rl_component_bundle_cls

__all__ = [
"agent2policy",
"device_mapping",
"env_sampler_creator",
"policy_creator",
"post_collect",
"post_evaluate",
"trainer_creator",
"rl_component_bundle_cls",
]
2 changes: 1 addition & 1 deletion examples/cim/rl/algorithms/ac.py
@@ -49,7 +49,7 @@ def _get_v_values(self, states: torch.Tensor) -> torch.Tensor:
return self._critic(states).squeeze(-1)


def get_policy(state_dim: int, action_num: int, name: str) -> DiscretePolicyGradient:
def get_ac_policy(state_dim: int, action_num: int, name: str) -> DiscretePolicyGradient:
return DiscretePolicyGradient(name=name, policy_net=MyActorNet(state_dim, action_num))


2 changes: 1 addition & 1 deletion examples/cim/rl/algorithms/dqn.py
@@ -33,7 +33,7 @@ def _get_q_values_for_all_actions(self, states: torch.Tensor) -> torch.Tensor:
return self._fc(states)


def get_policy(state_dim: int, action_num: int, name: str) -> ValueBasedPolicy:
def get_dqn_policy(state_dim: int, action_num: int, name: str) -> ValueBasedPolicy:
return ValueBasedPolicy(
name=name,
q_net=MyQNet(state_dim, action_num),
2 changes: 1 addition & 1 deletion examples/cim/rl/algorithms/maddpg.py
@@ -56,7 +56,7 @@ def get_multi_critic_net(state_dim: int, action_dims: List[int]) -> MyMultiCriti
return MyMultiCriticNet(state_dim, action_dims)


def get_policy(state_dim: int, action_num: int, name: str) -> DiscretePolicyGradient:
def get_maddpg_policy(state_dim: int, action_num: int, name: str) -> DiscretePolicyGradient:
return DiscretePolicyGradient(name=name, policy_net=MyActorNet(state_dim, action_num))


2 changes: 1 addition & 1 deletion examples/cim/rl/algorithms/ppo.py
@@ -6,7 +6,7 @@
from .ac import MyActorNet, MyCriticNet


def get_policy(state_dim: int, action_num: int, name: str) -> DiscretePolicyGradient:
def get_ppo_policy(state_dim: int, action_num: int, name: str) -> DiscretePolicyGradient:
return DiscretePolicyGradient(name=name, policy_net=MyActorNet(state_dim, action_num))


25 changes: 0 additions & 25 deletions examples/cim/rl/callbacks.py

This file was deleted.

2 changes: 2 additions & 0 deletions examples/cim/rl/config.py
@@ -39,4 +39,6 @@
+ len(vessel_attributes)
)

action_num = len(action_shaping_conf["action_space"])

algorithm = "ppo" # ac, ppo, dqn or discrete_maddpg
27 changes: 14 additions & 13 deletions examples/cim/rl/env_sampler.py
@@ -1,18 +1,16 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Any, Dict, List, Tuple, Union

import numpy as np

from maro.rl.policy import RLPolicy
from maro.rl.rollout import AbsEnvSampler, CacheElement
from maro.simulator import Env
from maro.simulator.scenarios.cim.common import Action, ActionType, DecisionEvent

from .config import (
action_shaping_conf, algorithm, env_conf, port_attributes, reward_shaping_conf, state_shaping_conf,
vessel_attributes
action_shaping_conf, port_attributes, reward_shaping_conf, state_shaping_conf,
vessel_attributes,
)


@@ -82,13 +80,16 @@ def _post_step(self, cache_element: CacheElement, reward: Dict[Any, float]) -> N
def _post_eval_step(self, cache_element: CacheElement, reward: Dict[Any, float]) -> None:
self._post_step(cache_element, reward)

def post_collect(self, info_list: list, ep: int) -> None:
# print the env metric from each rollout worker
for info in info_list:
print(f"env summary (episode {ep}): {info['env_metric']}")

agent2policy = {agent: f"{algorithm}_{agent}.policy" for agent in Env(**env_conf).agent_idx_list}
# print the average env metric
if len(info_list) > 1:
metric_keys, num_envs = info_list[0]["env_metric"].keys(), len(info_list)
avg_metric = {key: sum(info["env_metric"][key] for info in info_list) / num_envs for key in metric_keys}
print(f"average env summary (episode {ep}): {avg_metric}")


def env_sampler_creator(policy_creator: Dict[str, Callable[[str], RLPolicy]]) -> CIMEnvSampler:
return CIMEnvSampler(
get_env=lambda: Env(**env_conf),
policy_creator=policy_creator,
agent2policy=agent2policy,
)
def post_evaluate(self, info_list: list, ep: int) -> None:
self.post_collect(info_list, ep)
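Since the new `post_collect` / `post_evaluate` hooks are interleaved with removed lines in the diff above, here is a consolidated sketch of how the two methods read inside ``CIMEnvSampler`` once the change is applied (reassembled from the added lines; the surrounding class body and indentation are assumed):

```python
class CIMEnvSampler(AbsEnvSampler):
    ...  # state, action and reward shaping methods omitted

    def post_collect(self, info_list: list, ep: int) -> None:
        # Print the env metric reported by each rollout worker.
        for info in info_list:
            print(f"env summary (episode {ep}): {info['env_metric']}")

        # Print the average env metric when more than one worker reported.
        if len(info_list) > 1:
            metric_keys, num_envs = info_list[0]["env_metric"].keys(), len(info_list)
            avg_metric = {
                key: sum(info["env_metric"][key] for info in info_list) / num_envs
                for key in metric_keys
            }
            print(f"average env summary (episode {ep}): {avg_metric}")

    def post_evaluate(self, info_list: list, ep: int) -> None:
        self.post_collect(info_list, ep)
```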
33 changes: 0 additions & 33 deletions examples/cim/rl/policy_trainer.py

This file was deleted.

84 changes: 84 additions & 0 deletions examples/cim/rl/rl_component_bundle.py
@@ -0,0 +1,84 @@
from functools import partial
from typing import Any, Callable, Dict, Optional

from examples.cim.rl.config import action_num, algorithm, env_conf, num_agents, state_dim
from examples.cim.rl.env_sampler import CIMEnvSampler
from maro.rl.policy import AbsPolicy
from maro.rl.rl_component.rl_component_bundle import RLComponentBundle
from maro.rl.rollout import AbsEnvSampler
from maro.rl.training import AbsTrainer

from .algorithms.ac import get_ac, get_ac_policy
from .algorithms.dqn import get_dqn, get_dqn_policy
from .algorithms.maddpg import get_maddpg, get_maddpg_policy
from .algorithms.ppo import get_ppo, get_ppo_policy


class CIMBundle(RLComponentBundle):
def get_env_config(self) -> dict:
return env_conf

def get_test_env_config(self) -> Optional[dict]:
return None

def get_env_sampler(self) -> AbsEnvSampler:
return CIMEnvSampler(self.env, self.test_env)

def get_agent2policy(self) -> Dict[Any, str]:
return {agent: f"{algorithm}_{agent}.policy" for agent in self.env.agent_idx_list}

def get_policy_creator(self) -> Dict[str, Callable[[], AbsPolicy]]:
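# Policy names follow the "{trainer_name}.policy" pattern: each "{algorithm}_{i}.policy" created here is
# paired with the trainer "{algorithm}_{i}" built in get_trainer_creator below (assumption: MARO associates
# policies with trainers through this shared name prefix).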
if algorithm == "ac":
policy_creator = {
f"{algorithm}_{i}.policy": partial(get_ac_policy, state_dim, action_num, f"{algorithm}_{i}.policy")
for i in range(num_agents)
}
elif algorithm == "ppo":
policy_creator = {
f"{algorithm}_{i}.policy": partial(get_ppo_policy, state_dim, action_num, f"{algorithm}_{i}.policy")
for i in range(num_agents)
}
elif algorithm == "dqn":
policy_creator = {
f"{algorithm}_{i}.policy": partial(get_dqn_policy, state_dim, action_num, f"{algorithm}_{i}.policy")
for i in range(num_agents)
}
elif algorithm == "discrete_maddpg":
policy_creator = {
f"{algorithm}_{i}.policy": partial(get_maddpg_policy, state_dim, action_num, f"{algorithm}_{i}.policy")
for i in range(num_agents)
}
else:
raise ValueError(f"Unsupported algorithm: {algorithm}")

return policy_creator

def get_trainer_creator(self) -> Dict[str, Callable[[], AbsTrainer]]:
if algorithm == "ac":
trainer_creator = {
f"{algorithm}_{i}": partial(get_ac, state_dim, f"{algorithm}_{i}")
for i in range(num_agents)
}
elif algorithm == "ppo":
trainer_creator = {
f"{algorithm}_{i}": partial(get_ppo, state_dim, f"{algorithm}_{i}")
for i in range(num_agents)
}
elif algorithm == "dqn":
trainer_creator = {
f"{algorithm}_{i}": partial(get_dqn, f"{algorithm}_{i}")
for i in range(num_agents)
}
elif algorithm == "discrete_maddpg":
trainer_creator = {
f"{algorithm}_{i}": partial(get_maddpg, state_dim, [1], f"{algorithm}_{i}")
for i in range(num_agents)
}
else:
raise ValueError(f"Unsupported algorithm: {algorithm}")

return trainer_creator
12 changes: 5 additions & 7 deletions examples/rl/README.md
@@ -12,10 +12,8 @@ There are two ways to start the RL job:

## Create Your Own Scenarios

You can create your own scenarios by supplying the necessary ingredients without worrying about putting them together in a workflow. It is necessary to create an ``__init__.py`` under your scenario folder (so that it can be treated as a package) and expose all ingredients in it. The ingredients include:
* Definitions of policies and agent-to-policy mappings. These definitions should be provided as a dictionary named ``policy_creator`` that maps a name to a function that takes the name and returns a policy instance with that name. Optionally, you may specify which policies you intend to train by providing ``trainable_policies``, which is a list of policy names. The experiences generated by these policies will be recorded by the environment sampler and used for training. The agent-to-policy mapping should be provided as a dictionary named ``agent2policy``.
* Definitions of training algorithms. These definitions should be provided as a dictionary named ``trainer_creator`` that maps a name to a function that takes the name and returns a trainer instance with that name.
* Definitions of state, action and reward shaping logic pertinent to your simulator and policies.
These definitions should be encapsulated in ``env_sampler_creator``, which is a function that takes ``policy_creator`` and returns an environment sampler;
It is possible to have customized routines invoked at the end of a roll-out episode or episode segment. These routines usually involve processing and / or rendering information collected during roll-out. To do this, first implement the ``post_step`` method in your environment sampler class to record whatever information you wish to keep track of during roll-out. Then create functions named ``post_collect`` and ``post_evaluate`` to process the information and expose them in the scenario folder's ``__init__.py``. These functions are used as callbacks in the main learning loop and executed at the end of each training or evaluation episode. See ``cim/callbacks.py`` for a simple example of how to create these functions.
* An optional dictionary named ``device_mapping`` that specifies the compute device (CPU or GPU) for each policy. If not provided, all computations will be performed on the CPU.
You can create your own scenarios by supplying the necessary ingredients without worrying about putting them together in a workflow. It is necessary to create an ``__init__.py`` under your scenario folder (so that it can be treated as a package) and expose a `rl_component_bundle_cls` interface. MARO's RL workflow uses this interface to create an `RLComponentBundle` instance and starts the RL job based on it. An `RLComponentBundle` instance defines all the components needed to run an RL job. You can go through the docstring of `RLComponentBundle` for a detailed explanation, or just read one of the examples to learn its basic usage.

## Example

For a complete example, please check `examples/cim/rl`.
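To make the bundle requirement concrete, below is a hedged skeleton wired for a single algorithm (PPO). It is assembled from the `CIMBundle` pieces shown earlier in this PR; `MyEnvSampler` and the local `config` / `algorithms` modules are placeholders for your own scenario code.

```python
# my_scenario/rl_component_bundle.py -- sketch only, assembled from the CIM example in this PR.
from functools import partial
from typing import Any, Callable, Dict, Optional

from maro.rl.policy import AbsPolicy
from maro.rl.rl_component.rl_component_bundle import RLComponentBundle
from maro.rl.rollout import AbsEnvSampler
from maro.rl.training import AbsTrainer

from .algorithms.ppo import get_ppo, get_ppo_policy  # factories as in examples/cim/rl
from .config import action_num, env_conf, num_agents, state_dim  # scenario-specific settings
from .env_sampler import MyEnvSampler  # your AbsEnvSampler subclass


class MyBundle(RLComponentBundle):
    def get_env_config(self) -> dict:
        return env_conf

    def get_test_env_config(self) -> Optional[dict]:
        return None  # reuse the training env configuration for evaluation

    def get_env_sampler(self) -> AbsEnvSampler:
        return MyEnvSampler(self.env, self.test_env)

    def get_agent2policy(self) -> Dict[Any, str]:
        # One policy per agent, named "ppo_<agent>.policy".
        return {agent: f"ppo_{agent}.policy" for agent in self.env.agent_idx_list}

    def get_policy_creator(self) -> Dict[str, Callable[[], AbsPolicy]]:
        return {
            f"ppo_{i}.policy": partial(get_ppo_policy, state_dim, action_num, f"ppo_{i}.policy")
            for i in range(num_agents)
        }

    def get_trainer_creator(self) -> Dict[str, Callable[[], AbsTrainer]]:
        return {f"ppo_{i}": partial(get_ppo, state_dim, f"ppo_{i}") for i in range(num_agents)}
```

Each creator dictionary maps a name to a zero-argument factory (`functools.partial` here), matching the `Callable[[], AbsPolicy]` and `Callable[[], AbsTrainer]` signatures shown in the diff.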
9 changes: 4 additions & 5 deletions examples/vm_scheduling/rl/README.md
@@ -2,13 +2,12 @@

A virtual machine (VM) scheduler is a cloud computing service component responsible for providing compute resources to satisfy user demands. A good resource allocation policy should aim to optimize several metrics at the same time, such as user wait time, profit, energy consumption and physical machine (PM) overload. Many commercial cloud providers use rule-based policies. Alternatively, the policy can also be optimized using reinforcement learning (RL) techniques, which involves simulating with historical data. This example demonstrates how DQN and Actor-Critic algorithms can be applied to this scenario. In this folder, you can find:

* ``__init__.py``, the entrance of this example. You must expose a `rl_component_bundle_cls` interface in `__init__.py` (see the example file for details);
* ``config.py``, which contains general configurations for the scenario;
* ``algorithms``, which contains configurations for the Actor-Critic, DQN algorithms, including network configurations;
* ``env_sampler.py``, which defines state, action and reward shaping in the ``VMEnvSampler`` class;
* ``policy_trainer.py``, which contains a registry for the policies and algorithms defined in ``algorithms``;
* ``callbacks.py``, which defines routines to be invoked at the end of training or evaluation episodes.
* ``algorithms/``, which contains configurations for the algorithms, including network configurations;
* ``rl_component_bundle.py``, which defines all the components needed to run an RL job. You can go through the docstring of `RLComponentBundle` for a detailed explanation, or just read `VMBundle` to learn its basic usage.

See ``README.md`` under ``examples/rl`` for details about running the single-threaded learning workflow. We recommend that you follow this example to write your own scenarios.
We recommend that you follow this example to write your own scenarios.


# Some Comments About the Results
11 changes: 2 additions & 9 deletions examples/vm_scheduling/rl/__init__.py
@@ -1,15 +1,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from .callbacks import post_collect, post_evaluate
from .env_sampler import agent2policy, env_sampler_creator
from .policy_trainer import policy_creator, trainer_creator
from .rl_component_bundle import VMBundle as rl_component_bundle_cls

__all__ = [
"agent2policy",
"env_sampler_creator",
"policy_creator",
"post_collect",
"post_evaluate",
"trainer_creator",
"rl_component_bundle_cls",
]
2 changes: 1 addition & 1 deletion examples/vm_scheduling/rl/algorithms/ac.py
@@ -57,7 +57,7 @@ def _get_v_values(self, states: torch.Tensor) -> torch.Tensor:
return self._critic(features).squeeze(-1)


def get_policy(state_dim: int, action_num: int, num_features: int, name: str) -> DiscretePolicyGradient:
def get_ac_policy(state_dim: int, action_num: int, num_features: int, name: str) -> DiscretePolicyGradient:
return DiscretePolicyGradient(name=name, policy_net=MyActorNet(state_dim, action_num, num_features))


7 changes: 2 additions & 5 deletions examples/vm_scheduling/rl/algorithms/dqn.py
@@ -1,8 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from typing import Dict

import numpy as np
import torch
from torch.optim import SGD
@@ -11,8 +9,7 @@
from maro.rl.exploration import MultiLinearExplorationScheduler
from maro.rl.model import DiscreteQNet, FullyConnected
from maro.rl.policy import ValueBasedPolicy
from maro.rl.training.algorithms import DQNTrainer, DQNParams

from maro.rl.training.algorithms import DQNParams, DQNTrainer

q_net_conf = {
"hidden_dims": [64, 128, 256],
@@ -54,7 +51,7 @@ def __call__(self, states, actions, num_actions, *, epsilon):
])


def get_policy(state_dim: int, action_num: int, num_features: int, name: str) -> ValueBasedPolicy:
def get_dqn_policy(state_dim: int, action_num: int, num_features: int, name: str) -> ValueBasedPolicy:
return ValueBasedPolicy(
name=name,
q_net=MyQNet(state_dim, action_num, num_features),