diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/Unity ML Agents - Python API - Examples.iml b/.idea/Unity ML Agents - Python API - Examples.iml new file mode 100644 index 0000000..83f5ef8 --- /dev/null +++ b/.idea/Unity ML Agents - Python API - Examples.iml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..0aa4e7f --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,31 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..670dc26 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..a1468a5 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/other.xml b/.idea/other.xml new file mode 100644 index 0000000..68993fb --- /dev/null +++ b/.idea/other.xml @@ -0,0 +1,6 @@ + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..4cf123e --- /dev/null +++ 
b/README.md @@ -0,0 +1,107 @@ +[//]: # (Image References) + +[image1]: https://user-images.githubusercontent.com/10624937/42386929-76f671f0-8106-11e8-9376-f17da2ae852e.png "Kernel" +# Reinforcement Learning Project + +This project was created to make it easier to get started with Reinforcement Learning. It now contains: +- An implementation of the [DDPG Algorithm](https://arxiv.org/abs/1509.02971) in Python, which works for both single-agent environments and multi-agent environments. +- Single and parallel environments in [Unity ML Agents](https://unity.com/products/machine-learning-agents) using the [Python API](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Python-API.md). +- Two Jupyter notebooks: + - [3DBall.ipynb](notebooks/3DBall.ipynb): This is a simple example to get started with Unity ML Agents & the DDPG Algorithm. + - [3DBall_parallel_environment.ipynb](notebooks/3DBall_parallel_environment.ipynb): The same, but now for an environment run in parallel. + +# Getting Started + +## Install Basic Dependencies + +To set up your Python environment to run the code in the notebooks, follow the instructions below. + +- If you're on Windows, I recommend installing [Miniforge](https://github.com/conda-forge/miniforge). It's a minimal installer for Conda. I also recommend using the [Mamba](https://github.com/mamba-org/mamba) package manager instead of [Conda](https://docs.conda.io/). It works almost the same as Conda, only faster. There's a [cheatsheet](https://docs.conda.io/projects/conda/en/latest/user-guide/cheatsheet.html) of Conda commands which also work in Mamba. To install Mamba, use this command: +```bash +conda install mamba -n base -c conda-forge +``` +- Create (and activate) a new environment with Python 3.6 or later. 
I recommend using Python 3.9: + + - __Linux__ or __Mac__: + ```bash + mamba create --name rl39 python=3.9 numpy + source activate rl39 + ``` + - __Windows__: + ```bash + mamba create --name rl39 python=3.9 numpy + activate rl39 + ``` +- Install PyTorch by following instructions on [Pytorch.org](https://pytorch.org/). For example, to install PyTorch on + Windows with GPU support, use this command: + +```bash +mamba install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch +``` + +- Install additional packages: +```bash +mamba install jupyter notebook matplotlib +``` + +- Create an [IPython kernel](http://ipython.readthedocs.io/en/stable/install/kernel_install.html) for the `rl39` environment in Jupyter. + +```bash +python -m ipykernel install --user --name rl39 --display-name "rl39" +``` + +- Change the kernel to match the `rl39` environment by using the drop-down menu `Kernel` -> `Change kernel` inside Jupyter Notebook. + +## Install Unity Machine Learning Agents + +**Note**: +In order to run the notebooks on **Windows**, it's not necessary to install the Unity Editor, because I have provided the [standalone executables](notebooks/README.md) of the environments for you. + +[Unity ML Agents](https://unity.com/products/machine-learning-agents) is the software that we use for the environments. The agents that we create in Python can interact with these environments. Unity ML Agents consists of several parts: +- [The Unity Editor](https://unity.com/) is used for creating environments. To install: + - Install [Unity Hub](https://unity.com/download). + - Install the latest version of Unity by clicking on the green button `Unity Hub` on the [download page](https://unity3d.com/get-unity/download/archive). + + To start the Unity editor you must first have a project: + + - Start the Unity Hub. + - Click on "Projects" + - Create a new dummy project. + - Click on the project you've just added in the Unity Hub. The Unity Editor should start now. 
+ +- [The Unity ML-Agents Toolkit](https://github.com/Unity-Technologies/ml-agents#unity-ml-agents-toolkit). Download [the latest release](https://github.com/Unity-Technologies/ml-agents/releases) of the source code or use the [Git](https://git-scm.com/downloads/guis) command: `git clone --branch release_18 https://github.com/Unity-Technologies/ml-agents.git`. +- The Unity ML Agents package is used inside the Unity Editor. Please read [the instructions for installation](https://github.com/Unity-Technologies/ml-agents/blob/release_18_docs/docs/Installation.md#install-the-comunityml-agents-unity-package). +- The `mlagents` Python package is used as a bridge between Python and the Unity editor (or standalone executable). To install, use this command: `python -m pip install mlagents==0.27.0`. +Please note that there's no conda package available for this. + +## Install an IDE for Python + +For Windows, I would recommend using [PyCharm](https://www.jetbrains.com/pycharm/) (my choice), or [Visual Studio Code](https://code.visualstudio.com/). +Inside those IDEs you can use the Conda environment you have just created. + +## Creating a custom Unity executable + +### Load the examples project +[The Unity ML-Agents Toolkit](https://github.com/Unity-Technologies/ml-agents#unity-ml-agents-toolkit) contains several [example environments](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Learning-Environment-Examples.md). Here we will load them all inside the Unity editor: +- Start the Unity Hub. +- Click on "Projects" +- Add a project by navigating to the `Project` folder inside the toolkit. +- Click on the project you've just added in the Unity Hub. The Unity Editor should start now. + +### Create a 3D Ball executable +The 3D Ball example contains 12 environments in one, but this doesn't work very well in the Python API. The main problem is that there's no way to reset each environment individually. 
Therefore, we will remove the other 11 environments in the editor: +- Load the 3D Ball scene, by going to the project window and navigating to `Examples` -> `3DBall` -> `Scenes`-> `3DBall` +- In the Hierarchy window select the other 11 3DBall objects and delete them, so that only the `3DBall` object remains. + +Next, we will build the executable: +- Go to `File` -> `Build Settings` +- In the Build Settings window, click `Build` +- Navigate to `notebooks` folder and add `3DBall` to the folder name that is used for the build. + + +## Instructions for running the notebooks + +1. [Download](notebooks/README.md) the Unity executables for Windows. In case you're not on Windows, you have to build the executables yourself by following the instructions above. +2. Place the Unity executable folders in the same folder as the notebooks. +3. Load a notebook with Jupyter notebook. (The command to start Jupyter notebook is `jupyter notebook`) +4. Follow further instructions in the notebook. diff --git a/Report.md b/Report.md new file mode 100644 index 0000000..1baec2e --- /dev/null +++ b/Report.md @@ -0,0 +1,37 @@ +[//]: # (Image References) + +[image1]: ./plot.png + +# Project 3: Collaboration and Competition +## Learning Algorithm +The learning algorithm used for this project is [Deep Deterministic Policy Gradient (DDPG)](https://arxiv.org/abs/1509.02971). DDPG is known as an Actor-Critic method, and it can be used for continuous action spaces. Just like DQN (from project 1) it uses [Experience Replay](https://paperswithcode.com/method/experience-replay) and a [Target Network](https://towardsdatascience.com/deep-q-network-dqn-ii-b6bf911b6b2c). The Actor learns a deterministic policy function, and the Critic learns a Q value function. They both interact with each other when learning. The Critic uses the deterministic action from the Actor when calculating the Q value. 
Because the Actor learns a deterministic policy, some noise must be added to the action values, to help with exploration. This algorithm uses a noise decay, so that the noise at the start of the learning process is high and much lower at the end of it. + +Two types of neural networks are used in this project, one for the Actor and one for the Critic. They both have two hidden layers with 256 and 128 linear units. The Actor network has 24 inputs, and 2 outputs. That's because each state has 24 parameters and there are 2 action parameters. The Critic has 26 (24 + 2) inputs and only one output, the Q value. + +In this project there are two agents, so there is an Actor and a Critic neural network for each agent. Both agents learn independently of each other. The Critic only uses the state that the agent sees and not the global state like in the [MADDPG](https://proceedings.neurips.cc/paper/2017/file/68a9750337a418a86fe06c1991a1d64c-Paper.pdf) algorithm. + +The hyperparameters used for this algorithm are: + +- `buffer_size=100000` replay buffer size +- `batch_size=1000` minibatch size +- `gamma=0.99` discount factor +- `tau=1e-3` for soft update of the target network parameters +- `lr_actor=1e-4` learning rate of the actor +- `lr_critic=1e-3` learning rate of the critic +- `weight_decay=0.0` L2 weight decay +- `update_every=20` how often to update the networks +- `noise_decay=3e-6` the noise decay used for the action values + +## Plot of Rewards +![plot][image1] + +The environment was solved in 23746 episodes. 
class DDPGAgent:
    """One DDPG learner: online actor/critic networks, their target copies and optimizers."""

    def __init__(self, actor: "Actor", critic: "Critic", gamma=0.99, tau=1e-3,
                 lr_actor=1e-4, lr_critic=1e-3, weight_decay=1e-2):
        """Initialize a DDPG Agent object.

        :param actor: policy network; must expose an ``action_size`` attribute
        :param critic: Q-value network, called as ``critic(states, actions)``
        :param gamma: discount factor
        :param tau: for soft update of target parameters
        :param lr_actor: learning rate of the actor
        :param lr_critic: learning rate of the critic
        :param weight_decay: L2 weight decay (passed to the critic optimizer only)
        """
        self.action_size = actor.action_size
        self.gamma = gamma
        self.tau = tau

        # Actor Network (w/ Target Network)
        self.actor = actor.to(pytorch_device)
        self.actor_target = copy.deepcopy(actor).to(pytorch_device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic = critic.to(pytorch_device)
        self.critic_target = copy.deepcopy(critic).to(pytorch_device)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr_critic,
                                           weight_decay=weight_decay)

    def act(self, state) -> torch.Tensor:
        """Return the actor's action for ``state`` without tracking gradients.

        The actor is switched to eval mode for the forward pass and is always
        put back into train mode afterwards, even if the forward pass raises.
        """
        self.actor.eval()
        try:
            with torch.no_grad():
                action = self.actor(state)
        finally:
            self.actor.train()
        return action

    def step(self, samples: Tuple[torch.Tensor, ...]):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        :param samples: tuple of (s, a, r, s', done)
        """
        states, actions, rewards, next_states, dones = samples

        # ---------------------------- update critic ---------------------------- #
        with torch.no_grad():
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target(next_states)
            q_targets_next = self.critic_target(next_states, actions_next)
            # Compute Q targets for current states; (1 - dones) removes the
            # bootstrap term for transitions that ended the episode
            q_targets = rewards + (self.gamma * q_targets_next * (1 - dones))
        # Compute critic loss
        q_expected = self.critic(states, actions)
        critic_loss = f.mse_loss(q_expected, q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # The actor is trained to maximise the critic's estimate of its own actions
        actions_pred = self.actor(states)
        actor_loss = -self.critic(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

    def update_target_networks(self):
        """Move both target networks a fraction ``tau`` towards the online networks."""
        soft_update(self.critic, self.critic_target, self.tau)
        soft_update(self.actor, self.actor_target, self.tau)

    def get_state_dicts(self):
        """Return everything needed to resume training.

        Besides the online networks and optimizers, the target networks are
        included so that a restored agent continues with the same bootstrap
        targets it had when the checkpoint was written.
        """
        return {'actor_params': self.actor.state_dict(),
                'actor_optim_params': self.actor_optimizer.state_dict(),
                'critic_params': self.critic.state_dict(),
                'critic_optim_params': self.critic_optimizer.state_dict(),
                'actor_target_params': self.actor_target.state_dict(),
                'critic_target_params': self.critic_target.state_dict()}

    def load_state_dicts(self, state_dicts):
        """Restore networks and optimizers from ``get_state_dicts()`` output.

        Checkpoints written before target weights were saved have no
        ``*_target_params`` entries; for those the targets are hard-copied from
        the loaded online networks instead of being left at their random
        construction-time values.
        """
        self.actor.load_state_dict(state_dicts['actor_params'])
        self.actor_optimizer.load_state_dict(state_dicts['actor_optim_params'])
        self.critic.load_state_dict(state_dicts['critic_params'])
        self.critic_optimizer.load_state_dict(state_dicts['critic_optim_params'])
        self.actor_target.load_state_dict(
            state_dicts.get('actor_target_params', state_dicts['actor_params']))
        self.critic_target.load_state_dict(
            state_dicts.get('critic_target_params', state_dicts['critic_params']))


def soft_update(local_model, target_model, tau):
    """Soft update model parameters.
    θ_target = τ*θ_local + (1 - τ)*θ_target

    Only ``parameters()`` are blended; module buffers are not touched.

    Params
    ======
        local_model: PyTorch model (weights will be copied from)
        target_model: PyTorch model (weights will be copied to)
        tau (float): interpolation parameter
    """
    with torch.no_grad():
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.copy_(tau * local_param + (1.0 - tau) * target_param)
def hidden_init(layer):
    """Return the symmetric uniform init range ``(-1/sqrt(fan_in), 1/sqrt(fan_in))``.

    See https://arxiv.org/abs/1509.02971 Section 7 for details
    (CONTINUOUS CONTROL WITH DEEP REINFORCEMENT LEARNING).

    ``nn.Linear`` stores its weight as ``[out_features, in_features]``, so the
    fan-in (number of inputs feeding each unit) is dimension 1, not dimension 0.

    :param layer: a layer whose ``weight`` is laid out as ``[out, in]``
    :return: tuple ``(low, high)`` suitable for ``tensor.uniform_(low, high)``
    """
    fan_in = layer.weight.data.size()[1]
    lim = 1. / np.sqrt(fan_in)
    return -lim, lim


class Actor(nn.Module):
    """Actor (Policy) Model."""

    def __init__(self, state_size: int, action_size: int, hidden_layer_sizes: List[int], activation_func=f.relu):
        """Initialize parameters and build model.

        :param state_size: Dimension of each state
        :param action_size: Dimension of each action
        :param hidden_layer_sizes: Number of nodes in hidden layers (at least one entry)
        :param activation_func: Activation function
        :raises ValueError: if ``hidden_layer_sizes`` is empty
        """
        super(Actor, self).__init__()
        if not hidden_layer_sizes:
            raise ValueError("hidden_layer_sizes must contain at least one layer size")
        self.action_size = action_size
        self.input_norm = nn.BatchNorm1d(state_size)
        self.activation_func = activation_func
        self.input_layer = nn.Linear(state_size, hidden_layer_sizes[0])
        self.hidden_layers = nn.ModuleList()
        # hidden_input_norms[i] normalises the input of hidden_layers[i]; the
        # extra last entry normalises the input of the output layer.
        self.hidden_input_norms = nn.ModuleList()
        for i in range(len(hidden_layer_sizes) - 1):
            hidden_layer = nn.Linear(hidden_layer_sizes[i], hidden_layer_sizes[i + 1])
            self.hidden_layers.append(hidden_layer)
            self.hidden_input_norms.append(nn.BatchNorm1d(hidden_layer_sizes[i]))
        self.hidden_input_norms.append(nn.BatchNorm1d(hidden_layer_sizes[-1]))
        self.output_layer = nn.Linear(hidden_layer_sizes[-1], action_size)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw all linear weights; biases keep their current values."""
        self.input_layer.weight.data.uniform_(*hidden_init(self.input_layer))
        for hidden_layer in self.hidden_layers:
            hidden_layer.weight.data.uniform_(*hidden_init(hidden_layer))
        # small final-layer weights keep the initial tanh outputs close to zero
        self.output_layer.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state: torch.Tensor):
        """Build an actor (policy) network that maps states -> actions.
        Note: Do not call this function directly. Instead, use: actor(state)

        :param state: batch of states; the batch-norm layers expect the feature
            dimension to be ``state_size``
        :return: batch of actions, each component in the range -1 to 1
        """
        x = self.input_norm(state)
        x = self.activation_func(self.input_layer(x))
        for i, hidden_layer in enumerate(self.hidden_layers):
            x = self.hidden_input_norms[i](x)
            x = self.activation_func(hidden_layer(x))
        x = self.hidden_input_norms[-1](x)
        # this outputs action values in the range -1 to 1 :
        return torch.tanh(self.output_layer(x))

    def __call__(self, state: torch.Tensor) -> torch.Tensor:
        # Overridden only to give callers a typed signature.
        return super().__call__(state)
class MyUnityEnvironment:
    """Thin wrapper around ``UnityEnvironment`` that addresses behaviors by index."""

    def __init__(self, file_name=None, no_graphics=False, seed=1, worker_id=0):
        """
        :param file_name: The filename of the Unity executable, or None when using the Unity editor
            (press Play to connect).
        :param no_graphics: Whether to use a graphics window or not.
        :param seed: The seed used for a pseudo random number generator.
        :param worker_id: The id of the Unity thread to create. You cannot create threads with the same id.
        """
        self.engine_configuration_channel = EngineConfigurationChannel()
        side_channels = [self.engine_configuration_channel]
        self.env = UnityEnvironment(file_name=file_name, no_graphics=no_graphics, seed=seed, worker_id=worker_id,
                                    side_channels=side_channels)
        self.env.reset()
        # Behaviors are sorted by name so that a behavior index is stable.
        self.behavior_names = sorted(self.env.behavior_specs.keys())
        self.behavior_specs = [self.env.behavior_specs[behavior_name] for behavior_name in self.behavior_names]
        self.num_agents_list = []  # number of agents for each behavior
        for behavior_name in self.behavior_names:
            decision_steps, _ = self.env.get_steps(behavior_name)
            self.num_agents_list.append(len(decision_steps))

    def set_timescale(self, time_scale: float):
        """ Set the timescale at which the physics simulation runs.

        :param time_scale: a value of 1.0 means the simulation runs in realtime.
        """
        self.engine_configuration_channel.set_configuration_parameters(time_scale=time_scale)

    def set_display_size(self, width: int, height: int):
        """ Set the size of the Unity window in pixels. """
        self.engine_configuration_channel.set_configuration_parameters(width=width, height=height)

    def reset(self):
        """ Reset the wrapped environment. """
        self.env.reset()

    def get_observations(self, behavior_index: int) -> np.ndarray:
        """ Get observations for behavior.
        Agents can have different behaviors. For example: Two strikers, and a goalie in the soccer example.

        Rows are indexed by agent id. An agent that requests no decision this
        step keeps an all-zero row (previously the row was uninitialized memory).

        :param behavior_index: index into the sorted list of behavior names
        :return: np.ndarray[num_agents, observation_size]
        """
        num_agents = self.num_agents_list[behavior_index]
        behavior_spec = self.behavior_specs[behavior_index]
        behavior_name = self.behavior_names[behavior_index]
        observations = np.zeros((num_agents, *behavior_spec.observation_specs[0].shape))
        decision_steps, _ = self.env.get_steps(behavior_name)
        for agent_id in decision_steps:
            observations[agent_id] = decision_steps[agent_id].obs[0]
        return observations

    def set_actions(self, behavior_index: int, continuous: Optional[np.ndarray] = None,
                    discrete: Optional[np.ndarray] = None):
        """ Set actions for behavior.

        :param behavior_index: index into the sorted list of behavior names
        :param continuous: ndarray[num_agents, *]
        :param discrete: discrete actions, passed through to ``ActionTuple`` unchanged
        """
        behavior_name = self.behavior_names[behavior_index]
        action_tuple = ActionTuple(continuous=continuous, discrete=discrete)
        self.env.set_actions(behavior_name, action_tuple)

    def step(self):
        """ Step forward in environment. """
        self.env.step()

    def get_experiences(self, behavior_index: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """ Get experiences for all agents with behavior %behavior_index.

        Rows are indexed by agent id. Agents reported in neither the decision
        nor the terminal steps keep zero observation, zero reward and done=0
        (previously those rows were uninitialized memory).

        :param behavior_index: index into the sorted list of behavior names
        :return: Tuple of (observations, rewards, dones). Each element is ndarray[num_agents, *]
        """
        num_agents = self.num_agents_list[behavior_index]
        behavior_spec = self.behavior_specs[behavior_index]
        behavior_name = self.behavior_names[behavior_index]
        # TODO: implement stacked observations:
        observations = np.zeros((num_agents, *behavior_spec.observation_specs[0].shape))
        rewards = np.zeros((num_agents, 1))
        dones = np.zeros((num_agents, 1))
        decision_steps, terminal_steps = self.env.get_steps(behavior_name)
        for agent_id in decision_steps:
            observations[agent_id] = decision_steps[agent_id].obs[0]
            rewards[agent_id] = decision_steps[agent_id].reward
            dones[agent_id] = False
        # Terminal steps are written last, so they win when an agent appears in both.
        for agent_id in terminal_steps:
            observations[agent_id] = terminal_steps[agent_id].obs[0]
            rewards[agent_id] = terminal_steps[agent_id].reward
            # an interrupted episode (e.g. step limit) is not a true terminal state
            dones[agent_id] = not terminal_steps[agent_id].interrupted
        return observations, rewards, dones

    def close(self):
        """ Close the wrapped environment. """
        self.env.close()
[Unity ML Agents](https://unity.com/products/machine-learning-agents). Please check the README file to setup this project.\n", + "\n", + "### 1. Start the Environment\n", + "\n", + "We begin by importing the necessary packages:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "from my_unity_environment import MyUnityEnvironment\n", + "from model import Actor, Critic\n", + "from ddpg_agents import DDPGAgents\n", + "from ddpg_agent import DDPGAgent\n", + "from replay_buffer import ReplayBuffer\n", + "from utilities import convert_to_tensor\n", + "import numpy as np\n", + "import torch\n", + "import torch.nn.functional as f\n", + "import random\n", + "from collections import deque\n", + "import time\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we will start the environment. Before running the code cell below, change the `ENV_FILE_NAME` parameter to match the location of the Unity executable that you [downloaded](README.md) or [created](../README.md#creating-a-custom-unity-executable) yourself. For example:\n", + "\n", + "```\n", + "ENV_FILE_NAME = \"3DBall_Windows_x86_64/UnityEnvironment.exe\"\n", + "```\n", + "A new window should pop up. 
Don't worry if the window becomes unresponsive.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "ENV_FILE_NAME = \"3DBall_Windows_x86_64/UnityEnvironment.exe\"\n", + "CHECKPOINT_FILENAME = \"checkpoint-3dball.pth\" # this is used for saving and loading the model\n", + "DISPLAY_SIZE = [1024, 768] # The width and height of the Unity window\n", + "\n", + "test_env = MyUnityEnvironment(file_name=ENV_FILE_NAME, no_graphics=False)\n", + "test_env.set_timescale(1.0)\n", + "test_env.set_display_size(width=DISPLAY_SIZE[0], height=DISPLAY_SIZE[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Examine the State and Action Spaces\n", + "\n", + "In this environment, an agent must balance a ball on its head for as long as possible.\n", + "\n", + "**Agent Reward Function:**\n", + "- +0.1 for every step the ball remains on its head.\n", + "- -1.0 if the ball falls off.\n", + "\n", + "**Behavior Parameters:**\n", + "- Vector Observation space: 8 variables corresponding to rotation of the agent cube, and position and velocity of ball.\n", + "- Actions: 2 continuous actions, with one value corresponding to X-rotation, and the other to Z-rotation.\n", + "\n", + "Run the code cell below to print some information about the environment:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of agents: 1\n", + "Size of each action: 2\n", + "States look like: [-0.04766776 -0.08700117 -0.54295158 4. 0.11863136 0.\n", + " 0. 0. 
]\n", + "States have shape: (8,)\n" + ] + } + ], + "source": [ + "def examine_environment(myenv: MyUnityEnvironment):\n", + " # number of agents in the first behavior:\n", + " print('Number of agents:', myenv.num_agents_list[0])\n", + "\n", + " # number of actions\n", + " print('Size of each action:', myenv.behavior_specs[0].action_spec.continuous_size)\n", + "\n", + " # examine the state space\n", + " print('States look like:', myenv.get_observations(0)[0])\n", + " print('States have shape:', myenv.behavior_specs[0].observation_specs[0].shape)\n", + "\n", + "examine_environment(test_env)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Take Random Actions in the Parallel Environment\n", + "\n", + "Run the code cell below, to watch a random agent in action." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Score from episode 0: 2.0000000447034836\n", + "Score from episode 1: 1.500000037252903\n", + "Score from episode 2: 2.400000050663948\n", + "Score from episode 3: 2.1000000461935997\n", + "Score from episode 4: 1.2000000327825546\n", + "Score from episode 5: 2.400000050663948\n", + "Score from episode 6: 1.0000000298023224\n", + "Score from episode 7: 1.8000000417232513\n", + "Score from episode 8: 1.1000000312924385\n", + "Score from episode 9: 1.1000000312924385\n", + "Time elapsed: 27.05\n" + ] + } + ], + "source": [ + "def test_random_agents(myenv: MyUnityEnvironment, n_episodes: int, max_t: int):\n", + " start_time = time.time()\n", + " for i in range(n_episodes):\n", + " myenv.reset()\n", + " scores = np.zeros(myenv.num_agents_list[0])\n", + " for t in range(max_t):\n", + " actions = np.random.randn(myenv.num_agents_list[0],\n", + " myenv.behavior_specs[0].action_spec.continuous_size)\n", + " actions = np.clip(actions, -1, 1)\n", + " myenv.set_actions(behavior_index=0, continuous=actions)\n", + " myenv.step()\n", + 
class DDPGAgentsTester:
    """Trains and evaluates a group of DDPG agents on a single Unity environment.

    The object keeps its training state between calls: the replay buffer, the
    current exploration noise scale, the episode counter and the score history.
    """

    def __init__(self, ddpg_agents: DDPGAgents,
                 myenv: MyUnityEnvironment,
                 buffer_size=int(1.0e6),  # replay buffer size
                 noise_start=1.0
                 ):
        """
        :param ddpg_agents: the agents to train; ``len(ddpg_agents)`` is used as the number of agents.
        :param myenv: the environment to interact with (only behavior index 0 is used).
        :param buffer_size: capacity handed to ``ReplayBuffer``.
        :param noise_start: initial noise scale passed to ``ddpg_agents.act``.
        """
        self.ddpg_agents = ddpg_agents
        self.myenv = myenv
        self.buffer_size = buffer_size
        self.scores = []  # one score array per finished episode
        self.scores_deque = deque(maxlen=100)  # sliding window used for the running average
        self.episode = 0
        self.noise = noise_start
        self.replay_buffer = ReplayBuffer(buffer_size)

    def train_agents(self, n_episodes, max_t, goal=float("inf"), print_every=1000, update_every=1,
                     num_updates=1, batch_size=64, noise_decay=6.93e-6):
        """ Multi Agent Deep Deterministic Policy Gradient algorithm.

        Runs up to %n_episodes episodes, starting at ``self.episode``. Every
        transition is stored in the replay buffer, and the networks are updated
        once the buffer holds more than ``batch_size * 100`` samples.

        Params
        ======
            n_episodes (int): maximum number of training episodes
            max_t (int): maximum number of timesteps per episode
            goal (float): the algorithm will stop when the goal is reached
            print_every (int) : print intermediate results every %print_every episodes
            update_every (int): update the neural networks every %update_every time steps
            num_updates: How many updates to do in a row
            batch_size (int): minibatch size
            noise_decay (float): noise decay factor = 1.0 - %noise_decay
        """
        # Turn the decay rate into the factor that multiplies the noise scale after every step:
        noise_decay = 1.0 - noise_decay
        start_episode = self.episode
        stop_episode = self.episode + n_episodes
        steps = 0
        start_time = time.time()
        last_print_time = 0  # 0 guarantees that the first progress line is printed immediately
        # The loop variable is the instance attribute, so self.episode always holds the current episode index:
        for self.episode in range(start_episode, stop_episode):
            score = np.zeros(len(self.ddpg_agents))
            self.myenv.reset()
            states = self.myenv.get_observations(behavior_index=0)
            for t in range(max_t):
                steps += 1
                # get actions from all agents (a batch axis of size 1 is inserted at position 1):
                actions = self.ddpg_agents.act(convert_to_tensor(states[:, np.newaxis, :]), self.noise)
                # remove the batch axis from actions again:
                actions = actions[:, 0, :]
                self.myenv.set_actions(behavior_index=0, continuous=actions)
                self.myenv.step()
                next_states, rewards, dones = self.myenv.get_experiences(behavior_index=0)

                # add sample to replay buffer:
                sample = (states, actions, rewards, next_states, dones)
                self.replay_buffer.add(sample)

                states = next_states
                self.noise *= noise_decay
                score += rewards.squeeze()

                # update networks every %update_every time steps, but only after the
                # replay buffer contains more than batch_size * 100 samples:
                if steps % update_every == 0 and len(self.replay_buffer) > batch_size * 100:
                    for _ in range(num_updates):
                        # every agent gets its own independently sampled minibatch:
                        samples = [self.replay_buffer.sample(batch_size) for _ in range(len(self.ddpg_agents))]
                        self.ddpg_agents.step(samples)
                    # soft update the target networks towards the actual networks:
                    self.ddpg_agents.update_target_networks()

                if np.any(dones):  # exit loop if episode finished
                    break

            self.scores_deque.append(score)
            self.scores.append(score)

            average_scores = np.mean(self.scores_deque, 0)  # average score over last 100 episodes for each agent
            # Refresh the single-line progress display at most once per second:
            if time.time() - last_print_time > 1.0:
                time_per_step = (time.time() - start_time) / steps
                print('\rEpisode {}\tSteps: {}\tTime per step: {:.6f}\tAverage Scores: {:.3f}'
                      .format(self.episode, steps, time_per_step, *average_scores), end="")
                last_print_time = time.time()
            if self.episode % print_every == 0:
                # blank out the progress line before printing the permanent summary line:
                print("\r" + " " * 80, end="")
                print('\rEpisode {}\tAverage Scores: {:.3f}'.format(self.episode, *average_scores))
            # NOTE(review): the goal is only evaluated once at least %print_every episodes have been
            # recorded, although the average covers just the last 100 episodes - confirm this is intended.
            if len(self.scores) >= print_every and np.max(average_scores) >= goal:
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}\tTime elapsed: {}'.format(
                    self.episode, np.max(average_scores), time.time() - start_time))
                break

    def test_agent(self, n_episodes, max_t):
        """Run %n_episodes evaluation episodes without exploration noise and print each score.

        :param n_episodes: number of episodes to play
        :param max_t: maximum number of timesteps per episode
        """
        for _ in range(n_episodes):
            self.myenv.reset()
            states = self.myenv.get_observations(behavior_index=0)
            score = np.zeros(len(self.ddpg_agents))
            for _ in range(max_t):
                # get actions from all agents (noise switched off for evaluation):
                actions = self.ddpg_agents.act(convert_to_tensor(states[:, np.newaxis, :]), noise_scale=0.0)
                # remove the batch axis from actions again:
                actions = actions[:, 0, :]

                self.myenv.set_actions(behavior_index=0, continuous=actions)
                self.myenv.step()
                next_states, rewards, dones = self.myenv.get_experiences(behavior_index=0)

                score += rewards.squeeze()
                states = next_states
                if np.any(dones):  # exit loop if episode finished
                    break
            print("Score: {}".format(score))
"torch.manual_seed(random_seed)\n", + "random.seed(random_seed)\n", + "train_env = MyUnityEnvironment(file_name=ENV_FILE_NAME, seed=random_seed, no_graphics=True, worker_id=0)\n", + "train_env.set_timescale(time_scale=100.0)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 8, + "outputs": [], + "source": [ + "actor1 = Actor(state_size=8, action_size=2, hidden_layer_sizes=[400, 300], activation_func=f.leaky_relu)\n", + "critic1 = Critic(state_size=8, action_size=2, hidden_layer_sizes=[400, 300], activation_func=f.leaky_relu,\n", + " inject_layer=0)\n", + "ddpg_agent1 = DDPGAgent(actor1, critic1, gamma=0.99, tau=1.0e-3, lr_actor=1.0e-4, lr_critic=1.0e-3, weight_decay=1.0e-2)\n", + "ddpg_agent_list = [ddpg_agent1]\n", + "ddpg_agents = DDPGAgents(ddpg_agent_list)\n", + "ddpg_agents_tester = DDPGAgentsTester(ddpg_agents, train_env, buffer_size=int(1.0e6), noise_start=1.0)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "You can skip this cell, if you don’t want to train the agent from scratch. 
It may take 30 to 45 minutes:" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Episode 0\tAverage Scores: 1.500 \n", + "Episode 1000\tAverage Scores: 0.854 \n", + "Episode 2000\tAverage Scores: 0.710 \n", + "Episode 3000\tAverage Scores: 0.829 \n", + "Episode 4000\tAverage Scores: 1.844 \n", + "Episode 5000\tAverage Scores: 2.710 \n", + "Episode 6000\tAverage Scores: 3.985 \n", + "Episode 6301\tSteps: 182838\tTime per step: 0.011439\tAverage Scores: 10.000\n", + "Environment solved in 6301 episodes!\tAverage Score: 10.00\tTime elapsed: 2091.4249007701874\n" + ] + } + ], + "source": [ + "ddpg_agents_tester.myenv = train_env\n", + "ddpg_agents_tester.train_agents(n_episodes=int(1.0e5), max_t=100, goal=10.0, update_every=1,\n", + " num_updates=1, batch_size=64, noise_decay=6.93e-6)\n", + "ddpg_agents.save_checkpoint(filename=CHECKPOINT_FILENAME)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": "
def plot_scores():
    """Plot the best per-agent score of every training episode.

    Reads the score history from the notebook-level ``ddpg_agents_tester`` and
    shows the resulting figure.
    """
    # One row per episode; keep the maximum over the agents of each episode:
    best_per_episode = np.max(np.vstack(ddpg_agents_tester.scores), 1)
    episode_numbers = np.arange(1, len(best_per_episode) + 1)  # episodes are numbered from 1

    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(episode_numbers, best_per_episode)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "train_env.close()\n", + "test_env.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "name": "rl39", + "language": "python", + "display_name": "rl39" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/notebooks/3DBall_parallel_environment.ipynb b/notebooks/3DBall_parallel_environment.ipynb new file mode 100644 index 0000000..a22b4f5 --- /dev/null +++ b/notebooks/3DBall_parallel_environment.ipynb @@ -0,0 +1,618 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3D Balance Ball in parallel environments\n", + "\n", + "In this notebook, we will run the [3D Balance Ball example](https://github.com/Unity-Technologies/ml-agents/blob/main/docs/Learning-Environment-Examples.md#3dball-3d-balance-ball) from [Unity ML Agents](https://unity.com/products/machine-learning-agents) in parallel environments. Please check the README file to setup this project.\n", + "\n", + "### 1. 
Start the Environment\n", + "\n", + "We begin by importing the necessary packages:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "from parallel_unity_environment import ParallelUnityEnvironment\n", + "from model import Actor, Critic\n", + "from ddpg_agents import DDPGAgents\n", + "from ddpg_agent import DDPGAgent\n", + "from replay_buffer import ReplayBuffer\n", + "from utilities import convert_to_tensor\n", + "import numpy as np\n", + "import torch\n", + "import torch.nn.functional as f\n", + "import random\n", + "from collections import deque\n", + "import time\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we will start the environment. Before running the code cell below, change the `ENV_FILE_NAME` parameter to match the location of the Unity environment that you [downloaded](README.md) or [created](../README.md#creating-a-custom-unity-executable) yourself. For example:\n", + "```\n", + "ENV_FILE_NAME = \"3DBall_Windows_x86_64/UnityEnvironment.exe\"\n", + "```\n", + "Four new windows should pop up, one for each environment. Don't worry if the windows become unresponsive." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "ENV_FILE_NAME = \"3DBall_Windows_x86_64/UnityEnvironment.exe\"\n", + "NUM_ENVS = 4 # number of environments to run in parallel\n", + "CHECKPOINT_FILENAME = \"checkpoint-3dball-parallel.pth\" # this is used for saving and loading the model\n", + "DISPLAY_SIZE = [1024, 768] # The width and height of the Unity windows\n", + "\n", + "test_env = ParallelUnityEnvironment(num_envs=NUM_ENVS, seeds=list(range(NUM_ENVS)),\n", + " file_name=ENV_FILE_NAME, no_graphics=False)\n", + "test_env.set_timescale(1.0)\n", + "test_env.set_display_size(width=DISPLAY_SIZE[0], height=DISPLAY_SIZE[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Examine the State and Action Spaces\n", + "\n", + "In this environment, an agent must balance a ball on its head for as long as possible.\n", + "\n", + "**Agent Reward Function:**\n", + "- +0.1 for every step the ball remains on its head.\n", + "- -1.0 if the ball falls off.\n", + "\n", + "**Behavior Parameters:**\n", + "- Vector Observation space: 8 variables corresponding to rotation of the agent cube, and position and velocity of ball.\n", + "- Actions: 2 continuous actions, with one value corresponding to X-rotation, and the other to Z-rotation.\n", + "\n", + "Run the code cell below to print some information about the environment:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of agents: 1\n", + "Size of each action: 2\n", + "States look like: [-0.01467304 -0.01468306 -0.52082086 4. -0.79952097 0.\n", + " 0. 0. 
def test_random_agents(env: ParallelUnityEnvironment, n_episodes: int, max_t: int):
    """Drive all parallel environments with random actions and print the episode scores.

    An episode of one environment ends when it has run for ``max_t`` steps or
    when any of its agents reports ``done``. Finished environments are reset
    before the next step, while the other environments simply continue.

    :param env: the parallel environment to act in (only behavior index 0 is used).
    :param n_episodes: total number of episodes to play, counted over all environments.
    :param max_t: maximum number of timesteps per episode.
    """
    start_time = time.time()
    # The action shape does not change between steps, so look it up only once:
    action_shape = (env.num_agents_list[0], env.behavior_specs[0].action_spec.continuous_size)
    current_episode = 0
    current_timestep_list = [0] * env.num_envs
    scores = np.zeros(env.num_envs)
    reset_list = [True] * env.num_envs  # every environment needs an initial reset
    while current_episode < n_episodes:
        # reset only when at least one environment has finished its episode:
        if any(reset_list):
            env.reset(reset_list)
            reset_list = [False] * env.num_envs

        # set actions for each environment (standard normal samples clipped to [-1, 1]):
        for env_index in range(env.num_envs):
            actions = np.clip(np.random.randn(*action_shape), -1, 1)
            env.set_actions(behavior_index=0, env_index=env_index, continuous=actions)

        # step forward in all environments:
        env.step()

        for env_index in range(env.num_envs):
            # collect experiences from environment:
            _, rewards, dones = env.get_experiences(behavior_index=0, env_index=env_index)
            scores[env_index] += rewards.squeeze()
            current_timestep_list[env_index] += 1

            # check if episode has ended:
            if current_timestep_list[env_index] >= max_t or np.any(dones):
                print(f"Score from environment {env_index}, episode {current_episode}: "
                      f"{scores[env_index]:.2f}")
                current_timestep_list[env_index] = 0
                reset_list[env_index] = True
                scores[env_index] = 0.0
                current_episode += 1
                # Several environments can finish in the same step; stop as soon
                # as the requested number of episodes has been reported.
                if current_episode >= n_episodes:
                    break

    print(f"Time elapsed: {time.time() - start_time:.2f}")
class DDPGAgentsTester:
    """Trains and evaluates a group of DDPG agents on several Unity environments in parallel.

    The environments are stepped in lockstep. Each environment keeps its own
    episode: when an episode ends, only that environment is reset. The object
    keeps its training state between calls: the replay buffer, the current
    exploration noise scale, the episode counter and the score history.
    """

    def __init__(self, ddpg_agents: DDPGAgents,
                 env: ParallelUnityEnvironment,
                 buffer_size=int(1.0e6),  # replay buffer size
                 noise_start=1.0
                 ):
        """
        :param ddpg_agents: the agents to train; ``len(ddpg_agents)`` is used as the number of agents.
        :param env: the parallel environment to interact with (only behavior index 0 is used).
        :param buffer_size: capacity handed to ``ReplayBuffer``.
        :param noise_start: initial noise scale passed to ``ddpg_agents.act``.
        """
        self.ddpg_agents = ddpg_agents
        self.env = env
        self.buffer_size = buffer_size
        self.scores = []  # one score array per finished episode
        self.scores_deque = deque(maxlen=100)  # sliding window used for the running average
        self.episode = 0
        self.noise = noise_start
        self.replay_buffer = ReplayBuffer(buffer_size)

    def _reset_finished_envs(self, reset_list, states_list):
        """Reset every environment flagged in %reset_list and fetch its first observation.

        Both lists are updated in place; all flags are False afterwards.
        """
        if not any(reset_list):
            return
        self.env.reset(reset_list)
        for env_index, reset in enumerate(reset_list):
            if reset:
                states_list[env_index] = self.env.get_observations(behavior_index=0, env_index=env_index)
                reset_list[env_index] = False

    def _act_and_step(self, states_list, noise_scale):
        """Choose actions for all environments, apply them and advance every environment by one step.

        :return: a list with the actions that were sent to each environment.
        """
        # get a batch of states from all environments (the environments form axis 1):
        env_states = np.stack(states_list, axis=1)
        # get actions from all agents:
        env_actions = self.ddpg_agents.act(convert_to_tensor(env_states), noise_scale)

        # set actions for each environment:
        actions_list = []
        for env_index in range(self.env.num_envs):
            actions = env_actions[:, env_index, :]
            self.env.set_actions(behavior_index=0, env_index=env_index, continuous=actions)
            actions_list.append(actions)

        # step forward in all environments:
        self.env.step()
        return actions_list

    def train_agents(self, n_episodes, max_t, goal=float("inf"), print_every=1000, update_every=1,
                     num_updates=1, batch_size=64, noise_decay=6.93e-6):
        """ Multi Agent Deep Deterministic Policy Gradient algorithm.

        Episodes are counted over all environments. Training stops when
        %n_episodes further episodes have finished, or earlier when the best
        average score over the last 100 episodes reaches %goal.

        Params
        ======
            n_episodes (int): maximum number of training episodes
            max_t (int): maximum number of timesteps per episode
            goal (float): the algorithm will stop when the goal is reached
            print_every (int) : print intermediate results every %print_every episodes
            update_every (int): update the neural networks every %update_every time steps
            num_updates: How many updates to do in a row
            batch_size (int): minibatch size
            noise_decay (float): noise decay factor = 1.0 - %noise_decay
        """
        # Turn the decay rate into the factor that multiplies the noise scale after every step:
        noise_decay = 1.0 - noise_decay
        stop_episode = self.episode + n_episodes
        timesteps = 0  # counts the steps of all environments together
        start_time = time.time()
        last_print_time = 0  # 0 guarantees that the first progress line is printed immediately
        num_envs = self.env.num_envs
        num_agents = len(self.ddpg_agents)
        current_timestep_list = [0] * num_envs
        scores = np.zeros((num_envs, num_agents))
        states_list = [None] * num_envs  # filled by the initial reset below
        reset_list = [True] * num_envs  # every environment needs an initial reset
        while self.episode < stop_episode:
            # reset environments if needed:
            self._reset_finished_envs(reset_list, states_list)
            actions_list = self._act_and_step(states_list, self.noise)

            for env_index in range(num_envs):
                # collect experiences from environment:
                next_states, rewards, dones = self.env.get_experiences(behavior_index=0, env_index=env_index)

                # add sample to replay buffer:
                sample = (states_list[env_index].copy(), actions_list[env_index].copy(), rewards, next_states, dones)
                self.replay_buffer.add(sample)

                # update networks every %update_every time steps, but only after the
                # replay buffer contains more than batch_size * 100 samples:
                if timesteps % update_every == 0 and len(self.replay_buffer) > batch_size * 100:
                    for _ in range(num_updates):
                        # every agent gets its own independently sampled minibatch:
                        samples = [self.replay_buffer.sample(batch_size) for _ in range(num_agents)]
                        self.ddpg_agents.step(samples)
                    # soft update the target networks towards the actual networks:
                    self.ddpg_agents.update_target_networks()

                states_list[env_index] = next_states
                self.noise *= noise_decay
                scores[env_index] += rewards.squeeze()
                current_timestep_list[env_index] += 1
                timesteps += 1

                # check if episode has ended:
                if current_timestep_list[env_index] >= max_t or np.any(dones):
                    self.scores_deque.append(scores[env_index, :].copy())
                    self.scores.append(scores[env_index, :].copy())
                    current_timestep_list[env_index] = 0
                    reset_list[env_index] = True
                    scores[env_index, :] = 0.0
                    self.episode += 1

                    average_scores = np.mean(self.scores_deque, 0)  # average score over last 100 episodes for each agent
                    # Refresh the single-line progress display at most once per second:
                    if time.time() - last_print_time > 1.0:
                        time_per_step = (time.time() - start_time) / timesteps
                        print('\rEpisode {}\tSteps: {}\tTime per step: {:.6f}\tAverage Scores: {:.3f}'
                              .format(self.episode, timesteps, time_per_step, *average_scores), end="")
                        last_print_time = time.time()
                    if self.episode % print_every == 0:
                        # blank out the progress line before printing the permanent summary line:
                        print("\r" + " " * 80, end="")
                        print('\rEpisode {}\tAverage Scores: {:.3f}'.format(self.episode, *average_scores))
                    if np.max(average_scores) >= goal:
                        print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}\tTime elapsed: {}'.format(
                            self.episode, np.max(average_scores), time.time() - start_time))
                        return
                    # Several environments can finish in the same step; never
                    # record more than the requested number of episodes.
                    if self.episode >= stop_episode:
                        return

    def test_agents(self, n_episodes, max_t):
        """Play %n_episodes evaluation episodes without exploration noise and print each score.

        Episodes are counted over all environments. The printed score is the
        one of the first agent of the environment that finished.

        :param n_episodes: total number of episodes to play
        :param max_t: maximum number of timesteps per episode
        """
        num_envs = self.env.num_envs
        current_episode = 0
        current_timestep_list = [0] * num_envs
        scores = np.zeros((num_envs, len(self.ddpg_agents)))
        states_list = [None] * num_envs  # filled by the initial reset below
        reset_list = [True] * num_envs  # every environment needs an initial reset
        while current_episode < n_episodes:
            # reset environments if needed:
            self._reset_finished_envs(reset_list, states_list)
            # Evaluation must be deterministic: act with a noise scale of zero
            # instead of the exploration noise left over from training.
            self._act_and_step(states_list, 0.0)

            for env_index in range(num_envs):
                # collect experiences from environment:
                next_states, rewards, dones = self.env.get_experiences(behavior_index=0, env_index=env_index)
                states_list[env_index] = next_states
                scores[env_index] += rewards.squeeze()
                current_timestep_list[env_index] += 1

                # check if episode has ended:
                if current_timestep_list[env_index] >= max_t or np.any(dones):
                    print(f"Score from environment {env_index}, episode {current_episode}: "
                          f"{scores[env_index, 0]:.2f}")
                    current_timestep_list[env_index] = 0
                    reset_list[env_index] = True
                    scores[env_index, :] = 0.0
                    current_episode += 1
                    # Several environments can finish in the same step; stop as
                    # soon as the requested number of episodes has been reported.
                    if current_episode >= n_episodes:
                        return
"execution_count": 8, + "outputs": [], + "source": [ + "actor1 = Actor(state_size=8, action_size=2, hidden_layer_sizes=[400, 300], activation_func=f.leaky_relu)\n", + "critic1 = Critic(state_size=8, action_size=2, hidden_layer_sizes=[400, 300], activation_func=f.leaky_relu,\n", + " inject_layer=0)\n", + "ddpg_agent1 = DDPGAgent(actor1, critic1, gamma=0.99, tau=1.0e-3, lr_actor=1.0e-4, lr_critic=1.0e-3, weight_decay=1.0e-2)\n", + "ddpg_agent_list = [ddpg_agent1]\n", + "ddpg_agents = DDPGAgents(ddpg_agent_list)\n", + "ddpg_agents_tester = DDPGAgentsTester(ddpg_agents, train_env, buffer_size=int(1.0e6), noise_start=1.0)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "You can skip this cell, if you don't want to train the agent from scratch. It may take 30 to 45 minutes:" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Episode 1000\tAverage Scores: 1.270 \n", + "Episode 2000\tAverage Scores: 1.220 \n", + "Episode 3000\tAverage Scores: 1.101 \n", + "Episode 4000\tAverage Scores: 2.310 \n", + "Episode 5000\tAverage Scores: 5.038 \n", + "Episode 6000\tAverage Scores: 4.930 \n", + "Episode 6708\tSteps: 234123\tTime per step: 0.009527\tAverage Scores: 9.930\n", + "Environment solved in 6710 episodes!\tAverage Score: 10.00\tTime elapsed: 2231.088708639145\n" + ] + } + ], + "source": [ + "ddpg_agents_tester.env = train_env\n", + "ddpg_agents_tester.train_agents(n_episodes=int(1.0e5), max_t=100, goal=10.0, update_every=1,\n", + " num_updates=1, batch_size=64, noise_decay=6.93e-6)\n", + "ddpg_agents.save_checkpoint(filename=CHECKPOINT_FILENAME)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + 
}, + "outputs": [ + { + "data": { + "text/plain": "
def plot_scores():
    """Plot the best per-agent score of every training episode.

    Reads the score history from the notebook-level ``ddpg_agents_tester`` and
    shows the resulting figure.
    """
    # One row per episode; keep the maximum over the agents of each episode:
    best_per_episode = np.max(np.vstack(ddpg_agents_tester.scores), 1)
    episode_numbers = np.arange(1, len(best_per_episode) + 1)  # episodes are numbered from 1

    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.plot(episode_numbers, best_per_episode)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()
}, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "When finished, you can close the environments:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "train_env.close()\n", + "test_env.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "name": "rl39", + "language": "python", + "display_name": "rl39" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/notebooks/README.md b/notebooks/README.md new file mode 100644 index 0000000..55094ff --- /dev/null +++ b/notebooks/README.md @@ -0,0 +1,7 @@ +# Unity Executables: + +Here are some download links for the Unity Executables: + +- [3DBall Windows 64](https://www.dropbox.com/s/hydogwlr9ok0nbb/3DBall_Windows_x86_64.zip?dl=1) + +If you're not on Windows, you have to [create the executable](../README.md#creating-a-custom-unity-executable) yourself. Btw. you can help this project by sharing it here. 
class ParallelUnityEnvironment:
    """Runs several ``MyUnityEnvironment`` instances side by side on a thread pool.

    Creating, resetting, stepping and closing are dispatched to the pool so
    that the environments work concurrently. Observations, actions and
    experiences are exchanged per environment through ``env_index``.
    """

    def __init__(self, num_envs: int, seeds: List[int], file_name=None, no_graphics=False, worked_id_start=0):
        """
        :param num_envs: number of environments to run in parallel
        :param seeds: a list of random seeds for each environment
        :param file_name: The filename of the Unity executable, or None when using the Unity editor
            (press Play to connect).
        :param no_graphics: Whether to use graphics windows or not.
        :param worked_id_start: The id of the first Unity thread to create.
            For example, a value of 4 would create threads with ids: 4, 5, 6 etc.
        :raises ValueError: if %num_envs is smaller than 1 or does not match the number of seeds.
        """
        if num_envs < 1:
            raise ValueError("num_envs must be at least 1, got {}".format(num_envs))
        if len(seeds) != num_envs:
            raise ValueError("Expected one seed per environment: got {} seeds for {} environments"
                             .format(len(seeds), num_envs))

        def _init_env(_file_name, _no_graphics, _seed, _worker_id):
            return MyUnityEnvironment(file_name=_file_name, no_graphics=_no_graphics, seed=_seed, worker_id=_worker_id)

        self.num_envs = num_envs
        self.executor = ThreadPoolExecutor(max_workers=num_envs + 2, thread_name_prefix="Unity_")
        # Start all environments concurrently; the worker ids are consecutive, starting at %worked_id_start:
        self.futures: List[Future[Any]] = [
            self.executor.submit(_init_env, file_name, no_graphics, seed, worker_id)
            for seed, worker_id in zip(seeds, range(worked_id_start, worked_id_start + num_envs))]
        # result() blocks until the environment exists and re-raises any start-up error:
        self.envs: List[MyUnityEnvironment] = [future.result() for future in self.futures]
        # The specs of the first environment are exposed as the specs of the whole group:
        self.behavior_names = self.envs[0].behavior_names
        self.behavior_specs = self.envs[0].behavior_specs
        self.num_agents_list = self.envs[0].num_agents_list

    def _run_in_parallel(self, method_name: str, env_indices):
        """Call ``envs[i].<method_name>()`` concurrently for every i in %env_indices.

        Blocks until all calls have finished and then re-raises the first
        exception raised by a worker, if any, instead of silently dropping it.
        """
        pending = []
        for env_index in env_indices:
            future = self.executor.submit(getattr(self.envs[env_index], method_name))
            self.futures[env_index] = future
            pending.append(future)
        concurrent.futures.wait(pending)
        for future in pending:
            future.result()  # wait() never raises; result() surfaces worker errors

    def set_timescale(self, time_scale: float):
        """ Set the timescale at which the physics simulation runs.

        :param time_scale: a value of 1.0 means the simulation runs in realtime.
        """
        for env in self.envs:
            env.set_timescale(time_scale=time_scale)

    def set_display_size(self, width: int, height: int):
        """ Set the display size of every environment.

        :param width: forwarded to each environment as ``width``
        :param height: forwarded to each environment as ``height``
        """
        for env in self.envs:
            env.set_display_size(width=width, height=height)

    def reset(self, reset_list: List[bool]):
        """Resets all environments where reset_list[env_index] == True """
        flagged = [env_index for env_index, reset in enumerate(reset_list) if reset]
        self._run_in_parallel("reset", flagged)

    def get_observations(self, behavior_index: int, env_index: int):
        """ Get observations for environment %env_index with behavior %behavior_index.

        :return: np.ndarray[num_agents, observation_size]"""

        return self.envs[env_index].get_observations(behavior_index)

    def set_actions(self, behavior_index: int, env_index: int, continuous: Optional[np.ndarray] = None,
                    discrete: Optional[np.ndarray] = None):
        """ Set the actions for environment %env_index with behavior %behavior_index.

        :param continuous: continuous actions, forwarded unchanged to the environment
        :param discrete: discrete actions, forwarded unchanged to the environment
        """
        self.envs[env_index].set_actions(behavior_index, continuous, discrete)

    def get_experiences(self, behavior_index: int, env_index: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """ Get experiences for environment %env_index with behavior %behavior_index.

        :param behavior_index:
        :param env_index:
        :return: Tuple of (observations, rewards, dones). Each element is ndarray[num_agents, *]
        """
        return self.envs[env_index].get_experiences(behavior_index)

    def step(self):
        """ Step forward in all environments."""
        self._run_in_parallel("step", range(len(self.envs)))

    def close(self):
        """ Close all environments and shut down the thread pool."""
        try:
            self._run_in_parallel("close", range(len(self.envs)))
        finally:
            # release the worker threads even when closing an environment failed:
            self.executor.shutdown()
experience tuples.""" + + def __init__(self, size: int): + """Initialize a ReplayBuffer object. + + :param size: maximum size of buffer + """ + self.deque: Deque[Tuple[np.ndarray, ...]] = deque(maxlen=size) + + def add(self, sample: Tuple[np.ndarray, ...]): + """Add a new sample to the buffer.""" + self.deque.append(sample) + + def sample(self, batch_size: int) -> Tuple[torch.Tensor, ...]: + """Randomly sample a batch of samples from the buffer.""" + samples = random.sample(self.deque, k=batch_size) + samples_transposed = tuple(zip(*samples)) + return tuple(convert_to_tensor(np.stack(np_array_list)) for np_array_list in samples_transposed) + + def __len__(self): + """Return the current size of the buffer.""" + return len(self.deque) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..a3eb1d9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +numpy +torch +matplotlib diff --git a/utilities.py b/utilities.py new file mode 100644 index 0000000..75a244c --- /dev/null +++ b/utilities.py @@ -0,0 +1,21 @@ +from pytorch_device import pytorch_device +import torch +import numpy as np + + +def convert_to_tensor(x) -> torch.Tensor: + if isinstance(x, np.ndarray): + return torch.from_numpy(x).float().to(pytorch_device) + elif isinstance(x, torch.Tensor): + return x + else: + return torch.tensor(x).float().to(pytorch_device) + + +def convert_to_numpy(x) -> np.ndarray: + if isinstance(x, torch.Tensor): + return x.detach().cpu().numpy() + elif isinstance(x, np.ndarray): + return x + else: + return np.array(x)