From a302479f5835c5257b935d712ad4f46555c1739c Mon Sep 17 00:00:00 2001
From: Euijin Jeong
Date: Wed, 10 Jun 2020 16:37:24 +0900
Subject: [PATCH] Replace prioritized_replay_buffer with PrioritizedBufferWrapper
 (#233)

* Add PER wrapper

* Add descriptions & Change config parameter

* Delete prioritized_replay_buffer & Add descriptions

* Change minor parameter names & descriptions

* Fix issues commented in review
---
 rl_algorithms/common/abstract/buffer.py       | 53 ++++++++++++++++
 rl_algorithms/common/buffer/replay_buffer.py  | 33 +++++-----
 ...priortized_replay_buffer.py => wrapper.py} | 61 ++++++-----------
 rl_algorithms/dqn/agent.py                    | 11 ++--
 rl_algorithms/fd/ddpg_agent.py                | 15 +++--
 rl_algorithms/fd/dqn_agent.py                 | 15 +++--
 rl_algorithms/fd/sac_agent.py                 | 15 +++--
 rl_algorithms/per/ddpg_agent.py               | 12 ++--
 8 files changed, 130 insertions(+), 85 deletions(-)
 create mode 100644 rl_algorithms/common/abstract/buffer.py
 rename rl_algorithms/common/buffer/{priortized_replay_buffer.py => wrapper.py} (68%)

diff --git a/rl_algorithms/common/abstract/buffer.py b/rl_algorithms/common/abstract/buffer.py
new file mode 100644
index 00000000..4184d5bd
--- /dev/null
+++ b/rl_algorithms/common/abstract/buffer.py
@@ -0,0 +1,53 @@
+# -*- coding: utf-8 -*-
+"""Abstract Buffer & BufferWrapper class.
+
+- Author: Euijin Jeong
+- Contact: euijin.jeong@medipixel.io
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any, Tuple
+
+import numpy as np
+
+
+class BaseBuffer(ABC):
+    """Abstract Buffer used for replay buffer."""
+
+    @abstractmethod
+    def add(self, transition: Tuple[Any, ...]) -> Tuple[Any, ...]:
+        pass
+
+    @abstractmethod
+    def sample(self) -> Tuple[np.ndarray, ...]:
+        pass
+
+    @abstractmethod
+    def __len__(self) -> int:
+        pass
+
+
+class BufferWrapper(BaseBuffer):
+    """Abstract BufferWrapper used for buffer wrapper.
+
+    Attributes:
+        buffer (Buffer): Hold replay buffer as an attribute
+    """
+
+    def __init__(self, base_buffer: BaseBuffer):
+        """Initialize a BufferWrapper object.
+
+        Args:
+            base_buffer (BaseBuffer): ReplayBuffer to be wrapped
+        """
+        self.buffer = base_buffer
+
+    def add(self, transition: Tuple[Any, ...]) -> Tuple[Any, ...]:
+        return self.buffer.add(transition)
+
+    def sample(self) -> Tuple[np.ndarray, ...]:
+        return self.buffer.sample()
+
+    def __len__(self) -> int:
+        """Return the current size of internal memory."""
+        return len(self.buffer)
diff --git a/rl_algorithms/common/buffer/replay_buffer.py b/rl_algorithms/common/buffer/replay_buffer.py
index 132bf350..5811d5d1 100644
--- a/rl_algorithms/common/buffer/replay_buffer.py
+++ b/rl_algorithms/common/buffer/replay_buffer.py
@@ -6,10 +6,11 @@
 
 import numpy as np
 
+from rl_algorithms.common.abstract.buffer import BaseBuffer
 from rl_algorithms.common.helper_functions import get_n_step_info
 
 
-class ReplayBuffer:
+class ReplayBuffer(BaseBuffer):
     """Fixed-size buffer to store experience tuples.
 
     Attributes:
@@ -21,7 +22,7 @@ class ReplayBuffer:
         n_step_buffer (deque): recent n transitions
         n_step (int): step size for n-step transition
         gamma (float): discount factor
-        buffer_size (int): size of buffers
+        max_len (int): size of buffers
         batch_size (int): batch size for training
         demo_size (int): size of demo transitions
         length (int): amount of memory filled
@@ -30,7 +31,7 @@ class ReplayBuffer:
 
     def __init__(
         self,
-        buffer_size: int,
+        max_len: int,
         batch_size: int,
         gamma: float = 0.99,
         n_step: int = 1,
@@ -39,15 +40,15 @@ def __init__(
         """Initialize a ReplayBuffer object.
Args: - buffer_size (int): size of replay buffer for experience + max_len (int): size of replay buffer for experience batch_size (int): size of a batched sampled from replay buffer for training gamma (float): discount factor n_step (int): step size for n-step transition demo (list): transitions of human play """ - assert 0 < batch_size <= buffer_size + assert 0 < batch_size <= max_len assert 0.0 <= gamma <= 1.0 - assert 1 <= n_step <= buffer_size + assert 1 <= n_step <= max_len self.obs_buf: np.ndarray = None self.acts_buf: np.ndarray = None @@ -59,7 +60,7 @@ def __init__( self.n_step = n_step self.gamma = gamma - self.buffer_size = buffer_size + self.max_len = max_len self.batch_size = batch_size self.demo_size = len(demo) if demo else 0 self.demo = demo @@ -68,7 +69,7 @@ def __init__( # demo may have empty tuple list [()] if self.demo and self.demo[0]: - self.buffer_size += self.demo_size + self.max_len += self.demo_size self.length += self.demo_size for idx, d in enumerate(self.demo): state, action, reward, next_state, done = d @@ -112,8 +113,8 @@ def add( self.done_buf[self.idx] = done self.idx += 1 - self.idx = self.demo_size if self.idx % self.buffer_size == 0 else self.idx - self.length = min(self.length + 1, self.buffer_size) + self.idx = self.demo_size if self.idx % self.max_len == 0 else self.idx + self.length = min(self.length + 1, self.max_len) # return a single step transition to insert to replay buffer return self.n_step_buffer[0] @@ -143,17 +144,15 @@ def sample(self, indices: List[int] = None) -> Tuple[np.ndarray, ...]: def _initialize_buffers(self, state: np.ndarray, action: np.ndarray) -> None: """Initialze buffers for state, action, resward, next_state, done.""" # In case action of demo is not np.ndarray - self.obs_buf = np.zeros( - [self.buffer_size] + list(state.shape), dtype=state.dtype - ) + self.obs_buf = np.zeros([self.max_len] + list(state.shape), dtype=state.dtype) self.acts_buf = np.zeros( - [self.buffer_size] + list(action.shape), dtype=action.dtype + [self.max_len] + list(action.shape), dtype=action.dtype ) - self.rews_buf = np.zeros([self.buffer_size], dtype=float) + self.rews_buf = np.zeros([self.max_len], dtype=float) self.next_obs_buf = np.zeros( - [self.buffer_size] + list(state.shape), dtype=state.dtype + [self.max_len] + list(state.shape), dtype=state.dtype ) - self.done_buf = np.zeros([self.buffer_size], dtype=float) + self.done_buf = np.zeros([self.max_len], dtype=float) def __len__(self) -> int: """Return the current size of internal memory.""" diff --git a/rl_algorithms/common/buffer/priortized_replay_buffer.py b/rl_algorithms/common/buffer/wrapper.py similarity index 68% rename from rl_algorithms/common/buffer/priortized_replay_buffer.py rename to rl_algorithms/common/buffer/wrapper.py index e51eabe9..c90123b8 100644 --- a/rl_algorithms/common/buffer/priortized_replay_buffer.py +++ b/rl_algorithms/common/buffer/wrapper.py @@ -1,31 +1,33 @@ # -*- coding: utf-8 -*- -"""Prioritized Replay buffer for algorithms. +"""Wrappers for buffer. 
-- Author: Kyunghwan Kim
-- Contact: kh.kim@medipixel.io
+- Author: Kyunghwan Kim & Euijin Jeong
+- Contact: kh.kim@medipixel.io & euijin.jeong@medipixel.io
 - Paper: https://arxiv.org/pdf/1511.05952.pdf
          https://arxiv.org/pdf/1707.08817.pdf
 """
 
 import random
-from typing import Any, List, Tuple
+from typing import Any, Tuple
 
 import numpy as np
 import torch
 
-from rl_algorithms.common.buffer.replay_buffer import ReplayBuffer
+from rl_algorithms.common.abstract.buffer import BaseBuffer, BufferWrapper
 from rl_algorithms.common.buffer.segment_tree import MinSegmentTree, SumSegmentTree
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 
-class PrioritizedReplayBuffer(ReplayBuffer):
-    """Create Prioritized Replay buffer.
+class PrioritizedBufferWrapper(BufferWrapper):
+    """Prioritized Experience Replay wrapper for Buffer.
+
     Refer to OpenAI baselines github repository:
     https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py
 
     Attributes:
+        buffer (Buffer): Hold replay buffer as an attribute
         alpha (float): alpha parameter for prioritized replay buffer
         epsilon_d (float): small positive constants to add to the priorities
         tree_idx (int): next index of tree
@@ -35,26 +37,17 @@ class PrioritizedReplayBuffer(ReplayBuffer):
     """
 
     def __init__(
-        self,
-        buffer_size: int,
-        batch_size: int = 32,
-        gamma: float = 0.99,
-        n_step: int = 1,
-        alpha: float = 0.6,
-        epsilon_d: float = 1.0,
-        demo: List[Tuple[np.ndarray, np.ndarray, float, np.ndarray, bool]] = None,
+        self, base_buffer: BaseBuffer, alpha: float = 0.6, epsilon_d: float = 1.0
     ):
         """Initialize.
 
         Args:
-            buffer_size (int): size of replay buffer for experience
-            batch_size (int): size of a batched sampled from replay buffer for training
+            base_buffer (BaseBuffer): ReplayBuffer to be wrapped
             alpha (float): alpha parameter for prioritized replay buffer
+            epsilon_d (float): small positive constants to add to the priorities
         """
-        super(PrioritizedReplayBuffer, self).__init__(
-            buffer_size, batch_size, gamma, n_step, demo
-        )
+        BufferWrapper.__init__(self, base_buffer)
         assert alpha >= 0
         self.alpha = alpha
         self.epsilon_d = epsilon_d
 
         # capacity must be positive and a power of 2.
tree_capacity = 1 - while tree_capacity < self.buffer_size: + while tree_capacity < self.buffer.max_len: tree_capacity *= 2 self.sum_tree = SumSegmentTree(tree_capacity) @@ -70,8 +63,8 @@ def __init__( self._max_priority = 1.0 # for init priority of demo - self.tree_idx = self.demo_size - for i in range(self.demo_size): + self.tree_idx = self.buffer.demo_size + for i in range(self.buffer.demo_size): self.sum_tree[i] = self._max_priority ** self.alpha self.min_tree[i] = self._max_priority ** self.alpha @@ -79,21 +72,21 @@ def add( self, transition: Tuple[np.ndarray, np.ndarray, float, np.ndarray, bool] ) -> Tuple[Any, ...]: """Add experience and priority.""" - n_step_transition = super().add(transition) + n_step_transition = self.buffer.add(transition) if n_step_transition: self.sum_tree[self.tree_idx] = self._max_priority ** self.alpha self.min_tree[self.tree_idx] = self._max_priority ** self.alpha self.tree_idx += 1 - if self.tree_idx % self.buffer_size == 0: - self.tree_idx = self.demo_size + if self.tree_idx % self.buffer.max_len == 0: + self.tree_idx = self.buffer.demo_size return n_step_transition def _sample_proportional(self, batch_size: int) -> list: """Sample indices based on proportional.""" indices = [] - p_total = self.sum_tree.sum(0, len(self) - 1) + p_total = self.sum_tree.sum(0, len(self.buffer) - 1) segment = p_total / batch_size for i in range(batch_size): @@ -106,21 +99,21 @@ def _sample_proportional(self, batch_size: int) -> list: def sample(self, beta: float = 0.4) -> Tuple[torch.Tensor, ...]: """Sample a batch of experiences.""" - assert len(self) >= self.batch_size + assert len(self.buffer) >= self.buffer.batch_size assert beta > 0 - indices = self._sample_proportional(self.batch_size) + indices = self._sample_proportional(self.buffer.batch_size) # get max weight p_min = self.min_tree.min() / self.sum_tree.sum() - max_weight = (p_min * len(self)) ** (-beta) + max_weight = (p_min * len(self.buffer)) ** (-beta) # calculate weights weights_, eps_d = [], [] for i in indices: - eps_d.append(self.epsilon_d if i < self.demo_size else 0.0) + eps_d.append(self.epsilon_d if i < self.buffer.demo_size else 0.0) p_sample = self.sum_tree[i] / self.sum_tree.sum() - weight = (p_sample * len(self)) ** (-beta) + weight = (p_sample * len(self.buffer)) ** (-beta) weights_.append(weight / max_weight) weights = np.array(weights_) @@ -128,7 +121,7 @@ def sample(self, beta: float = 0.4) -> Tuple[torch.Tensor, ...]: weights = weights.reshape(-1, 1) - states, actions, rewards, next_states, dones = super().sample(indices) + states, actions, rewards, next_states, dones = self.buffer.sample(indices) return states, actions, rewards, next_states, dones, weights, indices, eps_d @@ -138,7 +131,7 @@ def update_priorities(self, indices: list, priorities: np.ndarray): for idx, priority in zip(indices, priorities): assert priority > 0 - assert 0 <= idx < len(self) + assert 0 <= idx < len(self.buffer) self.sum_tree[idx] = priority ** self.alpha self.min_tree[idx] = priority ** self.alpha diff --git a/rl_algorithms/dqn/agent.py b/rl_algorithms/dqn/agent.py index dcc99b8b..b6fd0730 100644 --- a/rl_algorithms/dqn/agent.py +++ b/rl_algorithms/dqn/agent.py @@ -23,8 +23,8 @@ import wandb from rl_algorithms.common.abstract.agent import Agent -from rl_algorithms.common.buffer.priortized_replay_buffer import PrioritizedReplayBuffer from rl_algorithms.common.buffer.replay_buffer import ReplayBuffer +from rl_algorithms.common.buffer.wrapper import PrioritizedBufferWrapper from rl_algorithms.common.helper_functions 
import numpy2floattensor from rl_algorithms.dqn.learner import DQNLearner from rl_algorithms.registry import AGENTS @@ -105,10 +105,11 @@ def _initialize(self): """Initialize non-common things.""" if not self.args.test: # replay memory for a single step - self.memory = PrioritizedReplayBuffer( - self.hyper_params.buffer_size, - self.hyper_params.batch_size, - alpha=self.hyper_params.per_alpha, + self.memory = ReplayBuffer( + self.hyper_params.buffer_size, self.hyper_params.batch_size, + ) + self.memory = PrioritizedBufferWrapper( + self.memory, alpha=self.hyper_params.per_alpha ) # replay memory for multi-steps diff --git a/rl_algorithms/fd/ddpg_agent.py b/rl_algorithms/fd/ddpg_agent.py index 1a91f678..20b865a2 100644 --- a/rl_algorithms/fd/ddpg_agent.py +++ b/rl_algorithms/fd/ddpg_agent.py @@ -15,8 +15,8 @@ import numpy as np import torch -from rl_algorithms.common.buffer.priortized_replay_buffer import PrioritizedReplayBuffer from rl_algorithms.common.buffer.replay_buffer import ReplayBuffer +from rl_algorithms.common.buffer.wrapper import PrioritizedBufferWrapper import rl_algorithms.common.helper_functions as common_utils from rl_algorithms.common.helper_functions import numpy2floattensor from rl_algorithms.ddpg.agent import DDPGAgent @@ -56,7 +56,7 @@ def _initialize(self): # replay memory for multi-steps self.memory_n = ReplayBuffer( - buffer_size=self.hyper_params.buffer_size, + max_len=self.hyper_params.buffer_size, batch_size=self.hyper_params.batch_size, n_step=self.hyper_params.n_step, gamma=self.hyper_params.gamma, @@ -64,12 +64,11 @@ def _initialize(self): ) # replay memory for a single step - self.memory = PrioritizedReplayBuffer( - self.hyper_params.buffer_size, - self.hyper_params.batch_size, - demo=demos, - alpha=self.hyper_params.per_alpha, - epsilon_d=self.hyper_params.per_eps_demo, + self.memory = ReplayBuffer( + self.hyper_params.buffer_size, self.hyper_params.batch_size, + ) + self.memory = PrioritizedBufferWrapper( + self.memory, alpha=self.hyper_params.per_alpha ) self.learner = DDPGfDLearner( diff --git a/rl_algorithms/fd/dqn_agent.py b/rl_algorithms/fd/dqn_agent.py index eda118b6..5ec247bb 100644 --- a/rl_algorithms/fd/dqn_agent.py +++ b/rl_algorithms/fd/dqn_agent.py @@ -13,8 +13,8 @@ import torch import wandb -from rl_algorithms.common.buffer.priortized_replay_buffer import PrioritizedReplayBuffer from rl_algorithms.common.buffer.replay_buffer import ReplayBuffer +from rl_algorithms.common.buffer.wrapper import PrioritizedBufferWrapper import rl_algorithms.common.helper_functions as common_utils from rl_algorithms.dqn.agent import DQNAgent from rl_algorithms.fd.dqn_learner import DQfDLearner @@ -45,7 +45,7 @@ def _initialize(self): ) self.memory_n = ReplayBuffer( - buffer_size=self.hyper_params.buffer_size, + max_len=self.hyper_params.buffer_size, batch_size=self.hyper_params.batch_size, n_step=self.hyper_params.n_step, gamma=self.hyper_params.gamma, @@ -53,12 +53,11 @@ def _initialize(self): ) # replay memory - self.memory = PrioritizedReplayBuffer( - self.hyper_params.buffer_size, - self.hyper_params.batch_size, - demo=demos, - alpha=self.hyper_params.per_alpha, - epsilon_d=self.hyper_params.per_eps_demo, + self.memory = ReplayBuffer( + self.hyper_params.buffer_size, self.hyper_params.batch_size, + ) + self.memory = PrioritizedBufferWrapper( + self.memory, alpha=self.hyper_params.per_alpha ) self.learner = DQfDLearner( diff --git a/rl_algorithms/fd/sac_agent.py b/rl_algorithms/fd/sac_agent.py index d47129f3..0baf2293 100644 --- a/rl_algorithms/fd/sac_agent.py 
+++ b/rl_algorithms/fd/sac_agent.py @@ -16,8 +16,8 @@ import numpy as np import torch -from rl_algorithms.common.buffer.priortized_replay_buffer import PrioritizedReplayBuffer from rl_algorithms.common.buffer.replay_buffer import ReplayBuffer +from rl_algorithms.common.buffer.wrapper import PrioritizedBufferWrapper import rl_algorithms.common.helper_functions as common_utils from rl_algorithms.common.helper_functions import numpy2floattensor from rl_algorithms.fd.sac_learner import SACfDLearner @@ -56,7 +56,7 @@ def _initialize(self): # replay memory for multi-steps self.memory_n = ReplayBuffer( - buffer_size=self.hyper_params.buffer_size, + max_len=self.hyper_params.buffer_size, batch_size=self.hyper_params.batch_size, n_step=self.hyper_params.n_step, gamma=self.hyper_params.gamma, @@ -64,12 +64,11 @@ def _initialize(self): ) # replay memory - self.memory = PrioritizedReplayBuffer( - self.hyper_params.buffer_size, - self.hyper_params.batch_size, - demo=demos, - alpha=self.hyper_params.per_alpha, - epsilon_d=self.hyper_params.per_eps_demo, + self.memory = ReplayBuffer( + self.hyper_params.buffer_size, self.hyper_params.batch_size, + ) + self.memory = PrioritizedBufferWrapper( + self.memory, alpha=self.hyper_params.per_alpha ) self.learner = SACfDLearner( diff --git a/rl_algorithms/per/ddpg_agent.py b/rl_algorithms/per/ddpg_agent.py index 16d186c0..74a17f51 100644 --- a/rl_algorithms/per/ddpg_agent.py +++ b/rl_algorithms/per/ddpg_agent.py @@ -12,7 +12,8 @@ import torch import torch.nn as nn -from rl_algorithms.common.buffer.priortized_replay_buffer import PrioritizedReplayBuffer +from rl_algorithms.common.buffer.replay_buffer import ReplayBuffer +from rl_algorithms.common.buffer.wrapper import PrioritizedBufferWrapper import rl_algorithms.common.helper_functions as common_utils from rl_algorithms.ddpg.agent import DDPGAgent from rl_algorithms.registry import AGENTS @@ -37,10 +38,11 @@ def _initialize(self): if not self.args.test: # replay memory - self.memory = PrioritizedReplayBuffer( - self.hyper_params.buffer_size, - self.hyper_params.batch_size, - alpha=self.hyper_params.per_alpha, + self.memory = ReplayBuffer( + self.hyper_params.buffer_size, self.hyper_params.batch_size, + ) + self.memory = PrioritizedBufferWrapper( + self.memory, alpha=self.hyper_params.per_alpha ) def update_model(self) -> Tuple[torch.Tensor, ...]:
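
Usage sketch, not part of the patch itself: the snippet below mirrors the composition
pattern the agents above now follow, i.e. build a plain ReplayBuffer first and hand it
to PrioritizedBufferWrapper, which layers the segment-tree priority bookkeeping on top.
The buffer_size and batch_size values and the dummy transitions are illustrative
placeholders, not values taken from the repository configs; alpha=0.6 and beta=0.4 are
simply the defaults declared in wrapper.py.

# Minimal usage sketch of the classes introduced in this patch.
import numpy as np

from rl_algorithms.common.buffer.replay_buffer import ReplayBuffer
from rl_algorithms.common.buffer.wrapper import PrioritizedBufferWrapper

buffer_size, batch_size = 10000, 32  # illustrative sizes

# Plain FIFO storage first, then the PER wrapper adds priority bookkeeping on top.
memory = ReplayBuffer(buffer_size, batch_size)
memory = PrioritizedBufferWrapper(memory, alpha=0.6)

# Transitions go through the wrapper so new entries receive the current max priority.
for _ in range(batch_size):
    state = np.zeros(4, dtype=np.float32)
    action = np.zeros(1, dtype=np.float32)
    memory.add((state, action, 0.0, state, False))

# sample() returns the usual batch plus PER extras: importance-sampling weights,
# tree indices, and per-sample epsilon_d terms.
states, actions, rewards, next_states, dones, weights, indices, eps_d = memory.sample(beta=0.4)

# After computing TD errors, refresh the priorities at the sampled indices
# (dummy positive priorities here, standing in for |TD error| + eps).
memory.update_priorities(indices, np.abs(np.random.randn(len(indices))) + 1e-6)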