
Commit b71152e

Adds support for Hindsight Experience Replay (HER) (openai#299)
* Add Hindsight Experience Replay (HER)
* Minor improvements
1 parent df2e846 commit b71152e

23 files changed (+1740, −37 lines)

README.md  (+1)

@@ -20,6 +20,7 @@ pip install -e .
 - [DDPG](baselines/ddpg)
 - [DQN](baselines/deepq)
 - [GAIL](baselines/gail)
+- [HER](baselines/her)
 - [PPO1](baselines/ppo1) (Multi-CPU using MPI)
 - [PPO2](baselines/ppo2) (Optimized for GPU)
 - [TRPO](baselines/trpo_mpi)

baselines/a2c/utils.py  (+17, −5)

@@ -39,12 +39,24 @@ def _ortho_init(shape, dtype, partition_info=None):
         return (scale * q[:shape[0], :shape[1]]).astype(np.float32)
     return _ortho_init
 
-def conv(x, scope, *, nf, rf, stride, pad='VALID', init_scale=1.0):
+def conv(x, scope, *, nf, rf, stride, pad='VALID', init_scale=1.0, data_format='NHWC'):
+    if data_format == 'NHWC':
+        channel_ax = 3
+        strides = [1, stride, stride, 1]
+        bshape = [1, 1, 1, nf]
+    elif data_format == 'NCHW':
+        channel_ax = 1
+        strides = [1, 1, stride, stride]
+        bshape = [1, nf, 1, 1]
+    else:
+        raise NotImplementedError
+    nin = x.get_shape()[channel_ax].value
+    wshape = [rf, rf, nin, nf]
     with tf.variable_scope(scope):
-        nin = x.get_shape()[3].value
-        w = tf.get_variable("w", [rf, rf, nin, nf], initializer=ortho_init(init_scale))
-        b = tf.get_variable("b", [nf], initializer=tf.constant_initializer(0.0))
-        return tf.nn.conv2d(x, w, strides=[1, stride, stride, 1], padding=pad)+b
+        w = tf.get_variable("w", wshape, initializer=ortho_init(init_scale))
+        b = tf.get_variable("b", [1, nf, 1, 1], initializer=tf.constant_initializer(0.0))
+        if data_format == 'NHWC': b = tf.reshape(b, bshape)
+        return b + tf.nn.conv2d(x, w, strides=strides, padding=pad, data_format=data_format)
 
 def fc(x, scope, nh, *, init_scale=1.0, init_bias=0.0):
     with tf.variable_scope(scope):
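For context (not part of the commit): a minimal sketch of how the new `data_format` argument might be used with channels-first inputs. The input shape and scope name are illustrative.

```python
# Sketch only: calling the updated conv() with channels-first (NCHW) input.
import tensorflow as tf
from baselines.a2c.utils import conv

# Batch of 84x84 frames with 4 stacked channels, channels-first layout (illustrative shape).
x = tf.placeholder(tf.float32, [None, 4, 84, 84])
h = conv(x, 'c1', nf=32, rf=8, stride=4, data_format='NCHW')  # output stays in NCHW layout
```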

baselines/bench/monitor.py  (+7, −2)

@@ -7,12 +7,13 @@
 import csv
 import os.path as osp
 import json
+import numpy as np
 
 class Monitor(Wrapper):
     EXT = "monitor.csv"
     f = None
 
-    def __init__(self, env, filename, allow_early_resets=False, reset_keywords=()):
+    def __init__(self, env, filename, allow_early_resets=False, reset_keywords=(), info_keywords=()):
         Wrapper.__init__(self, env=env)
         self.tstart = time.time()
         if filename is None:
@@ -26,10 +27,12 @@ def __init__(self, env, filename, allow_early_resets=False, reset_keywords=()):
                     filename = filename + "." + Monitor.EXT
             self.f = open(filename, "wt")
             self.f.write('#%s\n'%json.dumps({"t_start": self.tstart, 'env_id' : env.spec and env.spec.id}))
-            self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+reset_keywords)
+            self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+reset_keywords+info_keywords)
             self.logger.writeheader()
+            self.f.flush()
 
         self.reset_keywords = reset_keywords
+        self.info_keywords = info_keywords
         self.allow_early_resets = allow_early_resets
         self.rewards = None
         self.needs_reset = True
@@ -61,6 +64,8 @@ def step(self, action):
             eprew = sum(self.rewards)
             eplen = len(self.rewards)
             epinfo = {"r": round(eprew, 6), "l": eplen, "t": round(time.time() - self.tstart, 6)}
+            for k in self.info_keywords:
+                epinfo[k] = info[k]
            self.episode_rewards.append(eprew)
            self.episode_lengths.append(eplen)
            self.episode_times.append(time.time() - self.tstart)
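A minimal sketch (not part of the commit) of the new `info_keywords` argument; the environment id and log path are illustrative, and the wrapped env is assumed to report `is_success` in its `info` dict.

```python
# Sketch only: record a per-episode key from the env's `info` dict in the monitor CSV.
import gym
from baselines.bench import Monitor

env = gym.make('FetchReach-v0')  # illustrative; any env that reports 'is_success' in info
env = Monitor(env, '/tmp/exp/0', info_keywords=('is_success',))
# Each completed episode now logs 'r', 'l', 't', and 'is_success' to /tmp/exp/0.monitor.csv.
```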

baselines/common/cmd_util.py  (+25, −1)

@@ -4,6 +4,7 @@
 
 import os
 import gym
+from gym.wrappers import FlattenDictWrapper
 from baselines import logger
 from baselines.bench import Monitor
 from baselines.common import set_global_seeds
@@ -36,6 +37,19 @@ def make_mujoco_env(env_id, seed):
     env.seed(seed)
     return env
 
+def make_robotics_env(env_id, seed, rank=0):
+    """
+    Create a wrapped, monitored gym.Env for the MuJoCo-based robotics environments.
+    """
+    set_global_seeds(seed)
+    env = gym.make(env_id)
+    env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
+    env = Monitor(
+        env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
+        info_keywords=('is_success',))
+    env.seed(seed)
+    return env
+
 def arg_parser():
     """
     Create an empty argparse.ArgumentParser.
@@ -58,7 +72,17 @@ def mujoco_arg_parser():
     Create an argparse.ArgumentParser for run_mujoco.py.
     """
     parser = arg_parser()
-    parser.add_argument('--env', help='environment ID', type=str, default="Reacher-v1")
+    parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2')
+    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
+    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
+    return parser
+
+def robotics_arg_parser():
+    """
+    Create an argparse.ArgumentParser for the robotics experiments.
+    """
+    parser = arg_parser()
+    parser.add_argument('--env', help='environment ID', type=str, default='FetchReach-v0')
     parser.add_argument('--seed', help='RNG seed', type=int, default=0)
     parser.add_argument('--num-timesteps', type=int, default=int(1e6))
     return parser
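A brief sketch (not part of the commit) of how the two new helpers could be combined; argument values shown are the parser defaults.

```python
# Sketch only: build a monitored, goal-flattened robotics env from the new helpers.
from baselines.common.cmd_util import make_robotics_env, robotics_arg_parser

args = robotics_arg_parser().parse_args()           # --env defaults to FetchReach-v0
env = make_robotics_env(args.env, args.seed, rank=0)
obs = env.reset()                                   # flat vector: [observation, desired_goal]
```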

baselines/common/console_util.py  (+6, −1)

@@ -16,7 +16,12 @@ def fmt_item(x, l):
     if isinstance(x, np.ndarray):
         assert x.ndim==0
         x = x.item()
-    if isinstance(x, float): rep = "%g"%x
+    if isinstance(x, (float, np.float32, np.float64)):
+        v = abs(x)
+        if (v < 1e-4 or v > 1e+4) and v > 0:
+            rep = "%7.2e" % x
+        else:
+            rep = "%7.5f" % x
     else: rep = str(x)
     return " "*(l - len(rep)) + rep
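For illustration (worked examples, not from the commit): the new branching prints fixed-point output for moderate magnitudes and scientific notation for very small or very large values.

```python
# Sketch only: expected behaviour of the updated fmt_item().
from baselines.common.console_util import fmt_item

fmt_item(3.14159, 10)   # -> '   3.14159'  (moderate magnitude: "%7.5f")
fmt_item(1.2e-05, 10)   # -> '  1.20e-05'  (|x| < 1e-4: "%7.2e")
fmt_item(0.0, 10)       # -> '   0.00000'  (zero stays in fixed-point form)
```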

baselines/common/tf_util.py  (+17)

@@ -261,3 +261,20 @@ def get_placeholder_cached(name):
 
 def flattenallbut0(x):
     return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])])
+
+
+# ================================================================
+# Diagnostics
+# ================================================================
+
+def display_var_info(vars):
+    from baselines import logger
+    count_params = 0
+    for v in vars:
+        name = v.name
+        if "/Adam" in name or "beta1_power" in name or "beta2_power" in name: continue
+        count_params += np.prod(v.shape.as_list())
+        if "/b:" in name: continue    # Wx+b, bias is not interesting to look at => count params, but not print
+        logger.info("   %s%s%s" % (name, " "*(55-len(name)), str(v.shape)))
+    logger.info("Total model parameters: %0.1f million" % (count_params*1e-6))
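A minimal sketch (not part of the commit) of calling the new diagnostic; passing the trainable-variable collection is one common choice, not something the commit mandates.

```python
# Sketch only: print shapes and total parameter count of the trainable variables.
import tensorflow as tf
from baselines.common.tf_util import display_var_info

display_var_info(tf.trainable_variables())
```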

baselines/common/vec_env/dummy_vec_env.py  (+24, −13)

@@ -1,31 +1,42 @@
 import numpy as np
+import gym
 from . import VecEnv
 
 class DummyVecEnv(VecEnv):
     def __init__(self, env_fns):
         self.envs = [fn() for fn in env_fns]
-        env = self.envs[0]
+        env = self.envs[0]
         VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
-        self.ts = np.zeros(len(self.envs), dtype='int')
+
+        obs_spaces = self.observation_space.spaces if isinstance(self.observation_space, gym.spaces.Tuple) else (self.observation_space,)
+        self.buf_obs = [np.zeros((self.num_envs,) + tuple(s.shape), s.dtype) for s in obs_spaces]
+        self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool)
+        self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32)
+        self.buf_infos = [{} for _ in range(self.num_envs)]
         self.actions = None
 
     def step_async(self, actions):
         self.actions = actions
 
     def step_wait(self):
-        results = [env.step(a) for (a,env) in zip(self.actions, self.envs)]
-        obs, rews, dones, infos = map(np.array, zip(*results))
-        self.ts += 1
-        for (i, done) in enumerate(dones):
-            if done:
-                obs[i] = self.envs[i].reset()
-                self.ts[i] = 0
-        self.actions = None
-        return np.array(obs), np.array(rews), np.array(dones), infos
+        for i in range(self.num_envs):
+            obs_tuple, self.buf_rews[i], self.buf_dones[i], self.buf_infos[i] = self.envs[i].step(self.actions[i])
+            if isinstance(obs_tuple, (tuple, list)):
+                for t,x in enumerate(obs_tuple):
+                    self.buf_obs[t][i] = x
+            else:
+                self.buf_obs[0][i] = obs_tuple
+        return self.buf_obs, self.buf_rews, self.buf_dones, self.buf_infos
 
     def reset(self):
-        results = [env.reset() for env in self.envs]
-        return np.array(results)
+        for i in range(self.num_envs):
+            obs_tuple = self.envs[i].reset()
+            if isinstance(obs_tuple, (tuple, list)):
+                for t,x in enumerate(obs_tuple):
+                    self.buf_obs[t][i] = x
+            else:
+                self.buf_obs[0][i] = obs_tuple
+        return self.buf_obs
 
     def close(self):
         return
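A brief sketch (not part of the commit) of the reworked `DummyVecEnv` in use; the environment id is illustrative. With a `Tuple` observation space, `buf_obs` holds one array per sub-space; with a single space it is a one-element list.

```python
# Sketch only: stepping a DummyVecEnv; observations come back as a list of buffers.
import gym
import numpy as np
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

venv = DummyVecEnv([lambda: gym.make('CartPole-v0') for _ in range(4)])
obs = venv.reset()                       # list with one (4, obs_dim) array for a single space
venv.step_async(np.array([venv.action_space.sample() for _ in range(4)]))
obs, rews, dones, infos = venv.step_wait()
```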

baselines/her/README.md  (+35)

@@ -0,0 +1,35 @@
+# Hindsight Experience Replay
+For details on Hindsight Experience Replay (HER), please read the [paper](https://arxiv.org/pdf/1707.01495.pdf).
+
+## How to use Hindsight Experience Replay
+
+### Getting started
+Training an agent is very simple:
+```bash
+python -m baselines.her.experiment.train
+```
+This will train a DDPG+HER agent on the `FetchReach` environment.
+You should see the success rate go up quickly to `1.0`, which means that the agent achieves the
+desired goal in 100% of the cases.
+The training script logs other diagnostics as well and pickles the best policy so far (w.r.t. its test success rate),
+the latest policy, and, if enabled, a history of policies every K epochs.
+
+To inspect what the agent has learned, use the play script:
+```bash
+python -m baselines.her.experiment.play /path/to/an/experiment/policy_best.pkl
+```
+You can try it right now with the results of the training step (the script prints out the path for you).
+This should visualize the current policy for 10 episodes and will also print statistics.
+
+
+### Advanced usage
+The training script comes with advanced features like MPI support, which allows it to scale across all cores of a single machine.
+To see all available options, simply run this command:
+```bash
+python -m baselines.her.experiment.train --help
+```
+To run on, say, 20 CPU cores, you can use the following command:
+```bash
+python -m baselines.her.experiment.train --num_cpu 20
+```
+That's it, you are now running rollouts using 20 MPI workers and averaging gradients for network updates across all 20 cores.

baselines/her/__init__.py

Whitespace-only changes.

baselines/her/actor_critic.py  (+44)

@@ -0,0 +1,44 @@
+import tensorflow as tf
+from baselines.her.util import store_args, nn
+
+
+class ActorCritic:
+    @store_args
+    def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats, hidden, layers,
+                 **kwargs):
+        """The actor-critic network and related training code.
+
+        Args:
+            inputs_tf (dict of tensors): all necessary inputs for the network: the
+                observation (o), the goal (g), and the action (u)
+            dimo (int): the dimension of the observations
+            dimg (int): the dimension of the goals
+            dimu (int): the dimension of the actions
+            max_u (float): the maximum magnitude of actions; action outputs will be scaled
+                accordingly
+            o_stats (baselines.her.Normalizer): normalizer for observations
+            g_stats (baselines.her.Normalizer): normalizer for goals
+            hidden (int): number of hidden units that should be used in hidden layers
+            layers (int): number of hidden layers
+        """
+        self.o_tf = inputs_tf['o']
+        self.g_tf = inputs_tf['g']
+        self.u_tf = inputs_tf['u']
+
+        # Prepare inputs for actor and critic.
+        o = self.o_stats.normalize(self.o_tf)
+        g = self.g_stats.normalize(self.g_tf)
+        input_pi = tf.concat(axis=1, values=[o, g])  # for actor
+
+        # Networks.
+        with tf.variable_scope('pi'):
+            self.pi_tf = self.max_u * tf.tanh(nn(
+                input_pi, [self.hidden] * self.layers + [self.dimu]))
+        with tf.variable_scope('Q'):
+            # for policy training
+            input_Q = tf.concat(axis=1, values=[o, g, self.pi_tf / self.max_u])
+            self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1])
+            # for critic training
+            input_Q = tf.concat(axis=1, values=[o, g, self.u_tf / self.max_u])
+            self._input_Q = input_Q  # exposed for tests
+            self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True)
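The class depends on `store_args` and `nn` from `baselines.her.util`, which are outside this diff. As a rough sketch of the `nn` helper this code assumes (an MLP with ReLU hidden layers and a linear final layer; the committed implementation may differ in details):

```python
# Illustrative sketch only -- the actual baselines.her.util.nn may differ.
import tensorflow as tf

def nn(input, layers_sizes, reuse=None, flatten=False, name=""):
    """Fully-connected network: ReLU on hidden layers, linear final layer."""
    for i, size in enumerate(layers_sizes):
        activation = tf.nn.relu if i < len(layers_sizes) - 1 else None
        input = tf.layers.dense(input, size,
                                reuse=reuse,
                                name=name + '_' + str(i),
                                activation=activation)
    if flatten:
        assert layers_sizes[-1] == 1
        input = tf.reshape(input, [-1])
    return input
```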
