Add custom IMPORT_USER_MODULES to user_config.py #99

Open
wants to merge 14 commits into base: master
reset EOL spaces for diff clarity
Abe Fetterman committed Nov 20, 2018
commit 06b799451117dd8c12a134f3d784de6369501c7f
14 changes: 7 additions & 7 deletions spinup/exercises/problem_set_1/exercise1_2.py
@@ -6,7 +6,7 @@

Exercise 1.2: PPO Gaussian Policy

Implement an MLP diagonal Gaussian policy for PPO.

Log-likelihoods will be computed using your answer to Exercise 1.1,
so make sure to complete that exercise before beginning this one.
@@ -62,13 +62,13 @@ def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, actio
environment this agent will interact with.

Returns:
pi: A symbol for sampling stochastic actions from a Gaussian
distribution.

logp: A symbol for computing log-likelihoods of actions from a Gaussian
distribution.

logp_pi: A symbol for computing log-likelihoods of actions in pi from a
Gaussian distribution.

"""
@@ -77,9 +77,9 @@ def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, actio
# YOUR CODE HERE #
# #
#######################
# mu =
# log_std =
# pi =
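# One possible way to fill in the blanks above -- a sketch, not the official
# solution. It assumes numpy/tensorflow are imported as np/tf at the top of
# this file, and uses a small MLP helper (the exercise file may already
# provide an equivalent):
#
# def mlp(x, hidden_sizes, activation, output_activation=None):
#     for h in hidden_sizes[:-1]:
#         x = tf.layers.dense(x, units=h, activation=activation)
#     return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation)
#
# act_dim = a.shape.as_list()[-1]
# # State-dependent mean action from an MLP
# mu = mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation)
# # State-independent, trainable log standard deviations
# log_std = tf.get_variable('log_std', initializer=-0.5*np.ones(act_dim, dtype=np.float32))
# # Reparameterized sample: mean plus Gaussian noise scaled by exp(log_std)
# pi = mu + tf.random_normal(tf.shape(mu)) * tf.exp(log_std)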

logp = exercise1_1.gaussian_likelihood(a, mu, log_std)
logp_pi = exercise1_1.gaussian_likelihood(pi, mu, log_std)
72 changes: 36 additions & 36 deletions spinup/exercises/problem_set_1/exercise1_3.py
@@ -2,12 +2,12 @@
import tensorflow as tf
import gym
import time
from spinup.algos.td3 import core
from spinup.algos.td3.td3 import td3 as true_td3
from spinup.algos.td3.core import get_vars
from spinup.utils.logx import EpochLogger
from spinup.user_config import HALFCHEETAH_ENV
import pybullet_envs

"""

@@ -18,7 +18,7 @@
As starter code, you are given the entirety of the TD3 algorithm except
for the computation graph. Find "YOUR CODE HERE" to begin.

To clarify: you will not write an "actor_critic" function for this
exercise. But you will use one to build the graph for computing the
TD3 updates.

@@ -59,43 +59,43 @@ def sample_batch(self, batch_size=32):
TD3 (Twin Delayed DDPG)

"""
def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99,
polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000,
act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2,
max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
"""

Args:
env_fn : A function which creates a copy of the environment.
The environment must satisfy the OpenAI Gym API.

actor_critic: A function which takes in placeholder symbols
for state, ``x_ph``, and action, ``a_ph``, and returns the main
outputs from the agent's Tensorflow computation graph:

=========== ================ ======================================
Symbol Shape Description
=========== ================ ======================================
``pi`` (batch, act_dim) | Deterministically computes actions
| from policy given states.
``q1`` (batch,) | Gives one estimate of Q* for
| states in ``x_ph`` and actions in
| ``a_ph``.
``q2`` (batch,) | Gives another estimate of Q* for
| states in ``x_ph`` and actions in
| ``a_ph``.
``q1_pi`` (batch,) | Gives the composition of ``q1`` and
| ``pi`` for states in ``x_ph``:
| q1(x, pi(x)).
=========== ================ ======================================

ac_kwargs (dict): Any kwargs appropriate for the actor_critic
function you provided to TD3.

seed (int): Seed for random number generators.

steps_per_epoch (int): Number of steps of interaction (state-action pairs)
for the agent and the environment in each epoch.

epochs (int): Number of epochs to run and train agent.
@@ -104,14 +104,14 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,

gamma (float): Discount factor. (Always between 0 and 1.)

polyak (float): Interpolation factor in polyak averaging for target
networks. Target networks are updated towards main networks
according to:

.. math:: \\theta_{\\text{targ}} \\leftarrow
\\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

where :math:`\\rho` is polyak. (Always between 0 and 1, usually
close to 1.)

pi_lr (float): Learning rate for policy.
@@ -123,16 +123,16 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
start_steps (int): Number of steps for uniform-random action selection,
before running real policy. Helps exploration.

act_noise (float): Stddev for Gaussian exploration noise added to
policy at training time. (At test time, no noise is added.)

target_noise (float): Stddev for smoothing noise added to target
policy.

noise_clip (float): Limit for absolute value of target policy
smoothing noise.

policy_delay (int): Policy will only be updated once every
policy_delay times for each update of the Q-networks.

max_ep_len (int): Maximum length of trajectory / episode / rollout.
@@ -176,9 +176,9 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
# YOUR CODE HERE #
# #
#######################
# pi, q1, q2, q1_pi =
pass
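# A sketch of what goes here, mirroring how the same graph is built in
# exercise2_3.py further down in this diff:
# pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs)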

# Target policy network
with tf.variable_scope('target'):
#######################
@@ -188,7 +188,7 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
#######################
# pi_targ =
pass
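# Likewise, the target-policy blank matches the corresponding line in
# exercise2_3.py below:
# pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs)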

# Target Q networks
with tf.variable_scope('target', reuse=True):
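# (Body hidden in this hunk.) In the Spinning Up TD3 starter code this scope
# roughly does target-policy smoothing and computes the target Q-values:
# eps = tf.random_normal(tf.shape(pi_targ), stddev=target_noise)
# eps = tf.clip_by_value(eps, -noise_clip, noise_clip)
# a2 = tf.clip_by_value(pi_targ + eps, -act_limit, act_limit)
# _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs)
# (act_limit and x2_ph are defined earlier in the full file.)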

@@ -227,10 +227,10 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
# YOUR CODE HERE #
# #
#######################
# pi_loss =
# q1_loss =
# q2_loss =
# q_loss =
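# One way the loss blanks could be filled in -- a sketch, not the official
# solution. It uses the target Q-values sketched above (q1_targ, q2_targ)
# plus the reward/done placeholders (named r_ph and d_ph in the Spinning Up
# starter code):
# min_q_targ = tf.minimum(q1_targ, q2_targ)
# backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ)
# pi_loss = -tf.reduce_mean(q1_pi)
# q1_loss = tf.reduce_mean((q1 - backup)**2)
# q2_loss = tf.reduce_mean((q2 - backup)**2)
# q_loss = q1_loss + q2_loss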

#=========================================================================#
# #
@@ -283,8 +283,8 @@ def test_agent(n=10):

"""
Until start_steps have elapsed, randomly sample actions
from a uniform distribution for better exploration. Afterwards,
use the learned policy (with some noise, via act_noise).
"""
if t > start_steps:
a = get_action(o, act_noise)
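# For reference, get_action in the full starter code looks roughly like this
# (sess, pi, x_ph, act_dim and act_limit are defined elsewhere in the file):
# def get_action(o, noise_scale):
#     a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
#     a += noise_scale * np.random.randn(act_dim)
#     return np.clip(a, -act_limit, act_limit)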
@@ -304,7 +304,7 @@ def test_agent(n=10):
# Store experience to replay buffer
replay_buffer.store(o, a, r, o2, d)

# Super critical, easy to overlook step: make sure to update
# most recent observation!
o = o2

@@ -372,15 +372,15 @@ def test_agent(n=10):
logger_kwargs = setup_logger_kwargs(args.exp_name + '-' + args.env.lower(), args.seed)

all_kwargs = dict(
env_fn=lambda : gym.make(args.env),
actor_critic=core.mlp_actor_critic,
ac_kwargs=dict(hidden_sizes=[128,128]),
max_ep_len=150,
seed=args.seed,
logger_kwargs=logger_kwargs,
epochs=10
)

if args.use_soln:
true_td3(**all_kwargs)
else:
57 changes: 28 additions & 29 deletions spinup/exercises/problem_set_2/exercise2_3.py
@@ -9,57 +9,56 @@
from spinup.utils.run_utils import ExperimentGrid
from spinup.user_config import HALFCHEETAH_ENV


"""

Exercise 2.3: Details Matter

In this exercise, you will run TD3 with a tiny implementation difference,
pertaining to how target actions are calculated. Your goal is to determine
whether or not there is any change in performance, and if so, explain why.

You do NOT need to write code for this exercise.

"""

def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=0.99,
polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000,
act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2,
max_ep_len=1000, logger_kwargs=dict(), save_freq=1,
remove_action_clip=False):
"""

Args:
env_fn : A function which creates a copy of the environment.
The environment must satisfy the OpenAI Gym API.

actor_critic: A function which takes in placeholder symbols
for state, ``x_ph``, and action, ``a_ph``, and returns the main
outputs from the agent's Tensorflow computation graph:

=========== ================ ======================================
Symbol Shape Description
=========== ================ ======================================
``pi`` (batch, act_dim) | Deterministically computes actions
| from policy given states.
``q1`` (batch,) | Gives one estimate of Q* for
| states in ``x_ph`` and actions in
| ``a_ph``.
``q2`` (batch,) | Gives another estimate of Q* for
| states in ``x_ph`` and actions in
| ``a_ph``.
``q1_pi`` (batch,) | Gives the composition of ``q1`` and
| ``pi`` for states in ``x_ph``:
| q1(x, pi(x)).
=========== ================ ======================================

ac_kwargs (dict): Any kwargs appropriate for the actor_critic
function you provided to TD3.

seed (int): Seed for random number generators.

steps_per_epoch (int): Number of steps of interaction (state-action pairs)
for the agent and the environment in each epoch.

epochs (int): Number of epochs to run and train agent.
@@ -68,14 +67,14 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,

gamma (float): Discount factor. (Always between 0 and 1.)

polyak (float): Interpolation factor in polyak averaging for target
networks. Target networks are updated towards main networks
according to:

.. math:: \\theta_{\\text{targ}} \\leftarrow
\\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

where :math:`\\rho` is polyak. (Always between 0 and 1, usually
close to 1.)

pi_lr (float): Learning rate for policy.
@@ -87,16 +86,16 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
start_steps (int): Number of steps for uniform-random action selection,
before running real policy. Helps exploration.

act_noise (float): Stddev for Gaussian exploration noise added to
policy at training time. (At test time, no noise is added.)

target_noise (float): Stddev for smoothing noise added to target
policy.

noise_clip (float): Limit for absolute value of target policy
smoothing noise.

policy_delay (int): Policy will only be updated once every
policy_delay times for each update of the Q-networks.

max_ep_len (int): Maximum length of trajectory / episode / rollout.
@@ -133,11 +132,11 @@ def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
# Main outputs from computation graph
with tf.variable_scope('main'):
pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

# Target policy network
with tf.variable_scope('target'):
pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs)

# Target Q networks
with tf.variable_scope('target', reuse=True):

@@ -213,8 +212,8 @@ def test_agent(n=10):

"""
Until start_steps have elapsed, randomly sample actions
from a uniform distribution for better exploration. Afterwards,
use the learned policy (with some noise, via act_noise).
"""
if t > start_steps:
a = get_action(o, act_noise)
@@ -234,7 +233,7 @@ def test_agent(n=10):
# Store experience to replay buffer
replay_buffer.store(o, a, r, o2, d)

# Super critical, easy to overlook step: make sure to update
# most recent observation!
o = o2

@@ -301,7 +300,7 @@ def test_agent(n=10):
args = parser.parse_args()

def td3_with_actor_critic(**kwargs):
td3(ac_kwargs=dict(hidden_sizes=[args.h]*args.l),
start_steps=5000,
max_ep_len=150,
batch_size=64,