Commit

Working DDPG on MuJoCo continuous tasks along with parameters
yiliu77 committed Dec 6, 2018
1 parent e7a6a33 commit a8f83d7
Showing 19 changed files with 147 additions and 89 deletions.
4 changes: 2 additions & 2 deletions Policy_Divergence.ipynb
@@ -421,13 +421,13 @@
{
"ename": "TypeError",
"evalue": "__init__() got an unexpected keyword argument 'shape'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-8-a797723b0162>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0miniter\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtruncated_normal_initializer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstddev\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.01\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mW1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mVariable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloat32\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshape\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m4\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minitializer\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0miniter\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"weights_1\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0mb1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mVariable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloat32\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshape\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minitializer\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0miniter\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"weights_1\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mW2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mVariable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloat32\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshape\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minitializer\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0miniter\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"weights_1\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mTypeError\u001b[0m: __init__() got an unexpected keyword argument 'shape'"
]
],
"output_type": "error"
}
],
"source": [
10 changes: 5 additions & 5 deletions environments/mujoco/env.py
@@ -1,11 +1,11 @@
from environments.mujoco.tasks.define_task import Task
from gym.wrappers.time_limit import TimeLimit

envs = {"simple_task": TimeLimit(Task("simple_task.xml"), max_episode_steps=600),
"speed_task": TimeLimit(Task("simple_task.xml", speed_direction=1), max_episode_steps=600),
"slow_task": TimeLimit(Task("simple_task.xml", speed_direction=-1), max_episode_steps=1500),
"friction_task": TimeLimit(Task("friction_task.xml"), max_episode_steps=1500),
"away_task": TimeLimit(Task("friction_task.xml", reward_scale=-1), max_episode_steps=400)}
envs = {"simple_task": TimeLimit(Task("simple_task.xml"), max_episode_steps=1200),
"speed_task": TimeLimit(Task("large_target.xml", speed_direction=1), max_episode_steps=1200),
"slow_task": TimeLimit(Task("large_target.xml", speed_direction=-1), max_episode_steps=1500),
"away_task": TimeLimit(Task("simple_task.xml", reward_scale=-1), max_episode_steps=600),
}


def make(name):
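For reference, the body of make is cut off in this diff; a minimal sketch of what such a lookup helper typically does (the error message below is an assumption for illustration, not the repository's actual code):

def make(name):
    # Hypothetical sketch: return the pre-built, time-limited environment
    # registered under `name`; fail loudly on unknown task names.
    if name not in envs:
        raise KeyError("unknown mujoco task: " + name)
    return envs[name]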
19 changes: 13 additions & 6 deletions environments/mujoco/tasks/define_task.py
@@ -10,32 +10,38 @@ def __init__(self, model_file, speed_direction=False, reward_scale=1):
self.speed_direction = speed_direction
self.reward_scale = reward_scale

self.distance = 0.04

def step(self, a):
speed = a[0]
speed = a[0] * 0.002
rotation = a[1] * 0.02
if speed > 1:
speed = 1
if speed < -1:
speed = -1
theta = self.sim.data.qpos.flat[2]
action_vector = [speed * np.cos(theta), speed * np.sin(theta), a[1]]
action_vector = [speed * np.cos(theta), speed * np.sin(theta), rotation]

vec = self.get_body_com("agent") - self.get_body_com("target")
reward_dist = -self.reward_scale * np.linalg.norm(vec)
reward_dist = -4 * self.reward_scale * np.linalg.norm(vec)
reward = reward_dist
self.do_simulation(action_vector, self.frame_skip)
ob = self._get_obs()
done = False

if self.speed_direction and np.linalg.norm(vec) <= 0.02:
if self.speed_direction and np.linalg.norm(vec) <= self.distance:
done = True
reward = self.speed_direction * np.linalg.norm(self.sim.data.qvel.flat[:2])
reward = 28 * self.speed_direction * np.linalg.norm(self.sim.data.qvel.flat[:2])
return ob, reward, done, dict(reward_dist=reward_dist)

def viewer_setup(self):
self.viewer.cam.trackbodyid = 0

def reset_model(self):
qpos = self.np_random.uniform(low=-0.2, high=0.2, size=self.model.nq)
while np.linalg.norm(qpos[0:2] - qpos[3:5]) <= self.distance:
qpos = self.np_random.uniform(low=-0.2, high=0.2, size=self.model.nq)

qpos[2] = self.np_random.uniform(low=-3, high=3, size=1)
qvel = np.array([0, 0, 0, 0, 0])
self.set_state(qpos, qvel)
@@ -46,6 +52,7 @@ def _get_obs(self):
return np.concatenate([
[np.cos(theta)],
[np.sin(theta)],
self.sim.data.qvel.flat[:2],
self.sim.data.qvel.flat[:3],
self.get_body_com("agent")[0:2],
self.get_body_com("agent")[0:2] - self.get_body_com("target")[0:2]
])
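With the third velocity component and the agent position included, the observation returned by _get_obs has 1 + 1 + 3 + 2 + 2 = 9 entries, which is why state_dim changes from 6 to 9 in the DDPG script below. A standalone shape check with hypothetical values (numpy only, no MuJoCo simulator):

import numpy as np

theta = 0.3                                  # hypothetical heading
qvel = np.zeros(5)                           # agent_x, agent_y, viewer, target_x, target_y
agent_xy = np.array([0.1, -0.1])             # hypothetical agent position
target_xy = np.array([0.0, 0.0])             # hypothetical target position

obs = np.concatenate([
    [np.cos(theta)],
    [np.sin(theta)],
    qvel[:3],
    agent_xy,
    agent_xy - target_xy,
])
assert obs.shape == (9,)                     # matches state_dim = 9 in tests/ddpg-network.py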
@@ -14,8 +14,8 @@
<geom conaffinity="0" fromto="-.3 -.3 .01 -.3 .3 .01" name="sideW" rgba="0 0.5 1 1" size=".02" type="capsule"/>
<!-- Agent -->
<body name="agent" pos=".1 -.1 .01">
<joint armature="1000" damping="800" axis="1 0 0" limited="true" name="agent_x" pos="0 0 0" range="-.27 .27" ref=".1" type="slide"/>
<joint armature="1000" damping="800" axis="0 1 0" limited="true" name="agent_y" pos="0 0 0" range="-.27 .27" ref="-.1" type="slide"/>
<joint axis="1 0 0" limited="true" name="agent_x" pos="0 0 0" range="-.27 .27" ref=".1" type="slide"/>
<joint axis="0 1 0" limited="true" name="agent_y" pos="0 0 0" range="-.27 .27" ref="-.1" type="slide"/>
<geom conaffinity="0" contype="0" name="agent_body" pos="0 0 0" rgba="0 0.75 1 1" size="0.02 0.02 .0001" type="box"/>
<joint axis="0 0 1" limited="false" name="viewer" pos="0 0 0" type="hinge"/>
<body name="agent_eye" pos="0 0 0">
@@ -27,12 +27,12 @@
<body name="target" pos=".1 -.1 .01">
<joint armature="0" damping="0" stiffness="0" axis="1 0 0" limited="true" name="target_x" pos="0 0 0" range="-.27 .27" ref=".1" type="slide"/>
<joint armature="0" damping="0" stiffness="0" axis="0 1 0" limited="true" name="target_y" pos="0 0 0" range="-.27 .27" ref="-.1" type="slide"/>
<geom conaffinity="0" contype="0" name="target_body" pos="0 0 0" rgba="1 0 0 1" size=".01" type="sphere"/>
<geom conaffinity="0" contype="0" name="target_body" pos="0 0 0" rgba="0 1 0 0.3" size="0.04 .002" type="cylinder"/>
</body>
</worldbody>
<actuator>
<motor ctrllimited="true" ctrlrange="-1.0 1.0" gear="100.0" joint="agent_x"/>
<motor ctrllimited="true" ctrlrange="-1.0 1.0" gear="100.0" joint="agent_y"/>
<motor ctrllimited="true" ctrlrange="0 1.0" gear="200.0" joint="viewer"/>
<motor ctrllimited="true" ctrlrange="-1.0 1.0" gear="200.0" joint="viewer"/>
</actuator>
</mujoco>
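As a rough illustration (not MuJoCo's internal implementation): a motor actuator applies gear * ctrl to its joint, with ctrl clipped to ctrlrange when ctrllimited is true, so widening the viewer motor's ctrlrange from 0..1 to -1..1 lets the policy rotate the agent in both directions:

import numpy as np

def motor_output(ctrl, gear, ctrlrange):
    # Simplified model of a MuJoCo <motor>: clip the control signal to
    # ctrlrange, then scale it by gear before applying it to the joint.
    lo, hi = ctrlrange
    return gear * np.clip(ctrl, lo, hi)

print(motor_output(-0.5, 200.0, (0.0, 1.0)))    # 0.0    -- old viewer motor: no negative torque
print(motor_output(-0.5, 200.0, (-1.0, 1.0)))   # -100.0 -- new viewer motor: rotation both ways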
4 changes: 2 additions & 2 deletions environments/mujoco/tasks/simple_task.xml
@@ -14,8 +14,8 @@
<geom conaffinity="0" fromto="-.3 -.3 .01 -.3 .3 .01" name="sideW" rgba="0 0.5 1 1" size=".02" type="capsule"/>
<!-- Agent -->
<body name="agent" pos=".1 -.1 .01">
<joint armature="1000" axis="1 0 0" limited="true" name="agent_x" pos="0 0 0" range="-.27 .27" ref=".1" type="slide"/>
<joint armature="1000" axis="0 1 0" limited="true" name="agent_y" pos="0 0 0" range="-.27 .27" ref="-.1" type="slide"/>
<joint axis="1 0 0" limited="true" name="agent_x" pos="0 0 0" range="-.27 .27" ref=".1" type="slide"/>
<joint axis="0 1 0" limited="true" name="agent_y" pos="0 0 0" range="-.27 .27" ref="-.1" type="slide"/>
<geom conaffinity="0" contype="0" name="agent_body" pos="0 0 0" rgba="0 0.75 1 1" size="0.02 0.02 .0001" type="box"/>
<joint axis="0 0 1" limited="false" name="viewer" pos="0 0 0" type="hinge"/>
<body name="agent_eye" pos="0 0 0">
Binary file added tests/away_task/run-5.ckpt.data-00000-of-00001
Binary file added tests/away_task/run-5.ckpt.index
Binary file added tests/away_task/run-5.ckpt.meta
161 changes: 93 additions & 68 deletions tests/ddqn-network.py → tests/ddpg-network.py
@@ -1,11 +1,17 @@
import environments.mujoco.env as mujoco_env
import numpy as np
import tensorflow as tf
import os
import sys
import time

dir_path = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.dirname(dir_path))

import environments.mujoco.env as mujoco_env


class OrnsteinUhlenbeckActionNoise:
def __init__(self, mu, sigma=0.3, theta=.15, dt=1e-2, x0=None):
def __init__(self, mu, sigma=0.3, theta=0.15, dt=1e-2, x0=None):
self.theta = theta
self.mu = mu
self.sigma = sigma
@@ -143,12 +149,12 @@ def _create_network(self, scope):
actions = tf.placeholder(tf.float32, [None, self.action_size])
hidden1_layer = tf.layers.dense(inputs, self.n_hidden1, activation=tf.nn.relu)

# t_s = tf.layers.dense(hidden1_layer, self.n_hidden2)
# t_a = tf.layers.dense(actions, self.n_hidden2)
# hidden2_layer = t_s + t_a
# hidden2_layer = tf.nn.relu(hidden2_layer)
hidden2_layer = tf.layers.dense(tf.concat([hidden1_layer, actions], 1),
self.n_hidden2, activation=tf.nn.relu)
t_s = tf.layers.dense(hidden1_layer, self.n_hidden2)
t_a = tf.layers.dense(actions, self.n_hidden2)
hidden2_layer = tf.add(t_s, t_a)
hidden2_layer = tf.nn.relu(hidden2_layer)
# hidden2_layer = tf.layers.dense(tf.concat([hidden1_layer, actions], 1),
# self.n_hidden2, activation=tf.nn.relu)

outputs = tf.layers.dense(hidden2_layer, 1,
kernel_initializer=tf.initializers.random_uniform(minval=-0.003, maxval=0.003))
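The new critic merges the state branch and the action branch by adding two separate dense projections (t_s + t_a) before the ReLU; up to how the bias terms are counted, this is algebraically the same as the concatenation variant left commented out, since stacking the two weight matrices reproduces the sum. A small numpy check with hypothetical layer sizes (400 and 300 are assumptions, not the script's actual n_hidden values):

import numpy as np

rng = np.random.default_rng(0)
h = rng.normal(size=(1, 400))          # hypothetical hidden1 activations
a = rng.normal(size=(1, 2))            # action input (action_size = 2)
W_s = rng.normal(size=(400, 300))      # state-branch weights
W_a = rng.normal(size=(2, 300))        # action-branch weights

merged_add = h @ W_s + a @ W_a                                       # t_s + t_a, biases omitted
merged_cat = np.concatenate([h, a], axis=1) @ np.vstack([W_s, W_a])  # dense over the concatenation
assert np.allclose(merged_add, merged_cat)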
@@ -174,11 +180,12 @@ def get_action_grads(self, sess, inputs, actions):
})

def train(self, sess, inputs, actions, q_pred):
sess.run(self.train_op, feed_dict={
error, _ = sess.run([self.loss, self.train_op], feed_dict={
self.inputs: inputs,
self.actions: actions,
self.q_pred: q_pred
})
return error

def init_target(self, sess):
sess.run(self.target_init)
@@ -187,17 +194,23 @@ def update_target(self, sess):
sess.run(self.update_target_params)


env = mujoco_env.make('simple_task')
state_dim = 6
task = 'speed_task'
save_index = 11

env = mujoco_env.make(task)
state_dim = 9
action_dim = 2
action_bound = 1

# env = gym.make("Reacher-v2")
# state_dim = env.observation_space.shape[0]
# action_dim = env.action_space.shape[0]
# action_bound = env.action_space.high[0]
action_repeat = 4
n_per_render = 10
n_per_train = 1
n_episodes = 1200
batch_size = 64
gamma = 0.99
save_index = 0

memory = ReplayMemory(50000, batch_size)
actor = ActorNetwork(state_dim, action_dim, action_bound, batch_size, 0.00005, 0.001)
@@ -208,73 +221,85 @@ def update_target(self, sess):
tf.global_variables_initializer().run()
saver = tf.train.Saver()

checkpoint = "./run-" + str(save_index) + ".ckpt"
checkpoint = task + "/run-" + str(save_index) + ".ckpt"
if os.path.isfile(checkpoint + ".meta"):
saver.restore(session, checkpoint)
elif save_index != 0:
raise Exception("Session data not found!!")

actor.init_target(session)
critic.init_target(session)

for episode in range(n_episodes):
if episode != 0 and episode % 100 == 0:
saver.save(session, "./run-" + str(save_index + 1) + ".ckpt")
save_index += 1

obs = env.reset()
total_reward = 0
episode_length = 0

while True:
if episode % n_per_render == 0:
env.render()

if episode > 20:
action = actor.get_action(session, obs[np.newaxis, :])[0] + actor_noise()
# print(action)
else:
action = np.random.uniform(low=-1, high=1, size=2)

new_obs, reward, done, info = env.step(action)
total_reward += reward
episode_length += 1

memory.append([obs, action, reward, new_obs, done])
# print(reward)
obs = new_obs

if episode_length % n_per_train == 0 and episode > 20:
mem_s, mem_a, mem_r, mem_s_, mem_done = memory.sample()
target_actions = actor.get_target_action(session, mem_s_)
target_q = critic.get_target_value(session, mem_s_, target_actions)

q_y = []
for i in range(len(target_q)):
if mem_done[i]:
q_y.append([mem_r[i]])
else:
q_y.append(target_q[i] * gamma + mem_r[i])
critic.train(session, mem_s, mem_a, q_y)

actor_pred = actor.get_action(session, mem_s)
critic_grads = critic.get_action_grads(session, mem_s, actor_pred)[0]

actor.train(session, mem_s, critic_grads)
actor.update_target(session)
critic.update_target(session)

if done:
print(episode, episode_length, total_reward, total_reward / episode_length)
break
# actor.init_target(session)
# critic.init_target(session)
#
# for episode in range(n_episodes):
# if episode != 0 and episode % 300 == 0:
# saver.save(session, "./run-" + str(save_index + 1) + ".ckpt")
# save_index += 1
#
# obs = env.reset()
# total_reward = 0
# episode_length = 0
# total_critic_error = 0
#
# while True:
# # if episode % n_per_render == 0:
# # env.render()
#
# if episode > 20:
# action = actor.get_action(session, obs[np.newaxis, :])[0] + actor_noise()
# # print(action)
# else:
# action = np.random.uniform(low=-1, high=1, size=2)
#
# for i in range(action_repeat):
# new_obs, reward, done, info = env.step(action)
# if done:
# break
#
# total_reward += reward
# episode_length += 1
#
# memory.append([obs, action, reward, new_obs, done])
# # print(reward)
# obs = new_obs
#
# if episode_length % n_per_train == 0 and episode > 20:
# mem_s, mem_a, mem_r, mem_s_, mem_done = memory.sample()
# target_actions = actor.get_target_action(session, mem_s_)
# target_q = critic.get_target_value(session, mem_s_, target_actions)
#
# q_y = []
# for i in range(len(target_q)):
# if mem_done[i]:
# q_y.append([mem_r[i]])
# else:
# q_y.append(target_q[i] * gamma + mem_r[i])
# total_critic_error += critic.train(session, mem_s, mem_a, q_y)
#
# actor_pred = actor.get_action(session, mem_s)
# critic_grads = critic.get_action_grads(session, mem_s, actor_pred)[0]
#
# actor.train(session, mem_s, critic_grads)
# actor.update_target(session)
# critic.update_target(session)
#
# if done:
# print(episode, episode_length, total_reward / episode_length, total_critic_error / episode_length)
# break

while True:
obs = env.reset()
len_eps = 0
while True:
env.render()
action = actor.get_action(session, obs[np.newaxis, :])[0]
new_obs, reward, done, info = env.step(action)
# print(28 * np.linalg.norm(new_obs[2:4]))
len_eps += 1
print(reward)
# print(obs)
# print(critic.get_value(session, obs[np.newaxis, :], action[np.newaxis, :]), reward)
# print(action)
obs = new_obs

if done:
if done or len_eps > 800:
break
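For reference, the q_y loop in the (now commented-out) training code builds the standard DDPG bootstrap target: y_i = r_i on terminal transitions and y_i = r_i + gamma * Q'(s'_i, mu'(s'_i)) otherwise. A vectorised numpy sketch of the same computation:

import numpy as np

def ddpg_targets(rewards, target_q, done, gamma=0.99):
    # Equivalent to the q_y loop above: drop the bootstrap term on
    # transitions that ended the episode, otherwise add the discounted
    # target-critic value to the immediate reward.
    rewards = np.asarray(rewards, dtype=np.float32).reshape(-1, 1)
    target_q = np.asarray(target_q, dtype=np.float32).reshape(-1, 1)
    done = np.asarray(done, dtype=bool).reshape(-1, 1)
    return np.where(done, rewards, rewards + gamma * target_q)

# Usage with a sampled minibatch would look like:
#   q_y = ddpg_targets(mem_r, target_q, mem_done)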
Binary file added tests/simple_task/run-5.ckpt.data-00000-of-00001
Binary file added tests/simple_task/run-5.ckpt.index
Binary file added tests/simple_task/run-5.ckpt.meta
Binary file added tests/slow_task/run-7.ckpt.data-00000-of-00001
Binary file added tests/slow_task/run-7.ckpt.index
Binary file added tests/slow_task/run-7.ckpt.meta
Binary file added tests/speed_task/run-11.ckpt.data-00000-of-00001
Binary file added tests/speed_task/run-11.ckpt.index
Binary file added tests/speed_task/run-11.ckpt.meta
30 changes: 28 additions & 2 deletions tests/test-mujoco-env.py
@@ -1,9 +1,35 @@
import environments.mujoco.env as mujoco_env
import numpy as np


class OrnsteinUhlenbeckActionNoise:
def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2, x0=None):
self.theta = theta
self.mu = mu
self.sigma = sigma
self.dt = dt
self.x0 = x0
self.reset()

def __call__(self):
x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \
self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
self.x_prev = x
return x

def reset(self):
self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

def __repr__(self):
return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)

noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(2))
env = mujoco_env.make('speed_task')
obs = env.reset()
while True:
env.render()
obs, _, _, _ = env.step([1, 0.001])
print(obs)
a = noise()
obs, reward, _, _ = env.step([1, 1])
print(reward)
# print(obs)
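The Ornstein-Uhlenbeck process above generates temporally correlated noise that decays back toward mu (zero here), the usual exploration noise for DDPG's deterministic policy. A small usage sketch reusing the class defined in this file:

noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(2))
samples = np.array([noise() for _ in range(5)])
# Successive rows drift by small, correlated steps instead of jumping
# independently, which gives smoother exploration of the action space.
print(samples)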
