|
| 1 | +import argparse |
| 2 | +import gym |
| 3 | +from gym import wrappers |
| 4 | +import os.path as osp |
| 5 | +import random |
| 6 | +import numpy as np |
| 7 | +import tensorflow as tf |
| 8 | +import tensorflow.contrib.layers as layers |
| 9 | + |
| 10 | +import dqn |
| 11 | +from dqn_utils import * |
| 12 | +from atari_wrappers import * |
| 13 | + |
| 14 | + |
def atari_model(img_in, num_actions, scope, reuse=False):
    """Build the convolutional Q-network and return its output tensor.

    The convolutional stack follows the DQN Nature paper:
    https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf
    It is followed by three 512-unit ReLU fully connected layers and a
    linear output layer with one unit per action.

    Args:
        img_in: input tensor fed to the first convolution
            (presumably a batch of stacked frames -- confirm against caller).
        num_actions: number of units in the final linear layer.
        scope: name of the enclosing variable scope.
        reuse: forwarded to ``tf.variable_scope`` for the enclosing scope.

    Returns:
        The output of the final fully connected layer (no activation).
    """
    # (num_outputs, kernel_size, stride) for each convolution, in order.
    conv_specs = (
        (32, 8, 4),
        (64, 4, 2),
        (64, 3, 1),
    )
    hidden_layer_count = 3
    hidden_units = 512

    with tf.variable_scope(scope, reuse=reuse):
        features = img_in
        with tf.variable_scope("convnet"):
            # original architecture
            for filters, kernel, step in conv_specs:
                features = layers.convolution2d(
                    features,
                    num_outputs=filters,
                    kernel_size=kernel,
                    stride=step,
                    activation_fn=tf.nn.relu,
                )
        features = layers.flatten(features)
        with tf.variable_scope("action_value"):
            for _ in range(hidden_layer_count):
                features = layers.fully_connected(
                    features, num_outputs=hidden_units, activation_fn=tf.nn.relu)
            q_values = layers.fully_connected(
                features, num_outputs=num_actions, activation_fn=None)

        return q_values
| 32 | + |
def atari_learn(env,
                session,
                num_timesteps):
    """Configure the DQN hyper-parameters and run training on ``env``.

    The environment is always closed before returning, including when
    training raises, so the environment is not left open on failure.

    Args:
        env: wrapped environment; it must contain a wrapper named
            "Monitor", which the stopping criterion queries for the total
            step count.
        session: TensorFlow session handed to ``dqn.learn``.
        num_timesteps: training stops once the Monitor wrapper reports at
            least this many total steps.
    """
    # This is just a rough estimate
    num_iterations = float(num_timesteps) / 4.0

    lr_multiplier = 1.0
    lr_schedule = PiecewiseSchedule([
        (0, 1e-4 * lr_multiplier),
        (num_iterations / 10, 1e-4 * lr_multiplier),
        (num_iterations / 2, 5e-5 * lr_multiplier),
    ],
        outside_value=5e-5 * lr_multiplier)
    optimizer = dqn.OptimizerSpec(
        constructor=tf.train.AdamOptimizer,
        kwargs=dict(epsilon=1e-4),
        lr_schedule=lr_schedule
    )

    def stopping_criterion(env, t):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    # NOTE(review): the fixed 1e6 endpoint only precedes num_iterations / 2
    # when num_timesteps >= 8e6; for shorter runs the endpoints are out of
    # order -- confirm PiecewiseSchedule tolerates that before lowering
    # num_timesteps.
    exploration_schedule = PiecewiseSchedule(
        [
            (0, 1.0),
            (1e6, 0.1),
            (num_iterations / 2, 0.01),
        ], outside_value=0.01
    )

    try:
        dqn.learn(
            env=env,
            q_func=atari_model,
            optimizer_spec=optimizer,
            session=session,
            exploration=exploration_schedule,
            stopping_criterion=stopping_criterion,
            replay_buffer_size=1000000,
            batch_size=32,
            gamma=0.99,
            learning_starts=50000,
            learning_freq=4,
            frame_history_len=4,
            target_update_freq=10000,
            grad_norm_clipping=10,
            double_q=True
        )
    finally:
        # Close the environment even if training aborts part-way.
        env.close()
| 83 | + |
def get_available_gpus():
    """Return the physical device description of every local GPU device."""
    from tensorflow.python.client import device_lib

    gpu_descriptions = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_descriptions.append(device.physical_device_desc)
    return gpu_descriptions
| 88 | + |
| 89 | +def set_global_seeds(i): |
| 90 | + try: |
| 91 | + import tensorflow as tf |
| 92 | + except ImportError: |
| 93 | + pass |
| 94 | + else: |
| 95 | + tf.set_random_seed(i) |
| 96 | + np.random.seed(i) |
| 97 | + random.seed(i) |
| 98 | + |
def get_session():
    """Reset the default graph and return a new single-threaded session.

    Both the inter-op and intra-op thread pools are limited to one thread.
    The available GPU descriptions are printed as a side effect.
    """
    tf.reset_default_graph()
    single_thread_config = tf.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1,
    )
    sess = tf.Session(config=single_thread_config)
    print("AVAILABLE GPUS: ", get_available_gpus())
    return sess
| 107 | + |
def get_env(task, seed):
    """Create, seed and wrap the Atari environment used for training.

    Args:
        task: environment id string passed to ``gym.make``. Any non-string
            value is ignored and the default 'PongNoFrameskip-v4' is used,
            which matches the previous behaviour of this function.
        seed: seed applied to the global random generators and to the
            environment.

    Returns:
        The environment wrapped in a gym ``Monitor`` (writing under
        '/tmp/hw3_vid_dir2/gym', overwriting previous output) and then in
        ``wrap_deepmind``.
    """
    default_env_id = 'PongNoFrameskip-v4'
    env_id = task if isinstance(task, str) else default_env_id
    env = gym.make(env_id)

    set_global_seeds(seed)
    env.seed(seed)

    expt_dir = '/tmp/hw3_vid_dir2/'
    env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True)
    env = wrap_deepmind(env)

    return env
| 119 | + |
def main():
    """Pick a random seed, build the Pong environment and train DQN on it."""
    # Only the environment id is passed along: get_env builds the
    # environment itself, so creating one here would just allocate an
    # emulator instance that is never used or closed.
    task = 'PongNoFrameskip-v4'

    # Run training
    seed = random.randint(0, 9999)
    print('random seed = %d' % seed)
    env = get_env(task, seed)
    session = get_session()
    atari_learn(env, session, num_timesteps=8e6)
| 130 | + |
# Script entry point: start training only when executed directly.
if __name__ == "__main__":
    main()
0 commit comments