diff --git a/Amazon GPU howto.md b/Amazon GPU howto.md index a24c1c82d..21fd21924 100644 --- a/Amazon GPU howto.md +++ b/Amazon GPU howto.md @@ -1,17 +1,17 @@ -# How to set up GPU on EC2 instance +**Warning: this guide has not been updated since we migrated away from Theano/Lasagne, which happened a long time ago.** -## Create EC2 instance +# How to set up an Amazon EC2 GPU instance + +## Create an instance Use `p2.xlarge` instance type and `ami-e00a8180` AMI image. [Details](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html) -Open ports `22` (ssh) and `80` (http) on your freshly created instance, -you create a [security group](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-network-security.html) -and attach it your instance to get ports open +Open ports `22` (ssh) and `80` (http) on your freshly created instance, then create a [security group](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-network-security.html) and attach it to your instance to get the ports open. 
## Launch notebook -Instance you have created contains all you need: fresh versions of theano, lasagne, CUDA driver and cuDNN, -just lunch ipython and get hands dirty: +The instance you have created contains all you need: fresh versions of theano, lasagne, CUDA driver and cuDNN, +just launch Jupyter and get your hands dirty: ```bash $ sudo su @@ -19,4 +19,3 @@ $ export THEANO_FLAGS='cuda.root=/usr/local/cuda,device=gpu,floatX=float32' $ export PATH=/usr/local/cuda-8.0/bin${PATH:+:${PATH}} $ jupyter notebook ``` - diff --git a/README.md b/README.md index 90d17b2c9..ea1453fd4 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,6 @@ Taught on-campus at [HSE](https://cs.hse.ru) and [YSDA](https://yandexdataschool [![Github contributors](https://img.shields.io/github/contributors/yandexdataschool/Practical_RL.svg?logo=github&logoColor=white)](https://github.com/yandexdataschool/Practical_RL/graphs/contributors) # Course info -* __Chat room__ for YSDA & HSE students is [here](http://t.me/practical_rl_2020) -* __Grading__ rules for YSDA & HSE students is [here](https://github.com/yandexdataschool/Practical_RL/wiki/Homeworks-and-grading) * __FAQ:__ [About the course](https://github.com/yandexdataschool/Practical_RL/wiki/Practical-RL), [Technical issues thread](https://github.com/yandexdataschool/Practical_RL/issues/1), [Lecture Slides](https://yadi.sk/d/loPpY45J3EAYfU), [Online Student Survival Guide](https://github.com/yandexdataschool/Practical_RL/wiki/Online-student's-survival-guide) @@ -65,7 +63,7 @@ The syllabus is approximate: the lectures may occur in a slightly different orde * Seminar: REINFORCE, advantage actor-critic * [__week07_seq2seq__](./week07_seq2seq) Reinforcement Learning for Sequence Models - * Lecture: Problems with sequential data. Recurrent neural netowks. Backprop through time. Vanishing & exploding gradients. LSTM, GRU. Gradient clipping + * Lecture: Problems with sequential data. Recurrent neural networks. Backprop through time. 
Vanishing & exploding gradients. LSTM, GRU. Gradient clipping * Seminar: character-level RNN language model * [__week08_pomdp__](./week08_pomdp) Partially Observed MDP @@ -86,7 +84,7 @@ The syllabus is approximate: the lectures may occur in a slightly different orde # Course staff Course materials and teaching by: _[unordered]_ -- [Pavel Shvechikov](https://github.com/bestxolodec) - lectures, seminars, hw checkups, reading group +- [Pavel Shvechikov](https://github.com/pshvechikov) - lectures, seminars, hw checkups, reading group - [Nikita Putintsev](https://github.com/qwasser) - seminars, hw checkups, organizing our hot mess - [Alexander Fritsler](https://github.com/Fritz449) - lectures, seminars, hw checkups - [Oleg Vasilev](https://github.com/Omrigan) - seminars, hw checkups, technical support diff --git a/docker/README.md b/docker/README.md index e2e354bb8..e9eab249b 100644 --- a/docker/README.md +++ b/docker/README.md @@ -7,7 +7,7 @@ _tl;dr [dockerhub url](https://hub.docker.com/r/justheuristic/practical_rl/)_ We recommend you to use either native docker (recommended for linux) or kitematic(recommended for windows). * Installing [kitematic](https://kitematic.com/), a simple interface to docker (all platforms) * Pure docker: Guide for [windows](https://docs.docker.com/docker-for-windows/), [linux](https://docs.docker.com/engine/installation/), or [macOS](https://docs.docker.com/docker-for-mac/). -* If you want to use your GPU make sure you have [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) and [NVidia driver](https://www.nvidia.com/en-us/drivers/unix/) + [CUDA 10.2](https://developer.nvidia.com/cuda-downloads) installed +* If you want to use your GPU make sure you have [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) and [NVidia driver](https://www.nvidia.com/en-us/drivers/unix/) + [CUDA 10.2](https://developer.nvidia.com/cuda-downloads) installed Below are the instructions for both approaches. 
diff --git a/setup_colab.sh b/setup_colab.sh index 5e1db94ee..c9b812bf9 100755 --- a/setup_colab.sh +++ b/setup_colab.sh @@ -2,4 +2,16 @@ apt-get -qq update apt-get -qq install -y xvfb -wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/xvfb -O ../xvfb +wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/xvfb -O ../xvfb + +# Download & import Atari ROMs (Colab stopped bundling them around the beginning of June 2021) + +gdown -q https://drive.google.com/uc?id=1c6_W2Fig92hm5FRIc2Mpc_ZZyr6o52lF + +# Alternative download: +# wget -q http://www.atarimania.com/roms/Roms.rar + +pip install -q unrar +mkdir ./roms +unrar x Roms.rar ./roms > /dev/null 2>&1 +python -m atari_py.import_roms ./roms > /dev/null 2>&1 diff --git a/week01_intro/README.md b/week01_intro/README.md index 6120dbe34..0c81dc013 100644 --- a/week01_intro/README.md +++ b/week01_intro/README.md @@ -11,7 +11,7 @@ ## More materials: -* __[recommended]__ - awesome openai post about evolution strategies - [blog post](https://blog.openai.com/evolution-strategies/), [article](https://arxiv.org/abs/1703.03864) +* __[recommended]__ - awesome openai post about evolution strategies - [blog post](https://openai.com/research/evolution-strategies), [article](https://arxiv.org/abs/1703.03864) * __[recommended]__ - formal explanation of crossentropy method in [general](https://people.smp.uq.edu.au/DirkKroese/ps/CEEncycl.pdf) and for [optimization](https://people.smp.uq.edu.au/DirkKroese/ps/CEopt.pdf) * Deep learning course (if you want to learn in parallel) - https://github.com/yandexdataschool/HSE_deeplearning * Video on genetic algorithms (english) - [video](https://www.youtube.com/watch?v=ejxfTy4lI6I) @@ -23,10 +23,10 @@ ## Practice assignment -Instant dive in: [__seminar_gym_interface__](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/master/week01_intro/seminar_gym_interface.ipynb), 
[__crossentropy_method__](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/master/week01_intro/crossentropy_method.ipynb), +Instant dive in: [__seminar_gymnasium_interface__](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/master/week01_intro/seminar_gymnasium_interface.ipynb), [__crossentropy_method__](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/master/week01_intro/crossentropy_method.ipynb), [__deep_crossentropy_method__](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/master/week01_intro/deep_crossentropy_method.ipynb) -* Open `gym_interface.ipynb` and follow instructions from there +* Open `seminar_gymnasium_interface.ipynb` and follow instructions from there * After you're done there, proceed to `crossentropy_method.ipynb` * You can find homework and bonus assignment descriptions at the end of that notebook. -* Note: so far it's enough to say `pip install gym` on top of any data-science-stuffed python, but we'd appreciate if you gradually switch to [full installation](https://github.com/openai/gym#installing-everything). +* Note: so far it's enough to say `pip install gymnasium` on top of any data-science-stuffed python, but we'd appreciate it if you gradually switch to [full installation](https://github.com/Farama-Foundation/Gymnasium). diff --git a/week01_intro/crossentropy_method.ipynb b/week01_intro/crossentropy_method.ipynb index 16e2a5f2b..40ae90548 100644 --- a/week01_intro/crossentropy_method.ipynb +++ b/week01_intro/crossentropy_method.ipynb @@ -1,422 +1,508 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Crossentropy method\n", - "\n", - "This notebook will teach you to solve reinforcement learning problems with crossentropy method. We'll follow-up by scaling everything up and using neural network policy." 
- ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "7XGyc-FCG35I" + }, + "source": [ + "# Crossentropy method\n", + "\n", + "This notebook will teach you to solve reinforcement learning problems with crossentropy method. We'll follow-up by scaling everything up and using neural network policy." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2jwz8moTG35K" + }, + "outputs": [], + "source": [ + "import sys, os\n", + "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", + " !touch .setup_complete\n", + "\n", + "# This code creates a virtual display to draw game images on.\n", + "# It will have no effect if your machine has a monitor.\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " os.environ['DISPLAY'] = ':1'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oe7EKolvLC67" + }, + "outputs": [], + "source": [ + "!pip install gymnasium[toy_text]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ltjzx5AFG35K" + }, + "outputs": [], + "source": [ + "import gymnasium as gym\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "env = gym.make(\"Taxi-v3\", render_mode=\"rgb_array\")\n", + "print(env.reset(seed=0))\n", + "plt.imshow(env.render())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "O6waF01eG35L", + "outputId": "8ca46444-7a7f-4091-c50c-0cab96a995e9" + }, + "outputs": [], + "source": [ + "n_states = env.observation_space.n\n", + "n_actions = env.action_space.n\n", + "\n", + "print(f\"n_states={n_states}, n_actions={n_actions}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": 
{ + "id": "IHXnU2QWG35L" + }, + "source": [ + "# Create stochastic policy\n", + "\n", + "This time our policy should be a probability distribution.\n", + "\n", + "```policy[s,a] = P(take action a | in state s)```\n", + "\n", + "Since we still use integer state and action representations, you can use a 2-dimensional array to represent the policy.\n", + "\n", + "Please initialize the policy __uniformly__, that is, probabilities of all actions should be equal." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "9qL5eW-rG35L" + }, + "outputs": [], + "source": [ + "def initialize_policy(n_states, n_actions):\n", + " \n", + "\n", + " return policy\n", + "\n", + "\n", + "policy = initialize_policy(n_states, n_actions)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "G1SeRRGgG35L" + }, + "outputs": [], + "source": [ + "assert type(policy) in (np.ndarray, np.matrix)\n", + "assert np.allclose(policy, 1.0 / n_actions)\n", + "assert np.allclose(np.sum(policy, axis=1), 1)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9zR1fCUrG35L" + }, + "source": [ + "# Play the game\n", + "\n", + "Just like before, but we also record all states and actions we took." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "id": "d4v8WmvlG35L" + }, + "outputs": [], + "source": [ + "def generate_session(env, policy, t_max=10**4):\n", + " \"\"\"\n", + " Play game until end or for t_max ticks.\n", + " :param policy: an array of shape [n_states,n_actions] with action probabilities\n", + " :returns: list of states, list of actions and sum of rewards\n", + " \"\"\"\n", + " states, actions = [], []\n", + " total_reward = 0.0\n", + "\n", + " s, _ = env.reset()\n", + "\n", + " for t in range(t_max):\n", + " # Hint: you can use np.random.choice for sampling action\n", + " # https://numpy.org/doc/stable/reference/random/generated/numpy.random.choice.html\n", + "\n", + " a = \n", + "\n", + " new_s, r, terminated, truncated, _ = env.step(a)\n", + "\n", + " # Record information we just got from the environment.\n", + " states.append(s)\n", + " actions.append(a)\n", + " total_reward += r\n", + "\n", + " s = new_s\n", + " if terminated or truncated:\n", + " break\n", + "\n", + " return states, actions, total_reward\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "a1EUUZ29G35M" + }, + "outputs": [], + "source": [ + "s, a, r = generate_session(env, policy)\n", + "assert type(s) == type(a) == list\n", + "assert len(s) == len(a)\n", + "assert type(r) in [float, np.float64]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_5YEDTKnG35M" + }, + "outputs": [], + "source": [ + "# let's see the initial reward distribution\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "sample_rewards = [generate_session(env, policy, t_max=1000)[-1] for _ in range(200)]\n", + "\n", + "plt.hist(sample_rewards, bins=20)\n", + "plt.vlines([np.percentile(sample_rewards, 50)], [0], [100], label=\"50'th percentile\", color='green')\n", + "plt.vlines([np.percentile(sample_rewards, 90)], [0], [100], label=\"90'th percentile\", color='red')\n", + 
"plt.legend()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EeWtL3F5G35M" + }, + "source": [ + "### Crossentropy method steps" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "00WWzr0KG35N" + }, + "outputs": [], + "source": [ + "def select_elites(states_batch, actions_batch, rewards_batch, percentile):\n", + " \"\"\"\n", + " Select states and actions from games that have rewards >= percentile\n", + " :param states_batch: list of lists of states, states_batch[session_i][t]\n", + " :param actions_batch: list of lists of actions, actions_batch[session_i][t]\n", + " :param rewards_batch: list of rewards, rewards_batch[session_i]\n", + "\n", + " :returns: elite_states,elite_actions, both 1D lists of states and respective actions from elite sessions\n", + "\n", + " Please return elite states and actions in their original order\n", + " [i.e. sorted by session number and timestep within session]\n", + "\n", + " If you are confused, see examples below. 
Please don't assume that states are integers\n", + " (they will become different later).\n", + " \"\"\"\n", + "\n", + " reward_threshold = \n", + "\n", + " elite_states = \n", + " elite_actions = \n", + "\n", + " return elite_states, elite_actions\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "teOLBGojG35N" + }, + "outputs": [], + "source": [ + "states_batch = [\n", + " [1, 2, 3], # game1\n", + " [4, 2, 0, 2], # game2\n", + " [3, 1], # game3\n", + "]\n", + "\n", + "actions_batch = [\n", + " [0, 2, 4], # game1\n", + " [3, 2, 0, 1], # game2\n", + " [3, 3], # game3\n", + "]\n", + "rewards_batch = [\n", + " 3, # game1\n", + " 4, # game2\n", + " 5, # game3\n", + "]\n", + "\n", + "test_result_0 = select_elites(states_batch, actions_batch, rewards_batch, percentile=0)\n", + "test_result_30 = select_elites(\n", + " states_batch, actions_batch, rewards_batch, percentile=30\n", + ")\n", + "test_result_90 = select_elites(\n", + " states_batch, actions_batch, rewards_batch, percentile=90\n", + ")\n", + "test_result_100 = select_elites(\n", + " states_batch, actions_batch, rewards_batch, percentile=100\n", + ")\n", + "\n", + "assert np.all(test_result_0[0] == [1, 2, 3, 4, 2, 0, 2, 3, 1]) and np.all(\n", + " test_result_0[1] == [0, 2, 4, 3, 2, 0, 1, 3, 3]\n", + "), \"For percentile 0 you should return all states and actions in chronological order\"\n", + "assert np.all(test_result_30[0] == [4, 2, 0, 2, 3, 1]) and np.all(\n", + " test_result_30[1] == [3, 2, 0, 1, 3, 3]\n", + "), \"For percentile 30 you should only select states/actions from two first\"\n", + "assert np.all(test_result_90[0] == [3, 1]) and np.all(\n", + " test_result_90[1] == [3, 3]\n", + "), \"For percentile 90 you should only select states/actions from one game\"\n", + "assert np.all(test_result_100[0] == [3, 1]) and np.all(\n", + " test_result_100[1] == [3, 3]\n", + "), \"Please make sure you use >=, not >. 
Also double-check how you compute percentile.\"\n", + "\n", + "print(\"Ok!\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wYLBHFFwG35N" + }, + "outputs": [], + "source": [ + "def get_new_policy(elite_states, elite_actions):\n", + " \"\"\"\n", + " Given a list of elite states/actions from select_elites,\n", + " return a new policy where each action probability is proportional to\n", + "\n", + " policy[s_i,a_i] ~ #[occurrences of s_i and a_i in elite states/actions]\n", + "\n", + " Don't forget to normalize the policy to get valid probabilities and handle the 0/0 case.\n", + " For states that you never visited, use a uniform distribution (1/n_actions for all actions).\n", + "\n", + " :param elite_states: 1D list of states from elite sessions\n", + " :param elite_actions: 1D list of actions from elite sessions\n", + "\n", + " \"\"\"\n", + "\n", + " new_policy = np.zeros([n_states, n_actions])\n", + "\n", + " \n", + " # Don't forget to set 1/n_actions for all actions in unvisited states.\n", + "\n", + " return new_policy\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "I1VbNcpoG35O" + }, + "outputs": [], + "source": [ + "elite_states = [1, 2, 3, 4, 2, 0, 2, 3, 1]\n", + "elite_actions = [0, 2, 4, 3, 2, 0, 1, 3, 3]\n", + "\n", + "new_policy = get_new_policy(elite_states, elite_actions)\n", + "\n", + "assert np.isfinite(\n", + " new_policy\n", + ").all(), \"Your new policy contains NaNs or +-inf. 
Make sure you don't divide by zero.\"\n", + "assert np.all(\n", + " new_policy >= 0\n", + "), \"Your new policy can't have negative action probabilities\"\n", + "assert np.allclose(\n", + " new_policy.sum(axis=-1), 1\n", + "), \"Your new policy should be a valid probability distribution over actions\"\n", + "\n", + "reference_answer = np.array(\n", + " [\n", + " [1.0, 0.0, 0.0, 0.0, 0.0],\n", + " [0.5, 0.0, 0.0, 0.5, 0.0],\n", + " [0.0, 0.33333333, 0.66666667, 0.0, 0.0],\n", + " [0.0, 0.0, 0.0, 0.5, 0.5],\n", + " ]\n", + ")\n", + "assert np.allclose(new_policy[:4, :5], reference_answer)\n", + "\n", + "print(\"Ok!\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WVvZq1fSG35O" + }, + "source": [ + "# Training loop\n", + "Generate sessions, select N best and fit to those." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3CmH7Aj4G35O" + }, + "outputs": [], + "source": [ + "from IPython.display import clear_output\n", + "\n", + "\n", + "def show_progress(rewards_batch, log, percentile, reward_range=[-990, +10]):\n", + " \"\"\"\n", + " A convenience function that displays training progress.\n", + " No cool math here, just charts.\n", + " \"\"\"\n", + "\n", + " mean_reward = np.mean(rewards_batch)\n", + " threshold = np.percentile(rewards_batch, percentile)\n", + " log.append([mean_reward, threshold])\n", + "\n", + " plt.figure(figsize=[8, 4])\n", + " plt.subplot(1, 2, 1)\n", + " plt.plot(list(zip(*log))[0], label=\"Mean rewards\")\n", + " plt.plot(list(zip(*log))[1], label=\"Reward thresholds\")\n", + " plt.legend()\n", + " plt.grid()\n", + "\n", + " plt.subplot(1, 2, 2)\n", + " plt.hist(rewards_batch, range=reward_range)\n", + " plt.vlines(\n", + " [np.percentile(rewards_batch, percentile)],\n", + " [0],\n", + " [100],\n", + " label=\"percentile\",\n", + " color=\"red\",\n", + " )\n", + " plt.legend()\n", + " plt.grid()\n", + " clear_output(True)\n", + " print(\"mean reward = %.3f, threshold=%.3f\" % 
(mean_reward, threshold))\n", + " plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tz0Yd964G35O" + }, + "outputs": [], + "source": [ + "# reset policy just in case\n", + "policy = initialize_policy(n_states, n_actions)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-cNq5hndG35O" + }, + "outputs": [], + "source": [ + "n_sessions = 250 # sample this many sessions\n", + "percentile = 50 # discard this percentage of sessions with lowest rewards\n", + "learning_rate = 0.5 # how quickly the policy is updated, on a scale from 0 to 1\n", + "\n", + "log = []\n", + "\n", + "for i in range(100):\n", + " %time sessions = [ ]\n", + "\n", + " states_batch, actions_batch, rewards_batch = zip(*sessions)\n", + "\n", + " elite_states, elite_actions = \n", + "\n", + " new_policy = \n", + "\n", + " policy = learning_rate * new_policy + (1 - learning_rate) * policy\n", + "\n", + " # display results on chart\n", + " show_progress(rewards_batch, log, percentile)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "K5LIoVTuG35O" + }, + "source": [ + "### Reflecting on results\n", + "\n", + "You may have noticed that the taxi problem quickly converges from less than -1000 to a near-optimal score and then descends back into -50/-100. This is in part because the environment has some innate randomness. Namely, the starting points of passenger/driver change from episode to episode.\n", + "\n", + "In case CEM failed to learn how to win from one distinct starting point, it will simply discard it because no sessions from that starting point will make it into the \"elites\".\n", + "\n", + "To mitigate that problem, you can either reduce the threshold for elite sessions (duct tape way) or change the way you evaluate strategy (theoretically correct way). 
For each starting state, you can sample an action randomly, and then evaluate this action by running _several_ games starting from it and averaging the total reward. Choosing elite sessions with this kind of sampling (where each session's reward is counted as the average of the rewards of all sessions with the same starting state and action) should improve the performance of your policy." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ubIHBgQ-G35O" + }, + "source": [ + "\n", + "### You're not done yet!\n", + "\n", + "Go to [`./deep_crossentropy_method.ipynb`](./deep_crossentropy_method.ipynb) for a more serious task" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys, os\n", - "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n", - " !touch .setup_complete\n", - "\n", - "# This code creates a virtual display to draw game images on.\n", - "# It will have no effect if your machine has a monitor.\n", - "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", - " !bash ../xvfb start\n", - " os.environ['DISPLAY'] = ':1'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import gym\n", - "import numpy as np\n", - "\n", - "env = gym.make(\"Taxi-v3\")\n", - "env.reset()\n", - "env.render()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "n_states = env.observation_space.n\n", - "n_actions = env.action_space.n\n", - "\n", - "print(\"n_states=%i, n_actions=%i\" % (n_states, n_actions))" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "# Create stochastic policy\n", - "\n", - "This time our policy should be a probability distribution.\n", - "\n", - "```policy[s,a] = P(take action a | in state s)```\n", - "\n", - "Since we still use integer state and action representations, you can use a 2-dimensional array to represent the policy.\n", - "\n", - "Please initialize the policy __uniformly__, that is, probabililities of all actions should be equal." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def initialize_policy(n_states, n_actions):\n", - " \n", - " \n", - " return policy\n", - "\n", - "policy = initialize_policy(n_states, n_actions)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "assert type(policy) in (np.ndarray, np.matrix)\n", - "assert np.allclose(policy, 1./n_actions)\n", - "assert np.allclose(np.sum(policy, axis=1), 1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Play the game\n", - "\n", - "Just like before, but we also record all states and actions we took." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def generate_session(env, policy, t_max=10**4):\n", - " \"\"\"\n", - " Play game until end or for t_max ticks.\n", - " :param policy: an array of shape [n_states,n_actions] with action probabilities\n", - " :returns: list of states, list of actions and sum of rewards\n", - " \"\"\"\n", - " states, actions = [], []\n", - " total_reward = 0.\n", - "\n", - " s = env.reset()\n", - "\n", - " for t in range(t_max):\n", - " # Hint: you can use np.random.choice for sampling action\n", - " # https://numpy.org/doc/stable/reference/random/generated/numpy.random.choice.html\n", - " a = \n", - "\n", - " new_s, r, done, info = env.step(a)\n", - "\n", - " # Record information we just got from the environment.\n", - " states.append(s)\n", - " actions.append(a)\n", - " total_reward += r\n", - "\n", - " s = new_s\n", - " if done:\n", - " break\n", - "\n", - " return states, actions, total_reward" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "s, a, r = generate_session(env, policy)\n", - "assert type(s) == type(a) == list\n", - "assert len(s) == len(a)\n", - "assert type(r) in [float, np.float]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# let's see the initial reward distribution\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "\n", - "sample_rewards = [generate_session(env, policy, t_max=1000)[-1] for _ in range(200)]\n", - "\n", - "plt.hist(sample_rewards, bins=20)\n", - "plt.vlines([np.percentile(sample_rewards, 50)], [0], [100], label=\"50'th percentile\", color='green')\n", - "plt.vlines([np.percentile(sample_rewards, 90)], [0], [100], label=\"90'th percentile\", color='red')\n", - "plt.legend()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Crossentropy method steps" - ] 
- }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def select_elites(states_batch, actions_batch, rewards_batch, percentile):\n", - " \"\"\"\n", - " Select states and actions from games that have rewards >= percentile\n", - " :param states_batch: list of lists of states, states_batch[session_i][t]\n", - " :param actions_batch: list of lists of actions, actions_batch[session_i][t]\n", - " :param rewards_batch: list of rewards, rewards_batch[session_i]\n", - "\n", - " :returns: elite_states,elite_actions, both 1D lists of states and respective actions from elite sessions\n", - "\n", - " Please return elite states and actions in their original order \n", - " [i.e. sorted by session number and timestep within session]\n", - "\n", - " If you are confused, see examples below. Please don't assume that states are integers\n", - " (they will become different later).\n", - " \"\"\"\n", - "\n", - " reward_threshold = \n", - "\n", - " elite_states = \n", - " elite_actions = \n", - "\n", - " return elite_states, elite_actions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "states_batch = [\n", - " [1, 2, 3], # game1\n", - " [4, 2, 0, 2], # game2\n", - " [3, 1], # game3\n", - "]\n", - "\n", - "actions_batch = [\n", - " [0, 2, 4], # game1\n", - " [3, 2, 0, 1], # game2\n", - " [3, 3], # game3\n", - "]\n", - "rewards_batch = [\n", - " 3, # game1\n", - " 4, # game2\n", - " 5, # game3\n", - "]\n", - "\n", - "test_result_0 = select_elites(states_batch, actions_batch, rewards_batch, percentile=0)\n", - "test_result_30 = select_elites(states_batch, actions_batch, rewards_batch, percentile=30)\n", - "test_result_90 = select_elites(states_batch, actions_batch, rewards_batch, percentile=90)\n", - "test_result_100 = select_elites(states_batch, actions_batch, rewards_batch, percentile=100)\n", - "\n", - "assert np.all(test_result_0[0] == [1, 2, 3, 4, 2, 0, 2, 3, 1]) 
\\\n", - " and np.all(test_result_0[1] == [0, 2, 4, 3, 2, 0, 1, 3, 3]), \\\n", - " \"For percentile 0 you should return all states and actions in chronological order\"\n", - "assert np.all(test_result_30[0] == [4, 2, 0, 2, 3, 1]) and \\\n", - " np.all(test_result_30[1] == [3, 2, 0, 1, 3, 3]), \\\n", - " \"For percentile 30 you should only select states/actions from two first\"\n", - "assert np.all(test_result_90[0] == [3, 1]) and \\\n", - " np.all(test_result_90[1] == [3, 3]), \\\n", - " \"For percentile 90 you should only select states/actions from one game\"\n", - "assert np.all(test_result_100[0] == [3, 1]) and\\\n", - " np.all(test_result_100[1] == [3, 3]), \\\n", - " \"Please make sure you use >=, not >. Also double-check how you compute percentile.\"\n", - "\n", - "print(\"Ok!\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def update_policy(elite_states, elite_actions):\n", - " \"\"\"\n", - " Given old policy and a list of elite states/actions from select_elites,\n", - " return new updated policy where each action probability is proportional to\n", - "\n", - " policy[s_i,a_i] ~ #[occurences of si and ai in elite states/actions]\n", - "\n", - " Don't forget to normalize policy to get valid probabilities and handle 0/0 case.\n", - " In case you never visited a state, set probabilities for all actions to 1./n_actions\n", - "\n", - " :param elite_states: 1D list of states from elite sessions\n", - " :param elite_actions: 1D list of actions from elite sessions\n", - "\n", - " \"\"\"\n", - "\n", - " new_policy = np.zeros([n_states, n_actions])\n", - "\n", - " \n", - " # Don't forget to set 1/n_actions for all actions in unvisited states.\n", - "\n", - " return new_policy" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "elite_states = [1, 2, 3, 4, 2, 0, 2, 3, 1]\n", - "elite_actions = [0, 2, 4, 3, 2, 0, 1, 3, 3]\n", - "\n", - 
"new_policy = update_policy(elite_states, elite_actions)\n", - "\n", - "assert np.isfinite(new_policy).all(), \\\n", - " \"Your new policy contains NaNs or +-inf. Make sure you don't divide by zero.\"\n", - "assert np.all(new_policy >= 0), \\\n", - " \"Your new policy can't have negative action probabilities\"\n", - "assert np.allclose(new_policy.sum(axis=-1), 1), \\\n", - " \"Your new policy should be a valid probability distribution over actions\"\n", - "\n", - "reference_answer = np.array([\n", - " [1., 0., 0., 0., 0.],\n", - " [0.5, 0., 0., 0.5, 0.],\n", - " [0., 0.33333333, 0.66666667, 0., 0.],\n", - " [0., 0., 0., 0.5, 0.5]])\n", - "assert np.allclose(new_policy[:4, :5], reference_answer)\n", - "\n", - "print(\"Ok!\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Training loop\n", - "Generate sessions, select N best and fit to those." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from IPython.display import clear_output\n", - "\n", - "def show_progress(rewards_batch, log, percentile, reward_range=[-990, +10]):\n", - " \"\"\"\n", - " A convenience function that displays training progress. 
\n", - " No cool math here, just charts.\n", - " \"\"\"\n", - "\n", - " mean_reward = np.mean(rewards_batch)\n", - " threshold = np.percentile(rewards_batch, percentile)\n", - " log.append([mean_reward, threshold])\n", - " \n", - " plt.figure(figsize=[8, 4])\n", - " plt.subplot(1, 2, 1)\n", - " plt.plot(list(zip(*log))[0], label='Mean rewards')\n", - " plt.plot(list(zip(*log))[1], label='Reward thresholds')\n", - " plt.legend()\n", - " plt.grid()\n", - "\n", - " plt.subplot(1, 2, 2)\n", - " plt.hist(rewards_batch, range=reward_range)\n", - " plt.vlines([np.percentile(rewards_batch, percentile)],\n", - " [0], [100], label=\"percentile\", color='red')\n", - " plt.legend()\n", - " plt.grid()\n", - " clear_output(True)\n", - " print(\"mean reward = %.3f, threshold=%.3f\" % (mean_reward, threshold))\n", - " plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# reset policy just in case\n", - "policy = initialize_policy(n_states, n_actions)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "n_sessions = 250 # sample this many sessions\n", - "percentile = 50 # take this percent of session with highest rewards\n", - "learning_rate = 0.5 # how quickly the policy is updated, on a scale from 0 to 1\n", - "\n", - "log = []\n", - "\n", - "for i in range(100):\n", - " %time sessions = [ ]\n", - "\n", - " states_batch, actions_batch, rewards_batch = zip(*sessions)\n", - "\n", - " elite_states, elite_actions = \n", - "\n", - " new_policy = \n", - "\n", - " policy = learning_rate * new_policy + (1 - learning_rate) * policy\n", - "\n", - " # display results on chart\n", - " show_progress(rewards_batch, log, percentile)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Reflecting on results\n", - "\n", - "You may have noticed that the taxi problem quickly converges from less than -1000 to a near-optimal score and 
then descends back into -50/-100. This is in part because the environment has some innate randomness. Namely, the starting points of passenger/driver change from episode to episode.\n", - "\n", - "In case CEM failed to learn how to win from one distinct starting point, it will simply discard it because no sessions from that starting point will make it into the \"elites\".\n", - "\n", - "To mitigate that problem, you can either reduce the threshold for elite sessions (duct tape way) or change the way you evaluate strategy (theoretically correct way). For each starting state, you can sample an action randomly, and then evaluate this action by running _several_ games starting from it and averaging the total reward. Choosing elite sessions with this kind of sampling (where each session's reward is counted as the average of the rewards of all sessions with the same starting state and action) should improve the performance of your policy." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "### You're not done yet!\n", - "\n", - "Go to [`./deep_crossentropy_method.ipynb`](./deep_crossentropy_method.ipynb) for a more serious task" - ] - } - ], - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 1 + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/week01_intro/deep_crossentropy_method.ipynb b/week01_intro/deep_crossentropy_method.ipynb index 786d2b8f9..6cf7aa349 100644 --- a/week01_intro/deep_crossentropy_method.ipynb +++ b/week01_intro/deep_crossentropy_method.ipynb @@ -1,446 +1,524 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Deep Crossentropy method\n", - "\n", - "In this section we'll extend your CEM implementation with neural networks! You will train a multi-layer neural network to solve simple continuous state space games. 
__Please make sure you're done with tabular crossentropy method from the previous notebook.__\n", - "\n", - "![img](https://tip.duke.edu/independent_learning/greek/lesson/digging_deeper_final.jpg)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys, os\n", - "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n", - "\n", - " !touch .setup_complete\n", - "\n", - "# This code creates a virtual display to draw game images on.\n", - "# It will have no effect if your machine has a monitor.\n", - "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", - " !bash ../xvfb start\n", - " os.environ['DISPLAY'] = ':1'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import gym\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "\n", - "# if you see \" has no attribute .env\", remove .env or update gym\n", - "env = gym.make(\"CartPole-v0\").env\n", - "\n", - "env.reset()\n", - "n_actions = env.action_space.n\n", - "state_dim = env.observation_space.shape[0]\n", - "\n", - "plt.imshow(env.render(\"rgb_array\"))\n", - "print(\"state vector dim =\", state_dim)\n", - "print(\"n_actions =\", n_actions)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Neural Network Policy\n", - "\n", - "For this assignment we'll utilize the simplified neural network implementation from __[Scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html)__. Here's what you'll need:\n", - "\n", - "* `agent.partial_fit(states, actions)` - make a single training pass over the data. 
Maximize the probabilitity of :actions: from :states:\n", - "* `agent.predict_proba(states)` - predict probabilities of all actions, a matrix of shape __[len(states), n_actions]__\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.neural_network import MLPClassifier\n", - "\n", - "agent = MLPClassifier(\n", - " hidden_layer_sizes=(20, 20),\n", - " activation='tanh',\n", - ")\n", - "\n", - "# initialize agent to the dimension of state space and number of actions\n", - "agent.partial_fit([env.reset()] * n_actions, range(n_actions), range(n_actions))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def generate_session(env, agent, t_max=1000):\n", - " \"\"\"\n", - " Play a single game using agent neural network.\n", - " Terminate when game finishes or after :t_max: steps\n", - " \"\"\"\n", - " states, actions = [], []\n", - " total_reward = 0\n", - "\n", - " s = env.reset()\n", - "\n", - " for t in range(t_max):\n", - " \n", - " # use agent to predict a vector of action probabilities for state :s:\n", - " probs = \n", - "\n", - " assert probs.shape == (env.action_space.n,), \"make sure probabilities are a vector (hint: np.reshape)\"\n", - " \n", - " # use the probabilities you predicted to pick an action\n", - " # sample proportionally to the probabilities, don't just take the most likely action\n", - " a = \n", - " # ^-- hint: try np.random.choice\n", - "\n", - " new_s, r, done, info = env.step(a)\n", - "\n", - " # record sessions like you did before\n", - " states.append(s)\n", - " actions.append(a)\n", - " total_reward += r\n", - "\n", - " s = new_s\n", - " if done:\n", - " break\n", - " return states, actions, total_reward" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dummy_states, dummy_actions, dummy_reward = generate_session(env, agent, 
t_max=5)\n", - "print(\"states:\", np.stack(dummy_states))\n", - "print(\"actions:\", dummy_actions)\n", - "print(\"reward:\", dummy_reward)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### CEM steps\n", - "Deep CEM uses exactly the same strategy as the regular CEM, so you can copy your function code from previous notebook.\n", - "\n", - "The only difference is that now each observation is not a number but a `float32` vector." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def select_elites(states_batch, actions_batch, rewards_batch, percentile=50):\n", - " \"\"\"\n", - " Select states and actions from games that have rewards >= percentile\n", - " :param states_batch: list of lists of states, states_batch[session_i][t]\n", - " :param actions_batch: list of lists of actions, actions_batch[session_i][t]\n", - " :param rewards_batch: list of rewards, rewards_batch[session_i]\n", - "\n", - " :returns: elite_states,elite_actions, both 1D lists of states and respective actions from elite sessions\n", - "\n", - " Please return elite states and actions in their original order \n", - " [i.e. sorted by session number and timestep within session]\n", - "\n", - " If you are confused, see examples below. Please don't assume that states are integers\n", - " (they will become different later).\n", - " \"\"\"\n", - "\n", - " \n", - " \n", - " return elite_states, elite_actions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Training loop\n", - "Generate sessions, select N best and fit to those." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from IPython.display import clear_output\n", - "\n", - "def show_progress(rewards_batch, log, percentile, reward_range=[-990, +10]):\n", - " \"\"\"\n", - " A convenience function that displays training progress. 
\n", - " No cool math here, just charts.\n", - " \"\"\"\n", - "\n", - " mean_reward = np.mean(rewards_batch)\n", - " threshold = np.percentile(rewards_batch, percentile)\n", - " log.append([mean_reward, threshold])\n", - "\n", - " clear_output(True)\n", - " print(\"mean reward = %.3f, threshold=%.3f\" % (mean_reward, threshold))\n", - " plt.figure(figsize=[8, 4])\n", - " plt.subplot(1, 2, 1)\n", - " plt.plot(list(zip(*log))[0], label='Mean rewards')\n", - " plt.plot(list(zip(*log))[1], label='Reward thresholds')\n", - " plt.legend()\n", - " plt.grid()\n", - "\n", - " plt.subplot(1, 2, 2)\n", - " plt.hist(rewards_batch, range=reward_range)\n", - " plt.vlines([np.percentile(rewards_batch, percentile)],\n", - " [0], [100], label=\"percentile\", color='red')\n", - " plt.legend()\n", - " plt.grid()\n", - "\n", - " plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "n_sessions = 100\n", - "percentile = 70\n", - "log = []\n", - "\n", - "for i in range(100):\n", - " # generate new sessions\n", - " sessions = [ ]\n", - "\n", - " states_batch, actions_batch, rewards_batch = map(np.array, zip(*sessions))\n", - "\n", - " elite_states, elite_actions = \n", - "\n", - " \n", - "\n", - " show_progress(rewards_batch, log, percentile, reward_range=[0, np.max(rewards_batch)])\n", - "\n", - " if np.mean(rewards_batch) > 190:\n", - " print(\"You Win! 
You may stop training now via KeyboardInterrupt.\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Record sessions\n", - "\n", - "import gym.wrappers\n", - "\n", - "with gym.wrappers.Monitor(gym.make(\"CartPole-v0\"), directory=\"videos\", force=True) as env_monitor:\n", - " sessions = [generate_session(env_monitor, agent) for _ in range(100)]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Show video. This may not work in some setups. If it doesn't\n", - "# work for you, you can download the videos and view them locally.\n", - "\n", - "from pathlib import Path\n", - "from IPython.display import HTML\n", - "\n", - "video_names = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", - "\n", - "HTML(\"\"\"\n", - "\n", - "\"\"\".format(video_names[-1])) # You can also try other indices" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Homework part I\n", - "\n", - "### Tabular crossentropy method\n", - "\n", - "You may have noticed that the taxi problem quickly converges from -100 to a near-optimal score and then descends back into -50/-100. This is in part because the environment has some innate randomness. 
Namely, the starting points of passenger/driver change from episode to episode.\n", - "\n", - "### Tasks\n", - "- __1.1__ (1 pts) Find out how the algorithm performance changes if you use a different `percentile` and/or `n_sessions`.\n", - "- __1.2__ (2 pts) Tune the algorithm to end up with positive average score.\n", - "\n", - "It's okay to modify the existing code.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "``````" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Homework part II\n", - "\n", - "### Deep crossentropy method\n", - "\n", - "By this moment you should have got enough score on [CartPole-v0](https://gym.openai.com/envs/CartPole-v0) to consider it solved (see the link). It's time to try something harder.\n", - "\n", - "* if you have any trouble with CartPole-v0 and feel stuck, feel free to ask us or your peers for help.\n", - "\n", - "### Tasks\n", - "\n", - "* __2.1__ (3 pts) Pick one of environments: `MountainCar-v0` or `LunarLander-v2`.\n", - " * For MountainCar, get average reward of __at least -150__\n", - " * For LunarLander, get average reward of __at least +50__\n", - "\n", - "See the tips section below, it's kinda important.\n", - "__Note:__ If your agent is below the target score, you'll still get most of the points depending on the result, so don't be afraid to submit it.\n", - " \n", - " \n", - "* __2.2__ (up to 6 pt) Devise a way to speed up training against the default version\n", - " * Obvious improvement: use [`joblib`](https://joblib.readthedocs.io/en/latest/). However, note that you will probably need to spawn a new environment in each of the workers instead of passing it via pickling.\n", - " * Try re-using samples from 3-5 last iterations when computing threshold and training.\n", - " * Experiment with the number of training iterations and learning rate of the neural network (see params).\n", - " \n", - "__Please list what you did in Anytask submission form. 
You must measure your improvement experimentally. Your score depends on this improvement.__\n", - "\n", - "* __If the algorithm converges 2x faster, you obtain 3 pts.__\n", - "* __If the algorithm converges 4x faster, you obtain 6 pts.__\n", - " \n", - " \n", - "### Tips\n", - "* Gym page: [MountainCar](https://gym.openai.com/envs/MountainCar-v0), [LunarLander](https://gym.openai.com/envs/LunarLander-v2)\n", - "* Sessions for MountainCar may last for 10k+ ticks. Make sure ```t_max``` param is at least 10k.\n", - " * Also it may be a good idea to cut rewards via \">\" and not \">=\". If 90% of your sessions get reward of -10k and 10% are better, than if you use percentile 20% as threshold, R >= threshold __fails cut off bad sessions__ whule R > threshold works alright.\n", - "* _issue with gym_: Some versions of gym limit game time by 200 ticks. This will prevent cem training in most cases. Make sure your agent is able to play for the specified __t_max__, and if it isn't, try `env = gym.make(\"MountainCar-v0\").env` or otherwise get rid of TimeLimit wrapper.\n", - "* If you use old _swig_ lib for LunarLander-v2, you may get an error. See this [issue](https://github.com/openai/gym/issues/100) for solution.\n", - "* If it won't train it's a good idea to plot reward distribution and record sessions: they may give you some clue. If they don't, call course staff :)\n", - "* 20-neuron network is probably not enough, feel free to experiment.\n", - "\n", - "You may find the following snippet useful:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "I_i1q1TWG9zH" + }, + "source": [ + "# Deep Crossentropy method\n", + "\n", + "In this section we'll extend your CEM implementation with neural networks! You will train a multi-layer neural network to solve simple continuous state space games. 
__Please make sure you're done with tabular crossentropy method from the previous notebook.__\n", + "\n", + "![img](https://watanimg.elwatannews.com/old_news_images/large/249765_Large_20140709045740_11.jpg)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "t4CJ1sRyG9zJ" + }, + "outputs": [], + "source": [ + "import sys, os\n", + "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", + " !touch .setup_complete\n", + "\n", + "# This code creates a virtual display to draw game images on.\n", + "# It will have no effect if your machine has a monitor.\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " os.environ['DISPLAY'] = ':1'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "C2xd5vPwPVCb" + }, + "outputs": [], + "source": [ + "# Install gymnasium if you didn't\n", + "!pip install gymnasium[toy_text,classic_control]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_2zbc7ahG9zK" + }, + "outputs": [], + "source": [ + "import gymnasium as gym\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "# if you see \" has no attribute .env\", remove .env or update gym\n", + "env = gym.make(\"CartPole-v0\", render_mode=\"rgb_array\").env\n", + "\n", + "env.reset()\n", + "n_actions = env.action_space.n\n", + "state_dim = env.observation_space.shape[0]\n", + "\n", + "plt.imshow(env.render())\n", + "print(\"state vector dim =\", state_dim)\n", + "print(\"n_actions =\", n_actions)\n", + "\n", + "env.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Z72_alhdG9zK" + }, + "source": [ + "# Neural Network Policy\n", + "\n", + "For this assignment we'll utilize the 
simplified neural network implementation from __[Scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html)__. Here's what you'll need:\n", + "\n", + "* `agent.partial_fit(states, actions)` - make a single training pass over the data. Maximize the probability of :actions: from :states:\n", + "* `agent.predict_proba(states)` - predict probabilities of all actions, a matrix of shape __[len(states), n_actions]__\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wLItY4unG9zL" + }, + "outputs": [], + "source": [ + "from sklearn.neural_network import MLPClassifier\n", + "\n", + "agent = MLPClassifier(\n", + " hidden_layer_sizes=(20, 20),\n", + " activation=\"tanh\",\n", + ")\n", + "\n", + "# initialize agent to the dimension of state space and number of actions\n", + "agent.partial_fit([env.reset()[0]] * n_actions, range(n_actions), range(n_actions))\n" + ] + }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 45, + "metadata": { + "id": "eyFS3oUmG9zL" + }, + "outputs": [], + "source": [ + "def generate_session(env, agent, t_max=1000):\n", + " \"\"\"\n", + " Play a single game using agent neural network.\n", + " Terminate when game finishes or after :t_max: steps\n", + " \"\"\"\n", + " states, actions = [], []\n", + " total_reward = 0\n", + "\n", + " s, _ = env.reset()\n", + "\n", + " for t in range(t_max):\n", + "\n", + " # use agent to predict a vector of action probabilities for state :s:\n", + " probs = \n", + "\n", + " assert probs.shape == (env.action_space.n,), \"make sure probabilities are a vector (hint: np.reshape)\"\n", + "\n", + " # use the probabilities you predicted to pick an action\n", + " # sample proportionally to the probabilities, don't just take the most likely action\n", + " a = \n", + " # ^-- hint: try np.random.choice\n", + "\n", + " new_s, r, terminated, truncated, _ = env.step(a)\n", + "\n", + " # record sessions like you did before\n", + " states.append(s)\n", + " actions.append(a)\n", + " total_reward += r\n", + "\n", + " s = new_s\n", + " if terminated or truncated:\n", + " break\n", + " return states, actions, total_reward\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4xgrTCgJG9zL" + }, + "outputs": [], + "source": [ + "dummy_states, dummy_actions, dummy_reward = generate_session(env, agent, t_max=5)\n", + "print(\"states:\", np.stack(dummy_states))\n", + "print(\"actions:\", dummy_actions)\n", + "print(\"reward:\", dummy_reward)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "p85lt16qG9zL" + }, + "source": [ + "### CEM steps\n", + "Deep CEM uses exactly the same strategy as the regular CEM, so you can copy your function code from previous notebook.\n", + "\n", + "The only difference is that now each observation is not a number but a `float32` vector." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "id": "4On-p7p4G9zL" + }, + "outputs": [], + "source": [ + "def select_elites(states_batch, actions_batch, rewards_batch, percentile=50):\n", + " \"\"\"\n", + " Select states and actions from games that have rewards >= percentile\n", + " :param states_batch: list of lists of states, states_batch[session_i][t]\n", + " :param actions_batch: list of lists of actions, actions_batch[session_i][t]\n", + " :param rewards_batch: list of rewards, rewards_batch[session_i]\n", + "\n", + " :returns: elite_states,elite_actions, both 1D lists of states and respective actions from elite sessions\n", + "\n", + " Please return elite states and actions in their original order\n", + " [i.e. sorted by session number and timestep within session]\n", + "\n", + " If you are confused, see examples below. Please don't assume that states are integers\n", + " (they will become different later).\n", + " \"\"\"\n", + "\n", + " \n", + "\n", + " return elite_states, elite_actions\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xc40V4DaG9zM" + }, + "source": [ + "# Training loop\n", + "Generate sessions, select N best and fit to those." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "id": "PPwVKwF7G9zM" + }, + "outputs": [], + "source": [ + "from IPython.display import clear_output\n", + "\n", + "\n", + "def show_progress(rewards_batch, log, percentile, reward_range=[-990, +10]):\n", + " \"\"\"\n", + " A convenience function that displays training progress.\n", + " No cool math here, just charts.\n", + " \"\"\"\n", + "\n", + " mean_reward = np.mean(rewards_batch)\n", + " threshold = np.percentile(rewards_batch, percentile)\n", + " log.append([mean_reward, threshold])\n", + "\n", + " clear_output(True)\n", + " print(\"mean reward = %.3f, threshold=%.3f\" % (mean_reward, threshold))\n", + " plt.figure(figsize=[8, 4])\n", + " plt.subplot(1, 2, 1)\n", + " plt.plot(list(zip(*log))[0], label=\"Mean rewards\")\n", + " plt.plot(list(zip(*log))[1], label=\"Reward thresholds\")\n", + " plt.legend()\n", + " plt.grid()\n", + "\n", + " plt.subplot(1, 2, 2)\n", + " plt.hist(rewards_batch, range=reward_range)\n", + " plt.vlines(\n", + " [np.percentile(rewards_batch, percentile)],\n", + " [0],\n", + " [100],\n", + " label=\"percentile\",\n", + " color=\"red\",\n", + " )\n", + " plt.legend()\n", + " plt.grid()\n", + "\n", + " plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "euK7WRQiG9zM" + }, + "outputs": [], + "source": [ + "n_sessions = 100\n", + "percentile = 70\n", + "log = []\n", + "\n", + "for i in range(100):\n", + " # generate new sessions\n", + " sessions = [ ]\n", + "\n", + " states_batch, actions_batch, rewards_batch = map(np.array, zip(*sessions))\n", + "\n", + " elite_states, elite_actions = \n", + "\n", + " \n", + "\n", + " show_progress(\n", + " rewards_batch, log, percentile, reward_range=[0, np.max(rewards_batch)]\n", + " )\n", + "\n", + " if np.mean(rewards_batch) > 190:\n", + " print(\"You Win! 
You may stop training now via KeyboardInterrupt.\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yeNWKjtsG9zM" + }, + "source": [ + "# Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "RJwsWl4kG9zM" + }, + "outputs": [], + "source": [ + "# Record sessions\n", + "\n", + "from gymnasium.wrappers import RecordVideo\n", + "\n", + "with RecordVideo(\n", + " env=gym.make(\"CartPole-v0\", render_mode=\"rgb_array\"),\n", + " video_folder=\"./videos\",\n", + " episode_trigger=lambda episode_number: True,\n", + ") as env_monitor:\n", + " sessions = [generate_session(env_monitor, agent) for _ in range(100)]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kLPXdME7G9zN" + }, + "outputs": [], + "source": [ + "# Show video. This may not work in some setups. If it doesn't\n", + "# work for you, you can download the videos and view them locally.\n", + "\n", + "from pathlib import Path\n", + "from base64 import b64encode\n", + "from IPython.display import HTML\n", + "\n", + "video_paths = sorted([s for s in Path(\"videos\").iterdir() if s.suffix == \".mp4\"])\n", + "video_path = video_paths[-1] # You can also try other indices\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " # https://stackoverflow.com/a/57378660/1214547\n", + " with video_path.open(\"rb\") as fp:\n", + " mp4 = fp.read()\n", + " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n", + "else:\n", + " data_url = str(video_path)\n", + "\n", + "HTML(\n", + " \"\"\"\n", + "\n", + "\"\"\".format(\n", + " data_url\n", + " )\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6d_3oOQ1G9zN" + }, + "source": [ + "# Homework part I\n", + "\n", + "### Tabular crossentropy method\n", + "\n", + "You may have noticed that the taxi problem quickly converges from -100 to a near-optimal score and then descends back into -50/-100. 
This is in part because the environment has some innate randomness. Namely, the starting points of passenger/driver change from episode to episode.\n", + "\n", + "### Tasks\n", + "- __1.1__ (2 pts) Find out how the algorithm performance changes if you use a different `percentile` and/or `n_sessions`. Provide some figures here so we can see how the hyperparameters influence the performance.\n", + "- __1.2__ (1 pt) Tune the algorithm to end up with a positive average score.\n", + "\n", + "It's okay to modify the existing code.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L88LySiVG9zN" + }, + "source": [ + "``````" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7LpAJc4rG9zN" + }, + "source": [ + "# Homework part II\n", + "\n", + "### Deep crossentropy method\n", + "\n", + "By this moment, you should have got enough score on [CartPole-v0](https://gymnasium.farama.org/environments/classic_control/cart_pole/) to consider it solved (see the link). It's time to try something harder.\n", + "\n", + "* if you have any trouble with CartPole-v0 and feel stuck, feel free to ask us or your peers for help.\n", + "\n", + "### Tasks\n", + "\n", + "* __2.1__ (3 pts) Pick one of the environments: `MountainCar-v0` or `LunarLander-v2`.\n", + " * For MountainCar, get average reward of __at least -150__\n", + " * For LunarLander, get average reward of __at least +50__\n", + "\n", + "See the tips section below, it's kinda important.\n", + "__Note:__ If your agent is below the target score, you'll still get some of the points depending on the result, so don't be afraid to submit it.\n", + " \n", + " \n", + "* __2.2__ (up to 6 pts) Devise a way to speed up training against the default version\n", + " * Obvious improvement: use [`joblib`](https://joblib.readthedocs.io/en/latest/). However, note that you will probably need to spawn a new environment in each of the workers instead of passing it via pickling. 
(2 pts)\n", + " * Try re-using samples from 3-5 last iterations when computing threshold and training. (2 pts)\n", + " * Obtain __-100__ at `MountainCar-v0` or __+200__ at `LunarLander-v2` (2 pts). Feel free to experiment with hyperparameters, architectures, schedules etc.\n", + " \n", + "__Please list what you did in Anytask submission form__. This reduces the probability that somebody misses something.\n", + " \n", + " \n", + "### Tips\n", + "* Gymnasium pages: [MountainCar](https://gymnasium.farama.org/environments/classic_control/mountain_car/), [LunarLander](https://gymnasium.farama.org/environments/box2d/lunar_lander/)\n", + "* Sessions for MountainCar may last for 10k+ ticks. Make sure ```t_max``` param is at least 10k.\n", + " * Also it may be a good idea to cut rewards via \">\" and not \">=\". If 90% of your sessions get reward of -10k and 10% are better, then if you use percentile 20% as threshold, R >= threshold __fails to cut off bad sessions__ while R > threshold works alright.\n", + "* _issue with gym_: Some versions of gym limit game time by 200 ticks. This will prevent cem training in most cases. Make sure your agent is able to play for the specified __t_max__, and if it isn't, try `env = gym.make(\"MountainCar-v0\").env` or otherwise get rid of TimeLimit wrapper.\n", + "* If you use old _swig_ lib for LunarLander-v2, you may get an error. See this [issue](https://github.com/openai/gym/issues/100) for solution.\n", + "* If it doesn't train, it's a good idea to plot reward distribution and record sessions: they may give you some clue. 
If they don't, call course staff :)\n", + "* 20-neuron network is probably not enough, feel free to experiment.\n", + "\n", + "You may find the following snippet useful:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qcjz-nm_G9zN" + }, + "outputs": [], + "source": [ + "def visualize_mountain_car(env, agent):\n", + " # Compute policy for all possible x and v (with discretization)\n", + " xs = np.linspace(env.min_position, env.max_position, 100)\n", + " vs = np.linspace(-env.max_speed, env.max_speed, 100)\n", + "\n", + " grid = np.dstack(np.meshgrid(xs, vs[::-1])).transpose(1, 0, 2)\n", + " grid_flat = grid.reshape(len(xs) * len(vs), 2)\n", + " probs = (\n", + " agent.predict_proba(grid_flat).reshape(len(xs), len(vs), 3).transpose(1, 0, 2)\n", + " )\n", + "\n", + " # # The above code is equivalent to the following:\n", + " # probs = np.empty((len(vs), len(xs), 3))\n", + " # for i, v in enumerate(vs[::-1]):\n", + " # for j, x in enumerate(xs):\n", + " # probs[i, j, :] = agent.predict_proba([[x, v]])[0]\n", + "\n", + " # Draw policy\n", + " f, ax = plt.subplots(figsize=(7, 7))\n", + " ax.imshow(\n", + " probs,\n", + " extent=(env.min_position, env.max_position, -env.max_speed, env.max_speed),\n", + " aspect=\"auto\",\n", + " )\n", + " ax.set_title(\"Learned policy: red=left, green=nothing, blue=right\")\n", + " ax.set_xlabel(\"position (x)\")\n", + " ax.set_ylabel(\"velocity (v)\")\n", + "\n", + " # Sample a trajectory and draw it\n", + " states, actions, _ = generate_session(env, agent)\n", + " states = np.array(states)\n", + " ax.plot(states[:, 0], states[:, 1], color=\"white\")\n", + "\n", + " # Draw every 3rd action from the trajectory\n", + " for (x, v), a in zip(states[::3], actions[::3]):\n", + " if a == 0:\n", + " plt.arrow(x, v, -0.1, 0, color=\"white\", head_length=0.02)\n", + " elif a == 2:\n", + " plt.arrow(x, v, 0.1, 0, color=\"white\", head_length=0.02)\n", + "\n", + "\n", + "with gym.make(\"MountainCar-v0\", 
render_mode=\"rgb_array\").env as env:\n", + " visualize_mountain_car(env, agent)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Dzk41lDPG9zO" + }, + "source": [ + "### Bonus tasks\n", + "\n", + "* __2.3 bonus__ (2 pts) Try to find a network architecture and training params that solve __both__ environments above (_Points depend on implementation. If you attempted this task, please mention it in Anytask submission._)\n", + "\n", + "* __2.4 bonus__ (4 pts) Solve continuous action space task with `MLPRegressor` or similar.\n", + " * Since your agent only predicts the \"expected\" action, you will have to add noise to ensure exploration.\n", + " * Choose one of [MountainCarContinuous-v0](https://gymnasium.farama.org/environments/classic_control/mountain_car_continuous/) (90+ pts to solve), [LunarLanderContinuous-v2](https://gymnasium.farama.org/environments/box2d/lunar_lander/) (`env = gym.make(\"LunarLander-v2\", continuous=True)`) (200+ pts to solve)\n", + " * 4 points for solving. Slightly less for getting some results below solution threshold. Note that discrete and continuous environments may have slightly different rules, aside from action spaces." 
] - }, - "metadata": {}, - "output_type": "display_data" } - ], - "source": [ - "def visualize_mountain_car(env, agent):\n", - " # Compute policy for all possible x and v (with discretization)\n", - " xs = np.linspace(env.min_position, env.max_position, 100)\n", - " vs = np.linspace(-env.max_speed, env.max_speed, 100)\n", - " \n", - " grid = np.dstack(np.meshgrid(xs, vs[::-1])).transpose(1, 0, 2)\n", - " grid_flat = grid.reshape(len(xs) * len(vs), 2)\n", - " probs = agent.predict_proba(grid_flat).reshape(len(xs), len(vs), 3).transpose(1, 0, 2)\n", - "\n", - " # # The above code is equivalent to the following:\n", - " # probs = np.empty((len(vs), len(xs), 3))\n", - " # for i, v in enumerate(vs[::-1]):\n", - " # for j, x in enumerate(xs):\n", - " # probs[i, j, :] = agent.predict_proba([[x, v]])[0]\n", - "\n", - " # Draw policy\n", - " f, ax = plt.subplots(figsize=(7, 7))\n", - " ax.imshow(probs, extent=(env.min_position, env.max_position, -env.max_speed, env.max_speed), aspect='auto')\n", - " ax.set_title('Learned policy: red=left, green=nothing, blue=right')\n", - " ax.set_xlabel('position (x)')\n", - " ax.set_ylabel('velocity (v)')\n", - " \n", - " # Sample a trajectory and draw it\n", - " states, actions, _ = generate_session(env, agent)\n", - " states = np.array(states)\n", - " ax.plot(states[:, 0], states[:, 1], color='white')\n", - " \n", - " # Draw every 3rd action from the trajectory\n", - " for (x, v), a in zip(states[::3], actions[::3]):\n", - " if a == 0:\n", - " plt.arrow(x, v, -0.1, 0, color='white', head_length=0.02)\n", - " elif a == 2:\n", - " plt.arrow(x, v, 0.1, 0, color='white', head_length=0.02)\n", - "\n", - "with gym.make('MountainCar-v0').env as env:\n", - " visualize_mountain_car(env, agent_mountain_car)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Bonus tasks\n", - "\n", - "* __2.3 bonus__ (2 pts) Try to find a network architecture and training params that solve __both__ environments above (_Points depend on 
implementation. If you attempted this task, please mention it in Anytask submission._)\n", - "\n", - "* __2.4 bonus__ (4 pts) Solve continuous action space task with `MLPRegressor` or similar.\n", - " * Since your agent only predicts the \"expected\" action, you will have to add noise to ensure exploration.\n", - " * Choose one of [MountainCarContinuous-v0](https://gym.openai.com/envs/MountainCarContinuous-v0) (90+ pts to solve), [LunarLanderContinuous-v2](https://gym.openai.com/envs/LunarLanderContinuous-v2) (200+ pts to solve) \n", - " * 4 points for solving. Slightly less for getting some results below solution threshold. Note that discrete and continuous environments may have slightly different rules aside from action spaces.\n", - "\n", - "\n", - "If you're still feeling unchallenged, consider the project (see other notebook in this folder)." - ] + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 1 + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/week01_intro/primer_python_for_ml/recap_ml.ipynb b/week01_intro/primer_python_for_ml/recap_ml.ipynb index b40d81f62..c94760b29 100644 --- a/week01_intro/primer_python_for_ml/recap_ml.ipynb +++ b/week01_intro/primer_python_for_ml/recap_ml.ipynb @@ -121,7 +121,7 @@ "\n", "import sys\n", "if 'google.colab' in sys.modules:\n", - " !wget -q 
https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/week01_intro/primer_python_for_ml/train.csv" + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week01_intro/primer_python_for_ml/train.csv" ] }, { diff --git a/week01_intro/project_starter_evolution_strategies.ipynb b/week01_intro/project_starter_evolution_strategies.ipynb index 68f869cdf..d21475cbf 100644 --- a/week01_intro/project_starter_evolution_strategies.ipynb +++ b/week01_intro/project_starter_evolution_strategies.ipynb @@ -8,7 +8,7 @@ "\n", "![img](https://t4.ftcdn.net/jpg/00/17/46/81/240_F_17468143_wY3hsHyfNYoMdG9BlC56HI4JA7pNu63h.jpg)\n", "\n", - "Remember the idea behind Evolution Strategies? Here's a neat [blog post](https://blog.openai.com/evolution-strategies/) about 'em.\n", + "Remember the idea behind Evolution Strategies? Here's a neat [blog post](https://openai.com/research/evolution-strategies) about 'em.\n", "\n", "Can you reproduce their success? You will have to implement evolutionary strategies and see how they work.\n", "\n", diff --git a/week01_intro/seminar-es-task.ipynb b/week01_intro/seminar-es-task.ipynb new file mode 100644 index 000000000..38d4e3946 --- /dev/null +++ b/week01_intro/seminar-es-task.ipynb @@ -0,0 +1,656 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evolution Strategies\n", + "\n", + "![img](https://t4.ftcdn.net/jpg/00/17/46/81/240_F_17468143_wY3hsHyfNYoMdG9BlC56HI4JA7pNu63h.jpg)\n", + "\n", + "[The paper about the algorithm](https://arxiv.org/abs/1703.03864)\n", + "\n", + "Plan:\n", + "* Study how to use OpenAI gym\n", + "* Basic prototype of evolutionary strategies that works in one thread on CartPole\n", + "* Modify the code to make them work in parallel\n", + "* Use the algorithm to solve the LunarLander\n", + "* Analyse the influence of hyperparameters" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + 
"import sys, os\n", + "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", + "\n", + " !touch .setup_complete\n", + "\n", + "# This code creates a virtual display to draw game images on.\n", + "# It will have no effect if your machine has a monitor.\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " os.environ['DISPLAY'] = ':1'" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "import time\n", + "from copy import deepcopy\n", + "from joblib import Parallel, delayed\n", + "from IPython.display import clear_output\n", + "from IPython import display\n", + " \n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# OpenAI gym environment" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Observation space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)\n", + "Action space: Discrete(2)\n" + ] + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAW4AAAD8CAYAAABXe05zAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAATPklEQVR4nO3dbaxd5Zne8f/lFwyJmWLwgRjbDCbjqIEoY6IjD2qqiiaZgTKjmnxI60il/oDkfCBKoo7U4hmpk3ywMm2GpFLURCIDGqthQq0SBitK22FoUho1xZgMEGzjYMCFg21swnsmONjn7oezHDb2sc/2een2c87/J23tte/1rL3vB+GLxeO1105VIUlqx7xBNyBJOjMGtyQ1xuCWpMYY3JLUGINbkhpjcEtSY2YsuJNcn2RPkr1Jbp2pz5GkuSYzcR13kvnAz4DfBUaAh4FPV9Wuaf8wSZpjZuqMey2wt6qeqapfAXcD62bosyRpTlkwQ++7HHi+5/UI8DunGrx06dK6/PLLZ6gVSWrPvn37eOmllzLevpkK7vE+7F1rMkk2AhsBLrvsMnbs2DFDrUhSe4aHh0+5b6aWSkaAlT2vVwD7ewdU1e1VNVxVw0NDQzPUhiTNPjMV3A8Dq5OsSnIOsB7YNkOfJUlzyowslVTV0SSfBf47MB+4s6p2zsRnSdJcM1Nr3FTV94Hvz9T7S9Jc5TcnJakxBrckNcbglqTGGNyS1BiDW5IaY3BLUmMMbklqjMEtSY0xuCWpMQa3JDXG4JakxhjcktQYg1uSGmNwS1JjDG5JaozBLUmNMbglqTEGtyQ1Zko/XZZkH/AGcAw4WlXDSS4E/jNwObAP+GdV9crU2pQkHTcdZ9z/uKrWVNVw9/pW4IGqWg080L2WJE2TmVgqWQds6ba3ADfOwGdI0pw11eAu4K+TPJJkY1e7pKoOAHTPF0/xMyRJPaa0xg18tKr2J7kYuD/Jk/0e2AX9RoDLLrtsim1I0twxpTPuqtrfPR8C7gXWAi8mWQbQPR86xbG3V9VwVQ0PDQ1NpQ1JmlMmHdxJ3pvk/OPbwO8BTwDbgA3dsA3AfVNtUpL0jqkslVwC3Jvk+Pv8ZVX9tyQPA1uT3Aw8B3xq6m1Kko6bdHBX1TPAb49T/znw8ak0JUk6Nb85KUmNMbglqTEGtyQ1xuCWpMYY3JLUGINbkhpjcEtSYwxuSWqMwS1JjTG4JakxBrckNcbglqTGGNyS1BiDW5IaY3BLUmMMbklqjMEtSY0xuCWpMQa3JDVmwuBOcmeSQ0me6KldmOT+JE91z0t69m1KsjfJniTXzVTjkjRX9XPG/RfA9SfUbgUeqKrVwAPda5JcCawHruqO+UaS+dPWrSRp4uCuqgeBl08orwO2dNtbgBt76ndX1ZGqehbYC6ydpl4lSUx+jfuSqjoA0D1f3NWXA8/3jBvpaidJsjHJjiQ7Dh8+PMk2JGnume6/nMw4tRpvYFXdXlXDVTU8NDQ0zW1I0uw12eB+MckygO75UFcfAVb2jFsB7J98e5KkE002uLcBG7rtDcB9PfX1SRYlWQWsBrZPrUVJUq8FEw1I8h3gWmBpkhHgT4A/BbYmuRl4DvgUQFXtTLIV2AUcBW6pqmMz1LskzUkTBndVffoUuz5+ivGbgc1TaUqSdGp+c1KSGmNwS1JjDG5JaozBLUmNMbglqTEGtyQ1xuCWpMYY3JLUGINbkhpjcEtSYwxuSWqMwS1JjTG4JakxBrckNcbglqTGGNyS1BiDW5IaY3BLUmMmDO4kdyY5lOSJntoXk7yQ5NHucUPPvk1J9ibZk+S6mWpckuaqfs64/wK4fpz616pqTff4PkCSK4H1wFXdMd9IMn+6mpUk9RHcVfUg8HKf77cOuLuqjlTVs8BeYO0U+pMknWAqa9yfTfJ4t5SypKstB57vGTPS1U6SZGOSHUl2HD58eAptSNLcMtng/ibwfmANcAC4ratnnLE13ht
U1e1VNVxVw0NDQ5NsQ5LmnkkFd1W9WFXHqmoU+BbvLIeMACt7hq4A9k+tRUlSr0kFd5JlPS8/CRy/4mQbsD7JoiSrgNXA9qm1KEnqtWCiAUm+A1wLLE0yAvwJcG2SNYwtg+wDPgNQVTuTbAV2AUeBW6rq2My0Lklz04TBXVWfHqd8x2nGbwY2T6UpSdKp+c1JSWqMwS1JjTG4JakxBrckNcbglqTGGNxS561XD3Lk9ZcG3YY0oQkvB5Rmo9Gjb7Pvf27h2JFf/Lr2y1cOcMFvfpiV/+Cfk3ne1FJnL4Nbc1LmzWPBovfwytMPv6v+85/9mGUf+X0WvufvDagzaWIulWhOyrz5nHfRyokHSmchg1uSGmNwS1JjDG5JaozBLUmNMbglqTEGtyQ1xuCWpMYY3JLUGINbkhozYXAnWZnkB0l2J9mZ5PNd/cIk9yd5qnte0nPMpiR7k+xJct1MTkCS5pp+zriPAn9YVR8ErgFuSXIlcCvwQFWtBh7oXtPtWw9cBVwPfCOJd+yRpGkyYXBX1YGq+km3/QawG1gOrAO2dMO2ADd22+uAu6vqSFU9C+wF1k5349JUvffiy1lw3vnvqo0eO8rrI7sG1JHUnzNa405yOXA18BBwSVUdgLFwBy7uhi0Hnu85bKSrnfheG5PsSLLj8OHDZ965NEXnLVnOgkXvfXexRnnz4NODaUjqU9/BnWQxcA/whap6/XRDx6nVSYWq26tquKqGh4aG+m1Dkua8voI7yULGQvuuqvpuV34xybJu/zLgUFcfAXrvl7kC2D897UqS+rmqJMAdwO6q+mrPrm3Ahm57A3BfT319kkVJVgGrge3T17IkzW39/ALOR4GbgJ8mebSr/RHwp8DWJDcDzwGfAqiqnUm2ArsYuyLllqo6Nu2dS9IcNWFwV9WPGH/dGuDjpzhmM7B5Cn1Jkk7Bb05KUmMMbklqjMEtSY0xuCWpMQa3JDXG4JakxhjcmrsCmXfyjSurRqk66S4N0lnD4NYcFi758O+eVH3lmUc48ro3PtPZy+DWnJWE+ee856T66NEjUKMD6Ejqj8EtSY0xuCWpMQa3JDXG4JakxhjcktQYg1uSGmNwS1JjDG5JaozBLUmN6efHglcm+UGS3Ul2Jvl8V/9ikheSPNo9bug5ZlOSvUn2JLluJicgSXNNPz8WfBT4w6r6SZLzgUeS3N/t+1pV/Vnv4CRXAuuBq4BLgb9J8gF/MFiSpseEZ9xVdaCqftJtvwHsBpaf5pB1wN1VdaSqngX2Amuno1lpui047/yT71dS8NarBwfTkNSHM1rjTnI5cDXwUFf6bJLHk9yZZElXWw4833PYCKcPemlgFl9yBecuWXZCtXhpz/8eSD9SP/oO7iSLgXuAL1TV68A3gfcDa4ADwG3Hh45z+Ek3N06yMcmOJDsOH/YWmpLUr76CO8lCxkL7rqr6LkBVvVhVx6pqFPgW7yyHjAArew5fAew/8T2r6vaqGq6q4aGhoanMQZLmlH6uKglwB7C7qr7aU+/9/8tPAk9029uA9UkWJVkFrAa2T1/LkjS39XNVyUeBm4CfJnm0q/0R8OkkaxhbBtkHfAagqnYm2QrsYuyKlFu8okSSps+EwV1VP2L8devvn+aYzcDmKfQlSToFvzkpSY0xuCWpMQa3JDXG4JakxhjcktQYg1uSGmNwS+M4duTvOPartwbdhjQug1tz3kUfuOak2psHn+KXL78wgG6kiRncmvPOW+LNK9UWg1uSGmNwS1JjDG5JaozBLUmN6ee2rlJzXnjhBT73uc8xOjo64dgVFyxkwzVLyAn3wNy0aRMjr7494fHz5s3j61//Opdeeulk25XOiMGtWenNN9/kvvvu49ixiW8F/+ErLuFfXnMjvzp27q9rC+e9xYMPPsjjz7w44fHz58/ny1/+8pT6lc6EwS0Bz/3dB9n1+lrGbj1ffPD8hynuG3Rb0rhc49acd2T0PP7vL/4+x+ocjtVCjtU57Hr9d3jt7aW
Dbk0al8GtOe8XR3+D196+6F21URYwWv7x0Nmpnx8LPjfJ9iSPJdmZ5Etd/cIk9yd5qnte0nPMpiR7k+xJct1MTkCaqt9Y+DIXLTr4rtrCvMWCeRP/xaQ0CP2cUhwBPlZVvw2sAa5Pcg1wK/BAVa0GHuhek+RKYD1wFXA98I0k82eieWk6LMjb/Nbix1i84BWOvnWQl156liVH/opFdWjQrUnj6ufHggt4s3u5sHsUsA64tqtvAX4I/JuufndVHQGeTbIXWAv8eDobl6bLyOHX+fP/chfFXex57uc8+dxLhGK0atCtSePq66qS7oz5EeC3gP9YVQ8luaSqDgBU1YEkF3fDlwP/p+fwka52SgcPHuQrX/nKGTcvncrhw4f7uoYb4OU3fsm9/2v3u2pnEtmjo6PccccdLF3qX2Zq+hw8ePCU+/oK7qo6BqxJcgFwb5IPnWZ4xqmd9OcgyUZgI8Dy5cu56aab+mlF6svTTz/NbbfdRv1/OGueN28e69at44orrpjxz9Lc8e1vf/uU+87oOu6qejXJDxlbu34xybLubHsZcHxBcARY2XPYCmD/OO91O3A7wPDwcL3vfe87k1ak03rttdfIiV+FnEFLly7Ff4c1nRYuXHjKff1cVTLUnWmT5DzgE8CTwDZgQzdsA/z62wrbgPVJFiVZBawGtk+6e0nSu/Rzxr0M2NKtc88DtlbV95L8GNia5GbgOeBTAFW1M8lWYBdwFLilW2qRJE2Dfq4qeRy4epz6z4GPn+KYzcDmKXcnSTqJXw2TpMYY3JLUGO8OqFlp8eLFrFu3ru9ruadi3rx5LF68eMY/RzrO4NastHz5cu65555BtyHNCJdKJKkxBrckNcbglqTGGNyS1BiDW5IaY3BLUmMMbklqjMEtSY0xuCWpMQa3JDXG4JakxhjcktQYg1uSGmNwS1Jj+vmx4HOTbE/yWJKdSb7U1b+Y5IUkj3aPG3qO2ZRkb5I9Sa6byQlI0lzTz/24jwAfq6o3kywEfpTkv3b7vlZVf9Y7OMmVwHrgKuBS4G+SfMAfDJak6THhGXeNebN7ubB71GkOWQfcXVVHqupZYC+wdsqdSpKAPte4k8xP8ihwCLi/qh7qdn02yeNJ7kyypKstB57vOXykq0mSpkFfwV1Vx6pqDbACWJvkQ8A3gfcDa4ADwG3d8Iz3FicWkmxMsiPJjsOHD0+qeUmai87oqpKqehX4IXB9Vb3YBfoo8C3eWQ4ZAVb2HLYC2D/Oe91eVcNVNTw0NDSp5iVpLurnqpKhJBd02+cBnwCeTLKsZ9gngSe67W3A+iSLkqwCVgPbp7dtSZq7+rmqZBmwJcl8xoJ+a1V9L8l/SrKGsWWQfcBnAKpqZ5KtwC7gKHCLV5RI0vSZMLir6nHg6nHqN53mmM3A5qm1Jkkaj9+clKTGGNyS1BiDW5IaY3BLUmMMbklqjMEtSY0xuCWpMQa3JDXG4JakxhjcktQYg1uSGmNwS1JjDG5JaozBLUmNMbglqTEGtyQ1xuCWpMYY3JLUGINbkhpjcEtSYwxuSWqMwS1JjUlVDboHkhwGfgG8NOheZsBSnFdrZuvcnFdbfrOqhsbbcVYEN0CSHVU1POg+ppvzas9snZvzmj1cKpGkxhjcktSYsym4bx90AzPEebVnts7Nec0SZ80atySpP2fTGbckqQ8DD+4k1yfZk2RvklsH3c+ZSnJnkkNJnuipXZjk/iRPdc9LevZt6ua6J8l1g+l6YklWJvlBkt1Jdib5fFdvem5Jzk2yPclj3by+1NWbntdxSeYn+dsk3+tez5Z57Uvy0ySPJtnR1WbF3Calqgb2AOYDTwNXAOcAjwFXDrKnSczhHwEfAZ7oqf174NZu+1bg33XbV3ZzXASs6uY+f9BzOMW8lgEf6bbPB37W9d/03IAAi7vthcBDwDWtz6tnfv8K+Evge7Pl38Wu333A0hNqs2Juk3kM+ox7LbC3qp6pql8BdwPrBtzTGamqB4GXTyivA7Z
021uAG3vqd1fVkap6FtjL2D+Ds05VHaiqn3TbbwC7geU0Prca82b3cmH3KBqfF0CSFcDvA3/eU25+Xqcxm+d2WoMO7uXA8z2vR7pa6y6pqgMwFoDAxV29yfkmuRy4mrGz0+bn1i0nPAocAu6vqlkxL+A/AP8aGO2pzYZ5wdh/XP86ySNJNna12TK3M7ZgwJ+fcWqz+TKX5uabZDFwD/CFqno9GW8KY0PHqZ2Vc6uqY8CaJBcA9yb50GmGNzGvJH8AHKqqR5Jc288h49TOunn1+GhV7U9yMXB/kidPM7a1uZ2xQZ9xjwAre16vAPYPqJfp9GKSZQDd86Gu3tR8kyxkLLTvqqrvduVZMTeAqnoV+CFwPe3P66PAP02yj7Elx48l+TbtzwuAqtrfPR8C7mVs6WNWzG0yBh3cDwOrk6xKcg6wHtg24J6mwzZgQ7e9Abivp74+yaIkq4DVwPYB9DehjJ1a3wHsrqqv9uxqem5JhrozbZKcB3wCeJLG51VVm6pqRVVdztifo/9RVf+CxucFkOS9Sc4/vg38HvAEs2Bukzbovx0FbmDsioWngT8edD+T6P87wAHgbcb+S38zcBHwAPBU93xhz/g/7ua6B/gng+7/NPP6h4z97+XjwKPd44bW5wZ8GPjbbl5PAP+2qzc9rxPmeC3vXFXS/LwYu+rsse6x83hOzIa5TfbhNyclqTGDXiqRJJ0hg1uSGmNwS1JjDG5JaozBLUmNMbglqTEGtyQ1xuCWpMb8P/DjrTvnckq1AAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import gym\n", + "\n", + "tmp_env = gym.make(\"CartPole-v0\")\n", + "tmp_env.reset()\n", + "\n", + "plt.imshow(tmp_env.render('rgb_array'))\n", + "print(\"Observation space:\", tmp_env.observation_space)\n", + "print(\"Action space:\", tmp_env.action_space)\n", + "state_size = tmp_env.observation_space.shape[0]\n", + "n_actions = tmp_env.action_space.n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### How to interact with the environment" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial observation: [-0.0441675 0.0355158 -0.03658269 -0.02903847]\n", + "At each step it returns:\n", + "new observation code: [-0.04345718 -0.15906295 -0.03716346 0.25188148]\n", + "reward: 1.0\n", + "is game over?: False\n" + ] + } + ], + "source": [ + "print(\"Initial observation:\", tmp_env.reset())\n", + "\n", + "new_s, reward, done, _ = tmp_env.step(0)\n", + "print(\"At each step it returns:\")\n", + "\n", + "print(\"new observation code:\", new_s)\n", + "print(\"reward:\", reward)\n", + "print(\"is game over?:\", done)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Display an episode" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def display_session(env, agent, t_max=500):\n", + " total_reward = 0\n", + " plt.figure(figsize=(4, 3))\n", + " display.clear_output(wait=True)\n", + "\n", + " s = env.reset()\n", + " \n", + " for t in range(t_max):\n", + " plt.gca().clear()\n", + " \n", + " a = agent.get_action(torch.tensor(s).float())\n", + " new_s, r, done, info = env.step(a)\n", + " s = new_s\n", + " total_reward += r\n", + " # Draw game image on display.\n", + " plt.imshow(env.render('rgb_array'))\n", + "\n", + " 
display.display(plt.gcf())\n", + " display.clear_output(wait=True)\n", + " \n", + " if done:\n", + " break\n", + " \n", + " return total_reward" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Random policy" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "class RandomPolicy:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total reward: 16.0\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAQcAAAC4CAYAAADuQCmzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAOdElEQVR4nO3db2xV933H8ffHfzE2ARyMYzBJSOIsI9WWNARFyrRFTbvQbip5gkSlVTyIhCYlU6tNmsgqbaoipGwPqj1ZHqA1KtLaIqQ2Coo6tYy1a0mbEEhCgBCIU0JwTTDhT8xf/7vfPfCJuYGf7Yvx9blcPi/p6p7f7557zwdsPpxz7j9FBGZmV6rJO4CZVSaXg5kluRzMLMnlYGZJLgczS3I5mFlS2cpB0kpJByV1S1pfru2YWXmoHK9zkFQLHAK+AvQAbwDfiIh3p31jZlYW5dpzWAF0R8TvI2IQ2AysKtO2zKwMylUOi4GjReOebM7MbhB1ZXpcJeY+d/wiaR2wDqC5ufmh++67r0xRzGwiu3fv/iQi2q6cL1c59ABLisadQG/xChGxEdgIsHz58ti1a1eZopjZRCQdSc2X67DiDaBL0lJJDcAaYGuZtmVmZVCWPYeIGJb0DPBzoBZ4MSL2l2NbZlYe5TqsICJ+BvysXI9vZuXlV0iaWZLLwcySXA5mluRyMLMkl4OZJbkczCzJ5WBmSS4HM0tyOZhZksvBzJJcDmaW5HIwsySXg5kluRzMLMnlYGZJLgczS3I5mFmSy8HMklwOZpbkcjCzJJeDmSW5HMwsyeVgZkkuBzNLcjmYWZLLwcySXA5mluRyMLOkSctB0ouS+iTtK5prlbRN0vvZ9fyi256V1C3poKQnyhXczMqrlD2HHwArr5hbD2yPiC5gezZG0jJgDXB/dp8XJNVOW1ozmzGTlkNE/Bo4dcX0KmBTtrwJeLJofnNEDETEYaAbWDFNWc1sBk31nEN7RBwDyK4XZvOLgaNF6/Vkc2Z2g5nuE5JKzEVyRWmdpF2Sdp04cWKaY5jZ9ZpqORyX1AGQXfdl8z3AkqL1OoHe1ANExMaIWB4Ry9va2qYYw8zKZarlsBVYmy2vBV4uml8jqVHSUqAL2Hl9Ec0sD3WTrSDpx8BjwAJJPcC/AM8DWyQ9BXwErAaIiP2StgDvAsPA0xExUqbsZlZGk5ZDRHxjnJseH2f9DcCG6wllZvnzKyTNLMnlYGZJLgczS3I5mFmSy8HMklwOZpbkcjCzJJeDmSW5HMwsyeVgZkkuBzNLcjmYWZLLwcySXA5mluRyMLMkl4OZJbkczCzJ5WBmSS4HM0tyOZhZksvBzJJcDmaW5HKw6xaFAhHJbz20G9ik31thNpEoFOjd9
TKqqaPp1sXMXnAHDc3zUE1t3tHsOrkc7LqMDF7gk4O/ZfhiP6qppbn9brq++ncuhyrgwwq7LhdP/YGRwYsARGGE+tlzUW19zqlsOrgcbMoigoune4mRobG5ubd/IcdENp1cDnZdPj3yzthybUMTzW13IinHRDZdXA42ZcMX+7l05uOxcf3sudTPnptjIptOk5aDpCWSfinpgKT9kr6VzbdK2ibp/ex6ftF9npXULemgpCfK+Qew/AyeP8PQhf6xcXP7XdTUz8oxkU2nUvYchoF/iIg/Bh4Bnpa0DFgPbI+ILmB7Nia7bQ1wP7ASeEGST11XmYjg0yN7iMLw2Nwtnct8SFFFJi2HiDgWEW9my2eBA8BiYBWwKVttE/BktrwK2BwRAxFxGOgGVkx3cMtXFEY4U3S+oaaugVlzb8sxkU23azrnIOlO4EHgdaA9Io7BaIEAC7PVFgNHi+7Wk81ZFRk8d4qh86fHxg0trcya73KoJiWXg6QW4CfAtyOif6JVE3NXvbZW0jpJuyTtOnHiRKkxrEIM9J9g+NK5sfGcRff5hU9VpqRykFTPaDH8MCJ+mk0fl9SR3d4B9GXzPcCSort3Ar1XPmZEbIyI5RGxvK2tbar5LSefHtlTNBJNty5G8pNf1aSUZysEfB84EBHfK7ppK7A2W14LvFw0v0ZSo6SlQBewc/oiW95Ghga4cPLykWNtYxO3dC7LMZGVQynvrXgU+CawV9Lb2dw/Ac8DWyQ9BXwErAaIiP2StgDvMvpMx9MRMTLtyS03I4MXGPi0b2zc1NpJQ0trjomsHCYth4jYQfo8AsDj49xnA7DhOnJZBTv3cfcV5xvuZfxfEbtR+SDRrklEcP744csTqqGptdOvb6hCLge7NlHgwicfjQ3rGmfT0n53joGsXFwOdk0Gz53iwsnL5TB7we3UNjblmMjKxeVg1+T8iSMUhi+/RXvW/EXU+PMbqpLLwUoWEVw8eRSiMDbnz2+oXi4HK1kURvj06L6xcf3secya15FjIisnl4OVbPDcSQbPFb2fYk4rdbNackxk5eRysJIN9J9gZOD82Njvp6huLgcrSURw5vBbY2PV1DJn0R/59Q1VzOVgJRkZvEh/z4GxcW1DE423+A1z1czlYCUZ6O9jeODyS6Znzevw+ymqnMvBJhURXDzVS2FoYGzO78Ksfi4HK0nx5zeoppam1kU+31DlXA42qZHBC1d9BH3LbffkmMhmgsvBJjV0oZ+BsyfHxs3td1Pb2JxjIpsJLgebVH/P/s995d2cRff6kOIm4HKwCUWhwPm+D8fGqqmjab4/TPxm4HKwCRVGhkbfbJVpaJlP062dOSaymeJysAldOn2Mgf7LXx3Q3H6336J9k3A52ITO9x0mCpc/H7ipdTGq8a/NzcA/ZRtXRIELnxwZG6umlrlL7s8xkc0kl4ONa2TwEmd7D46NG29Z6JdM30RcDjauS2c+/txH0DfOXUhN/awcE9lMcjlYUkRw6czHFIYHx+Z8SHFzcTnYOIIzh98cG9XUNdK88C6/+OkmUsrX4VkVKhQKDA0NjXv70IUznDv+wdi4prEZGpoZGBi4at26ujpqa/2JUNXG5XCT2rFjB88888y4tz/U1c7ffmUptdmewqtvHeKbz/8FJPYcnnvuOVatWlW2rJYPl8NNqr+/n7179457+z0Lb2P36ZWAWNq8j5+/9n/s3Xcwue7p06eT83Zjm/Scg6RZknZK2iNpv6TvZvOtkrZJej+7nl90n2cldUs6KOmJcv4BbPrV1NTQcdfX6Ru4g76B23nz5Aq6e8/mHctmWCknJAeAL0XEnwIPACslPQKsB7ZHRBewPRsjaRmwBrgfWAm8IMkHpDeQBbfMpvPWWUAAQc/xUxw6enKyu1mVmbQcYtRnT3bXZ5cAVgGbsvlNwJPZ8ipgc0QMRMRhoBtYMa2prazOXxrgNzt+wB8O/w8tI3s42/MjhoevPhFp1a2kcw7Z//y7gXuA/4iI1yW1R
8QxgIg4Jmlhtvpi4LWiu/dkc+M6d+4cr7766jWHt6k7cODAuLedvzTEf+94lfrXfkdLUwMtTQ0TPlZ3d7d/flWopHKIiBHgAUnzgJckTfQFiaknwuOqlaR1wDqARYsW0dbmjzmfSXPnzp10naHhAqfPXuL02UsTrjdnzhz//KrQNT1bERFnJP2K0XMJxyV1ZHsNHUBftloPsKTobp1Ab+KxNgIbAZYvXx733nvvFOLbVB06dGjaHqu9vR3//KpPKc9WtGV7DEhqAr4MvAdsBdZmq60FXs6WtwJrJDVKWgp0ATunO7iZlVcpew4dwKbsvEMNsCUiXpH0O2CLpKeAj4DVABGxX9IW4F1gGHg6OywxsxvIpOUQEe8ADybmTwKPj3OfDcCG605nZrnxKyRvUg899BBbtmyZlsd6+OGHp+VxrLK4HG5SHR0drF69Ou8YVsH8lm0zS3I5mFmSy8HMklwOZpbkcjCzJJeDmSW5HMwsyeVgZkkuBzNLcjmYWZLLwcySXA5mluRyMLMkl4OZJbkczCzJ5WBmSS4HM0tyOZhZksvBzJJcDmaW5HIwsySXg5kluRzMLMnlYGZJLgczS3I5mFmSy8HMklwOZpbkcjCzJJeDmSUpIvLOgKQTwHngk7yzXGEBzlQKZypdJea6IyLarpysiHIAkLQrIpbnnaOYM5XGmUpXqblSfFhhZkkuBzNLqqRy2Jh3gARnKo0zla5Sc12lYs45mFllqaQ9BzOrILmXg6SVkg5K6pa0fga3+6KkPkn7iuZaJW2T9H52Pb/otmezjAclPVGmTEsk/VLSAUn7JX0r71ySZknaKWlPlum7eWcq2k6tpLckvVJBmT6UtFfS25J2VUquKYmI3C5ALfABcBfQAOwBls3Qtv8c+CKwr2ju34D12fJ64F+z5WVZtkZgaZa5tgyZOoAvZstzgEPZtnPLBQhoyZbrgdeBR/L+u8q29ffAj4BXKuHnl23rQ2DBFXO555rKJe89hxVAd0T8PiIGgc3AqpnYcET8Gjh1xfQqYFO2vAl4smh+c0QMRMRhoJvR7NOd6VhEvJktnwUOAIvzzBWjzmXD+uwSeWYCkNQJ/BXwn0XTuWaaQKXmmlDe5bAYOFo07snm8tIeEcdg9B8qsDCbn/Gcku4EHmT0f+pcc2W7728DfcC2iMg9E/DvwD8ChaK5vDPBaHH+QtJuSesqKNc1q8t5+0rMVeLTJzOaU1IL8BPg2xHRL6U2P3O5ImIEeEDSPOAlSV+YYPWyZ5L010BfROyW9Fgpdyl3piKPRkSvpIXANknvVUiua5b3nkMPsKRo3An05pQF4LikDoDsui+bn7GckuoZLYYfRsRPKyUXQEScAX4FrMw506PA1yV9yOih6Jck/VfOmQCIiN7sug94idHDhNxzTUXe5fAG0CVpqaQGYA2wNcc8W4G12fJa4OWi+TWSGiUtBbqAndO9cY3uInwfOBAR36uEXJLasj0GJDUBXwbeyzNTRDwbEZ0RcSejvzP/GxF/k2cmAEnNkuZ8tgz8JbAv71xTlvcZUeBrjJ6V/wD4zgxu98fAMWCI0QZ/CrgV2A68n123Fq3/nSzjQeCrZcr0Z4zuVr4DvJ1dvpZnLuBPgLeyTPuAf87mc/27KtrWY1x+tiLvn99djD77sAfY/9nvc965pnrxKyTNLCnvwwozq1AuBzNLcjmYWZLLwcySXA5mluRyMLMkl4OZJbkczCzp/wEFfB0PAfIN5wAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "total_reward = display_session(tmp_env, RandomPolicy(n_actions))\n", + "print('Total reward: ', total_reward)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# The challenge of the day" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Algorithm](https://i.ibb.co/zFm6BrB/Screenshot-from-2021-09-17-13-38-00.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_session(env, agent, t_max=500):\n", + " total_reward = 0\n", + " s = env.reset()\n", + " \n", + " for t in range(t_max):\n", + " a = agent.get_action(torch.tensor(s).float())\n", + " new_s, r, done, info = env.step(a)\n", + " total_reward += r\n", + " s = new_s\n", + " \n", + " if done:\n", + " break\n", + " \n", + " return total_reward\n", + "\n", + "\n", + "def score(env, agent, n=10, t_max=500):\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Modification of the model using a noise " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def add_noise_to_model(model, noise, copy=False):\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Controller of the evolution\n", + "\n", + "#### Task 1: Implement the algorithm" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "class EvolutionManager:\n", + " def __init__(self, get_env_function, lr=0.001, std=0.01, n_samples = 64, normalize=True):\n", + " super().__init__()\n", + " \n", + " self.lr = lr\n", + " self.std = std\n", + " self.normalize = normalize\n", + " self.n_samples = n_samples\n", + " 
self.mean_reward_history = []\n", + " \n", + " self.env = get_env_function()\n", + " \n", + " def get_noised_model(self, model):\n", + " pass\n", + "\n", + " def optimize(self, model, noises, rewards):\n", + " pass\n", + " \n", + " def step(self, model):\n", + " pass\n", + " \n", + " def update_log(self, rewards):\n", + " mean_reward = np.mean(rewards)\n", + " self.mean_reward_history.append(mean_reward)\n", + "\n", + " clear_output(True)\n", + " print(\"last mean reward = %.3f\" % mean_reward)\n", + " plt.figure(figsize=[8, 4])\n", + " plt.subplot(1, 2, 1)\n", + " plt.plot(self.mean_reward_history, label='Mean rewards')\n", + " plt.legend()\n", + " plt.grid()\n", + "\n", + " plt.subplot(1, 2, 2)\n", + " plt.hist(rewards)\n", + " plt.grid()\n", + "\n", + " plt.show()\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Task 2: Implement a neural network-based policy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Neural policy" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "class MLPPolicy(nn.Module):\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Environment 1: CartPole-v0" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "def get_env_function():\n", + " env = gym.make('CartPole-v0').env\n", + " return env\n", + "tmp_env = get_env_function()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "n_states = tmp_env.observation_space.shape[0]\n", + "n_actions = tmp_env.action_space.n\n", + "model = <>" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "last mean reward = 493.230\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3.485602855682373\n" + ] + } + ], + "source": [ + "algorithm = <>\n", + "\n", + "for i in range(15):\n", + " t = time.time()\n", + " algorithm.step(model)\n", + " print(time.time() - t)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "500.0" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAQcAAAC4CAYAAADuQCmzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAL/ElEQVR4nO3df6zddX3H8eerpRYCKCCXpr+m16yEgZmgDZpJFoI6OmdW/iEp2Zb+QdJ/WKLZkq3MZcNlJG5/mP0zljTT2GRO0kRNG2OyNVXixhxQFGxLqVyFwU0bWgSlKLbc2/f+uF/0WD7tPZR77vc0eT6Sk/P9fr7f7/m+7u3p63y/33PuvakqJOl0S/oOIGk8WQ6SmiwHSU2Wg6Qmy0FSk+UgqWlk5ZBkQ5JDSaaSbB3VfiSNRkbxOYckS4EfAB8DpoFHgDuq6okF35mkkRjVkcONwFRV/aiqTgL3AxtHtC9JIzCqclgNPDcwP92NSTpPXDCix01j7NfOX5JsAbYAXHzxxR+45pprRhRF0tk8+uijL1TVxOnjoyqHaWDtwPwa4PDgClW1DdgGsH79+tq7d++Iokg6myT/1xof1WnFI8C6JJNJ3gZsAnaNaF+SRmAkRw5VNZPkT4H/AJYCX6iqA6PYl6TRGNVpBVX1DeAbo3p8SaPlJyQlNVkOkposB0lNloOkJstBUpPlIKnJcpDUZDlIarIcJDVZDpKaLAdJTZaDpCbLQVKT5SCpyXKQ1GQ5SGqyHCQ1WQ6SmiwHSU2Wg6Qmy0FSk+UgqclykNRkOUhqshwkNVkOkposB0lNloOkpnnLIckXkhxNsn9g7Ioku5M81d1fPrDs7iRTSQ4luXVUwSWN1jBHDl8ENpw2thXYU1XrgD3dPEmuBTYB13Xb3Jdk6YKllbRo5i2Hqvo28OJpwxuB7d30duC2gfH7q+pEVT0NTAE3LlBWSYvoXK85rKiqIwDd/VXd+GrguYH1prsxSeeZhb4gmcZYNVdMtiTZm2TvsWPHFjiGpLfqXMvh+SQrAbr7o934NLB2YL01wOHWA1TVtqpaX1XrJyYmzjGGpFE513LYBWzupjcDOwfGNyVZnmQSWAc8/NYiSurDBfOtkOTLwM3AlUmmgb8FPgvsSHIn8CxwO0BVHUiyA3gCmAHuqqrZEWWXNELzlkNV3XGGRR85w/r3Ave+lVCS+ucnJCU1WQ6SmiwHSU2Wg6Qmy0FSk+UgqclykNRkOUhqshwkNVkOkposB0lNloOkJstBUpPlIKnJcpDUZDlIarIcJDVZDpKaLAdJTZaDpCbLQVKT5aBFdWp2htmTr3JqdqbvKJrHvL+aXloIr754mBee/C9OvHyMn/94mlUf+ARXXnN
T37F0FpaDFsWJ4y9wdP83fzk/+9ovekyjYXhaIanJcpDUZDlIarIcJDVZDpKa5i2HJGuTfCvJwSQHknyyG78iye4kT3X3lw9sc3eSqSSHktw6yi9A0mgMc+QwA/x5Vf0W8CHgriTXAluBPVW1DtjTzdMt2wRcB2wA7kuydBThJY3OvOVQVUeq6rvd9HHgILAa2Ahs71bbDtzWTW8E7q+qE1X1NDAF3LjQwSWN1pu65pDk3cANwEPAiqo6AnMFAlzVrbYaeG5gs+luTNJ5ZOhySHIJ8BXgU1X18tlWbYxV4/G2JNmbZO+xY8eGjSFpkQxVDkmWMVcMX6qqr3bDzydZ2S1fCRztxqeBtQObrwEOn/6YVbWtqtZX1fqJiYlzzS9pRIZ5tyLA54GDVfW5gUW7gM3d9GZg58D4piTLk0wC64CHFy6ypMUwzA9efRj4E2Bfkse6sb8CPgvsSHIn8CxwO0BVHUiyA3iCuXc67qqq2QVPLmmk5i2Hqvpv2tcRAD5yhm3uBe59C7kk9cxPSEpqshwkNVkOkposB0lNloOkJstBiyJLljL4plfNzlD1hg/OaoxYDloUF0+8i2UXX/bL+Z8+uw8sh7FmOWhRZOkFJL96up2aOUnjR240RiwHSU2Wg6Qmy0FSk+UgqclykNRkOUhqshwkNVkOkposB0lNloOkJstBUpPlIKnJcpDUZDlIarIcJDVZDpKaLAdJTcP8OTxpKLOzs8zMzDSXnZo5+Wu/M/JUFSdPnoS0X5+WLVvGkiW+dvXJctCC2blzJ/fcc09z2fJlS/j7O97HOy+9EIAnDx7kj/7ug5w6w2+Ku++++7jppptGlFTDsBy0YF566SX27dvXXHbR8gs4/MrHmHrtdwjF7E93s2//fk6doR2OHz8+yqgawrzHbUkuTPJwkseTHEjymW78iiS7kzzV3V8+sM3dSaaSHEpy6yi/AJ0fTtVSnjp+A8dO/AZHT7yLJ17+IFVn+vvMGgfDnNSdAG6pqvcB1wMbknwI2Arsqap1wJ5uniTXApuA64ANwH1Jlo4ivM4vSzLD3G+cLpbkVN9xNI95Tytq7irSK93ssu5WwEbg5m58O/AA8Jfd+P1VdQJ4OskUcCPwnYUMrvNLmGVF/oeXX32Vx6aOcMHPHwQsiHE21DWH7pX/UeA3gX+uqoeSrKiqIwBVdSTJVd3qq4H/Hdh8uhs7o1deeYUHH3zwTYfXeJmamjrjsl+cnOGv/+WLwBd5+Wcn5n2sgwcP8va3v33hwulNG6ocqmoWuD7JZcDXkrz3LKu3TiTfcNUpyRZgC8CqVauYmJgYJorG2KWXXnrW5cOUwuve8Y53+Jzo2Zt6t6KqfpLkAeauJTyfZGV31LASONqtNg2sHdhsDXC48VjbgG0A69evr6uvvvoc4mucrFixYsEea9WqVfic6Ncw71ZMdEcMJLkI+CjwJLAL2NytthnY2U3vAjYlWZ5kElgHPLzQwSWN1jBHDiuB7d11hyXAjqr6epLvADuS3Ak8C9wOUFUHkuwAngBmgLu60xJJ55Fh3q34PnBDY/zHwEfOsM29wL1vOZ2k3vgJSS2YW265hR07dizIY91wwxtej7TILActmMnJSSYnJ/uOoQXij71JarIcJDVZDpKaLAdJTZaDpCbLQVKT5SCpyXKQ1GQ5SGqyHCQ1WQ6SmiwHSU2Wg6Qmy0FSk+UgqclykNRkOUhqshwkNVkOkposB0lNloOkJstBUpPlIKnJcpDUZDlIarIcJDVZDpKaLAdJTZaDpCbLQVJTqqrvDCQ5BvwMeKHvLKe5EjMNw0zDG8dc76qqidMHx6IcAJLsrar1fecYZKbhmGl445qrxdMKSU2Wg6SmcSqHbX0HaDDTcMw0vHHN9QZjc81B0ngZpyMHSWOk93JIsiHJoSRTSbYu4n6/kORokv0DY1ck2Z3kqe7+8oFld3cZDyW5dUSZ1ib5VpKDSQ4k+WTfuZJcmOThJI93mT7Td6aB/SxN8r0
kXx+jTM8k2ZfksSR7xyXXOamq3m7AUuCHwHuAtwGPA9cu0r5/F3g/sH9g7B+Brd30VuAfuulru2zLgcku89IRZFoJvL+bvhT4Qbfv3nIBAS7pppcBDwEf6vt71e3rz4B/B74+Dv9+3b6eAa48baz3XOdy6/vI4UZgqqp+VFUngfuBjYux46r6NvDiacMbge3d9HbgtoHx+6vqRFU9DUwxl32hMx2pqu9208eBg8DqPnPVnFe62WXdrfrMBJBkDfAHwL8ODPea6SzGNddZ9V0Oq4HnBuanu7G+rKiqIzD3HxW4qhtf9JxJ3g3cwNwrda+5usP3x4CjwO6q6j0T8E/AXwCnBsb6zgRzxfmfSR5NsmWMcr1pF/S8/zTGxvHtk0XNmeQS4CvAp6rq5aS1+8XLVVWzwPVJLgO+luS9Z1l95JmSfAI4WlWPJrl5mE1GnWnAh6vqcJKrgN1JnhyTXG9a30cO08Dagfk1wOGesgA8n2QlQHd/tBtftJxJljFXDF+qqq+OSy6AqvoJ8ACwoedMHwb+MMkzzJ2K3pLk33rOBEBVHe7ujwJfY+40ofdc56LvcngEWJdkMsnbgE3Arh7z7AI2d9ObgZ0D45uSLE8yCawDHl7onWfuEOHzwMGq+tw45Eoy0R0xkOQi4KPAk31mqqq7q2pNVb2buefMN6vqj/vMBJDk4iSXvj4N/B6wv+9c56zvK6LAx5m7Kv9D4NOLuN8vA0eA15hr8DuBdwJ7gKe6+ysG1v90l/EQ8PsjynQTc4eV3wce624f7zMX8NvA97pM+4G/6cZ7/V4N7OtmfvVuRd//fu9h7t2Hx4EDrz+f+851rjc/ISmpqe/TCkljynKQ1GQ5SGqyHCQ1WQ6SmiwHSU2Wg6Qmy0FS0/8D5VFBcRRcnEoAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "display_session(tmp_env, model, t_max=500)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Task 3: Implement and compare the parallelized version of the algorithm" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "model = <>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "last mean reward = 353.210\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "26.267060041427612\n" + ] + } + ], + "source": [ + "algorithm = <>\n", + "\n", + "for i in range(15):\n", + " t = time.time()\n", + " algorithm.step(model)\n", + " print(time.time() - t)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Environment 2: LunarLanderContinuous-v2" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "last mean reward = 155.350\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "def get_env_function():\n", + " env = gym.make('LunarLanderContinuous-v2').env\n", + " return env\n", + "tmp_env = get_env_function()\n", + "n_states = tmp_env.observation_space.shape[0]\n", + "n_actions = tmp_env.action_space.shape[0]\n", + "model = <>\n", + "algorithm = <>\n", + "for i in range(1000):\n", + " algorithm.step(model)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "297.1928817226966" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAQcAAAC4CAYAAADuQCmzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAUn0lEQVR4nO3dfXBU9b3H8fd3d/MEGEggPINgCTWIDxAaldg7FB/rs9diuaM1ndFiraDUe+dWLzNt7x+teqfj3OlMa3G06oy2XLR2jE4vVaFowVYKVRBMQCBQYtKEFJQEDCHs9/6xB++KB1hiNmeDn9fMmT3723PO78uGfPZ3HvbE3B0RkaPFoi5ARHKTwkFEQikcRCSUwkFEQikcRCSUwkFEQmUtHMzsCjPbbGZbzey+bPUjItlh2bjOwcziwBbgUqAR+AvwL+7+bq93JiJZka2RQxWw1d23u3sXsAS4Lkt9iUgWZCscxgC70p43Bm0i0k8ksrRdC2n7xP6Lmc0D5gVPK7NUh4icWJu7lx3dmK1waATGpT0fCzSlL+DujwKPApiZvuAhEp2dYY3Z2q34C1BuZhPNLB+YC9RmqS8RyYKsjBzcvdvM5gO/B+LAL919Uzb6EpHsyMqpzJMuQrsVIlFa5+4zjm7UFZIiEkrhICKhFA4iEkrhICKhFA4iEkrhICKhFA4iEkrhICKhFA4iEkrhICKhFA4iEkrhICKhFA4iEkrhICKhFA4iEkrhICKhFA4iEkrhICKhFA4iEkrhICKhFA4iEkrhICKhFA4iEkrhICKhFA4iEkrhICKhFA4iEkrhICKhThgOZvZLM2s1s41pbaVm9oqZvRc8lqS9dr+ZbTWzzWZ2ebYKF5HsymTk8CRwxVFt9wHL3b0cWB48x8ymAHOBs4J1fm5m8V6rVkT6zAnDwd1fB/Yc1Xwd8FQw/xRwfVr7Enc/6O4NwFagqpdqFZE+1NNjDiPcvRkgeBwetI8BdqUt1xi0iUg/k+jl7VlIm4cuaDYPmNfL/YtIL+npyKHFzEYBBI+tQXsjMC5tubFAU9gG3P1Rd5/h7jN6WIOIZFFPw6EWqAnma4AX0trnmlmBmU0EyoE1n61EEYnCCXcrzOzXwCxgmJk1Aj8AHgSWmtltwN+AOQDuvsnMlgLvAt3AXe5+OEu1i0gWmXvoIYG+LcIs+iJEPr/Whe3e6wpJEQmlcBCRUAoHEQmlcBCRUAoHEQmlcBCRUAoHEQmlcBCRUAoHEQmlcBC
RUAoHEQmlcBCRUAoHEQmlcBCRUAoHEQmlcBCRUAoHEQmlcBCRUAoHEQmlcBCRUAoHEQmlcBCRUAoHEQmlcBCRUAoHEQmlcBCRUAoHEQmlcBCRUAoHEQl1wnAws3Fm9gczqzOzTWZ2T9BeamavmNl7wWNJ2jr3m9lWM9tsZpdn8x8gItmRycihG/hXd68ALgDuMrMpwH3AcncvB5YHzwlemwucBVwB/NzM4tkoXkSy54Th4O7N7v7XYL4dqAPGANcBTwWLPQVcH8xfByxx94Pu3gBsBap6u3ARya6TOuZgZhOAacCbwAh3b4ZUgADDg8XGALvSVmsM2kSkH0lkuqCZDQJ+Ayx0931mdsxFQ9o8ZHvzgHmZ9i8ifSujkYOZ5ZEKhmfc/fmgucXMRgWvjwJag/ZGYFza6mOBpqO36e6PuvsMd5/R0+JFJHsyOVthwONAnbs/nPZSLVATzNcAL6S1zzWzAjObCJQDa3qvZBHpC5nsVlQD3wDeMbO3g7b/AB4ElprZbcDfgDkA7r7JzJYC75I603GXux/u9cpFJKvM/VOHA/q+CLPoixD5/FoXtnuvKyRFJJTCQURCKRxEJJTCQURCKRxEJJTCQURCKRxEJJTCQURC5UQ4HOdLXCISkZwIh/Hjx1NcXBx1GSKSJifCYdiwYaxYsYJzzz036lJEJJAT4QAwffp0li5dSlVVFfG47ionErWcCQczY/LkySxbtowFCxYoIHpZfn6cWM78tDOTnx9Dh6Oik/GdoPpKSUkJDzzwAOPHj+eBBx5g9+7dUZfU75kZDz98NW1tL9DcDGvWwN698P770N0NOfDF3E8pLi5m8eJK1q79A3v3wurVsH8/NDVBMhl1dZ8PORcOAIWFhSxcuJCKigpuvfVWBUQvGD26mAsuSAXBN78Jhw/De+9Bezu8/jrs3p16zJWgiMfjTJgwiPLyVE233w4HD8LWrdDWBm++CTt3wltvRV3pqStnB5pmxuWXX86LL77I7Nmzoy7nlJJMpsKhqws6O1NTV1fUVR3fkZoPHkxNnZ1w6FDUVZ3acnLkcISZUVVVxTPPPMMdd9zBsmXL6Mr1/8U5at8+eO45aG6GtWtTuxUtLalfuFzV0gKvvJKq9c9/hgMHUm25Mro55bl75FNlZaWfSGdnpz/00EOen5/vpO5mrSnDycy8puYbbhZ9LZlOJSUlfv3110Rex+dkWhv2e5nTI4d0BQUF3HvvvRQUFPCTn/yExsbGqEvqV5JJeu0Td/ToqYwedXbo3yD4RJ9+mPe2vsa+fS096ieXRzWfB/0mHAASiQR333031dXVfP3rX2f79u1Rl/S5NPH0C7jwi/NJxAqPu1zX4Q7isTzWrP0VqQ8o6U/6VThA6jhEZWUlr776KosWLWLJkiXkwk1yPy8KC4sZNewcBuQNxez4x7MTsUIGnzYGM8uZn1FRURGzZ89m5MiRrFixgt27d9PR0RF1WcdUWlrKyJEj+cpXvkJbWxtvvPEGAG1tbXz00UdZ7bvfhQOkAmLixIk88sgjxONxli5dqgOVfSSRyKcwf/AJgwHALEZRQQl5eYV0dR3og+qOraioiEsvvZSFCxcyc+ZM8vLy6OjooL6+ns2bN1NbW8umTZvYtWtXZGERi8UYNGgQFRUVnHnmmVxzzTVMnTqV0aNHM2jQILq7uzlwIPU+bty4kdbWVurr61mzZg3uztq1a2lvb6ezs7NXfh/6ZTgcMXjwYBYvXszFF1/MggULcvoT4FRRVjaJ0gFfyGjZmMUpGzyZgQNLIwuHwsJCLrvsMhYuXEh1dTX5+fkfv1ZcXExVVRVVVVXccsst7N27l5aWFlasWMHq1avZtGkT9fX1HDp0KCsjn1gsRlFREWeeeSYzZ87kvPPOY/bs2ZSVlTFgwIBPfVs5Ly+PwYMHA1BdXQ3wibpaWlro6upiw4YNbN++nZ07d/LHP/4RgG3btrFv3z6SySTJDK8i69fhADBgwABuvfVWiouL+eE
Pf8g777wTdUmntHg8j3gsL+PlY5ZHPJ5/4gV72ZGRwne/+11mzpz5iVAIY2aUlpZSWlpKRUUF3/72t+ns7GTLli288cYbrF+/nuXLl9PS0sL+/ft7XFdpaSmjRo3ikksu4cILL+Sss85i8uTJJBIJYj24vj09QEaOHAmkvuUMfCIIGhoaaG9vZ+PGjaxfv55kMsny5ctpb29nx44dodvu9+EAqQS+4YYbOOecc5g7dy7r1q2LuqRTklmMkWUVFCZKMl5nSOHpjBx5Jm1tfXPwOB6Pc9VVV2UcCsfbzsCBA5k2bRrTpk0jmUyyf/9+6urq2LJlCy+99BLr16+noaGBgwcPhm7DzCguLv54N+Haa69lypQpjB07NnRk0NtisdjHgVNeXg6kvuAIqRHHgQMHSCaTx7xdwikRDpD6QUyaNIlnn32WBx98kMcff5zDOhfWKwYOHMppp5XxwQdNDCgqJREryHxlM2IWJz9/AGVlk2hp2UJ3d2ev1xiLxZgyZQrz58+npqaGwsLjn0npyfZPO+20T+2GrFu3jg0bNvDiiy+ybds29uzZw9SpUzn//POZPn06s2bNYujQoQwaNKhX6/mszIyBAwced5lTJhyOmDhxIj/96U8ZMmQIv/jFL9i3b1/UJfVzxpmTL+aLZ1zKG+se7dEWYvEE5557PWefcQOr336EzZtX9F51ZkydOpUFCxZw4403UlJS0md3FispKeGSSy7h4osv5p577qGlpYUPP/yQSZMmkUgk+v0dznL2uxWfRUFBAT/60Y9YuXIlNTU1usvUZzBo0FC+OOFyBiTK6Oxs79E2Dh/upqVlMwWJwUwYcz6JxEmMPI4hHo9z9tln88gjj7By5Upuv/12SktLI/mFNDPi8TijR4+moqKCvLy8fh8MkMHIwcwKgdeBgmD559z9B2ZWCvwPMAHYAdzk7nuDde4HbgMOA3e7+++zUv1xJBIJpk2bxmOPPcaCBQtYtmwZTzzxBNu3b8+Zc+79wfDh5ZQO/AI7d69iz55ddHzUyo49K8n0c+Wwd3Kwcx9tbQ38o30rp4+oZujQ52lp2dyjetJ3H2666SZKSjI//iEnJ5PdioPAbHfvMLM8YJWZ/S/wz8Byd3/QzO4D7gO+Z2ZTgLnAWcBo4FUzm+zukRwASCQSVFZWMn36dL71rW/x7LPPUltby8qVK3VtxAnEYnHGjppGIlbI35rX0tW1n7XrlrDhndqT2s6BAx/g7rzf+lfGTK5kzOhzehQOEyZMYNGiRdx4440MGTLklPh0zmUnDAdPfcweuYAgL5gcuA6YFbQ/BawEvhe0L3H3g0CDmW0FqoA/9WbhJ8vMGD58OHfddRe33XYbK1eu5Omnn+bll1/W/SKOYcCAEsaUncc/OrawY+caALq6DvT4moXG5vWc94UOxo+cwcb839HVldkpwbKyMm6++Wa+853vMGnSJIVCH8nogKSZxYF1wCTgZ+7+ppmNcPdmAHdvNrPhweJjgD+nrd4YtOWMwsJCrrjiCi677DK2bNnC4sWLefXVV6mrqztlz3Ds3buXoqKik1pnyJDhxPIP0bLnXbq72096/aN98ME2du1dRSxhFBUVEY8f+2KcvLw8YrEYCxcu5I477mDy5Mk9ug5Aes5OZv/bzIYAvwUWAKvcfUjaa3vdvcTMfgb8yd2fDtofB37n7r85alvzgHkA48ePr9y5c+dn/sf0lLvz4Ycf8vzzz7N48WI2btz48WWq/d3YsWP58pe/zPz58xk2bNhJrm0MGDCE7u6DvXKFo7vT0QFlZcUZHdwsLCxk3LhxGilkmZmtc/cZn3rhZO+9APwA+DdgMzAqaBsFbA7m7wfuT1v+98CFn/V+Dn0hmUx6Z2enr1q1yr/2ta/50KFDo/6efY+nCRMm+I9//GNvbGz0Q4cORf3WSg6jp/dzMLMy4JC7f2BmRcAlwENALVADPBg8vhCsUgv8ysweJnVAshxYc6J+coGZUVBQQHV1NV/60pd
oamriySef5IknnqCpqYnu7u6oSzyuvLw8Kisrueaaa6ipqWH06NH61JWeC0sM/+RI4RzgLWADsBH4ftA+FFgOvBc8lqatswjYRmp08dUT9ZErI4cwyWTSm5ub/bnnnvPq6mpPJBKRjwqOnvLy8nzWrFn+wgsv+P79+z2ZTEb9tkk/wjFGDid1zCFbZsyY4WvXro26jOPy4Fr0t956iyeffJJly5bR1tZ2zOvq+0JpaSlXXnklc+bM4dJLL6WwsFAjBTlpxzrmcMpdPp0tR65Fv+iii7jgggtob2/n7bffpq6ujmXLlrFhwwZaW1uzfgOOWCzG0KFDmTNnDnfeeScVFRX6A0CSFRo5fEZH3r/W1lYaGxt57bXXaGhoYPXq1WzZsoWPPvoo4+/PH4+ZUVFRwfz587n66qsZM2aMTu1Jr9DIIUuODONHjBjBiBEjqKysJJlM0t3dTX19PXV1daxatYoVK1bw97//nT179pzU9ouKiigvL+fOO+9kzpw5kX1/QD5/NHLoA+5OR0cH77//Pps2baK2tpb6+nrq6+vp6OgIHVmk36xkxowZDBw4UKEgWXGskYPCIQLuzv79+2lra/v4JqcvvfQSDQ0NJJNJbr75Zq666qrPdLMSkUwpHHKYu5NMJmlqasLdGTt2rI4nSJ/RMYccduR+AOPGjYu6FJGP6eNJREIpHEQklMJBREIpHEQklMJBREIpHEQklMJBREIpHEQklMJBREIpHEQklMJBREIpHEQklMJBREIpHEQklMJBREIpHEQklMJBREIpHEQklMJBREIpHEQklMJBREIpHEQklMJBREIpHEQklMJBRELlxJ/DM7PdwH6gLepajjIM1ZQJ1ZS5XKzrdHcvO7oxJ8IBwMzWhv29viippsyopszlal1htFshIqEUDiISKpfC4dGoCwihmjKjmjKXq3V9Ss4ccxCR3JJLIwcRySGRh4OZXWFmm81sq5nd14f9/tLMWs1sY1pbqZm9YmbvBY8laa/dH9S42cwuz1JN48zsD2ZWZ2abzOyeqOsys0IzW2Nm64Oa/jPqmtL6iZvZW2b2Ug7VtMPM3jGzt81sba7U1SPuHtkExIFtwBlAPrAemNJHff8TMB3YmNb2X8B9wfx9wEPB/JSgtgJgYlBzPAs1jQKmB/OnAVuCviOrCzBgUDCfB7wJXBD1exX0dS/wK+ClXPj5BX3tAIYd1RZ5XT2Zoh45VAFb3X27u3cBS4Dr+qJjd38d2HNU83XAU8H8U8D1ae1L3P2guzcAW0nV3ts1Nbv7X4P5dqAOGBNlXZ7SETzNCyaPsiYAMxsLXAU8ltYcaU3Hkat1HVfU4TAG2JX2vDFoi8oId2+G1C8qMDxo7/M6zWwCMI3UJ3WkdQXD97eBVuAVd4+8JuC/gX8HkmltUdcEqeB82czWmdm8HKrrpCUi7t9C2nLx9Emf1mlmg4DfAAvdfZ9ZWPd9V5e7HwbOM7MhwG/NbOpxFs96TWZ2NdDq7uvMbFYmq2S7pjTV7t5kZsOBV8ysPkfqOmlRjxwagXFpz8cCTRHVAtBiZqMAgsfWoL3P6jSzPFLB8Iy7P58rdQG4+wfASuCKiGuqBq41sx2kdkVnm9nTEdcEgLs3BY+twG9J7SZEXldPRB0OfwHKzWyimeUDc4HaCOupBWqC+RrghbT2uWZWYGYTgXJgTW93bqkhwuNAnbs/nAt1mVlZMGLAzIqAS4D6KGty9/vdfay7TyD1f2aFu98SZU0AZjbQzE47Mg9cBmyMuq4ei/qIKHAlqaPy24BFfdjvr4Fm4BCpBL8NGAosB94LHkvTll8U1LgZ+GqWarqI1LByA/B2MF0ZZV3AOcBbQU0bge8H7ZG+V2l9zeL/z1ZE/fM7g9TZh/XApiP/n6Ouq6eTrpAUkVBR71aISI5SOIhIKIWDiIRSOIhIKIWDiIRSOIhIKIWDiIRSOIhIqP8DRHFY85VAfzgAAAAASUVORK5CYII=\n", + 
"text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "display_session(tmp_env, model, t_max=500)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Task 3: Hyperparameter's analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we are going to do some experimental analysis of the algorithm.\n", + "\n", + "Plan:\n", + "- to learn something about the *std* parameter\n", + "- to learn something about the *learning rate* parameter\n", + "- to learn something about the *n_samples* parameter" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/week01_intro/seminar_gym_interface.ipynb b/week01_intro/seminar_gymnasium_interface.ipynb similarity index 68% rename from week01_intro/seminar_gym_interface.ipynb rename to week01_intro/seminar_gymnasium_interface.ipynb index 02021a65d..f47c6fe07 100644 --- a/week01_intro/seminar_gym_interface.ipynb +++ b/week01_intro/seminar_gymnasium_interface.ipynb @@ -8,7 +8,7 @@ "source": [ "import sys, os\n", "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", "\n", " !touch .setup_complete\n", "\n", @@ -34,11 +34,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - 
"### OpenAI Gym\n", + "### OpenAI Gym --> Farama Gymnasium\n", "\n", "We're gonna spend several next weeks learning algorithms that solve decision processes. We are then in need of some interesting decision problems to test our algorithms.\n", "\n", - "That's where OpenAI Gym comes into play. It's a Python library that wraps many classical decision problems including robot control, videogames and board games.\n", + "That's where Gymnasium comes into play. It's a Python library that wraps many classical decision problems including robot control, videogames and board games.\n", + "\n", + "The library Gym by OpenAI has been replaced by Gymnasium while keeping all functionality comparable with the latest version of Gym.\n", + "\n", + "Announcement: https://farama.org/Announcing-The-Farama-Foundation\n", + "\n", + "GitHub: https://github.com/Farama-Foundation/Gymnasium\n", + "\n", + "Documentation: https://gymnasium.farama.org/\n", "\n", "So here's how it works:" ] @@ -49,14 +57,14 @@ "metadata": {}, "outputs": [], "source": [ - "import gym\n", + "import gymnasium as gym\n", "\n", - "env = gym.make(\"MountainCar-v0\")\n", + "env = gym.make(\"MountainCar-v0\", render_mode=\"rgb_array\")\n", "env.reset()\n", "\n", - "plt.imshow(env.render('rgb_array'))\n", + "plt.imshow(env.render())\n", "print(\"Observation space:\", env.observation_space)\n", - "print(\"Action space:\", env.action_space)" + "print(\"Action space:\", env.action_space)\n" ] }, { @@ -70,16 +78,21 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Gym interface\n", + "### Gymnasium interface\n", "\n", "The three main methods of an environment are\n", - "* `reset()`: reset environment to the initial state, _return first observation_\n", + "* `reset()`: reset environment to the initial state, return first observation and dict with auxiliary info\n", "* `render()`: show current environment state (a more colorful version :) )\n", - "* `step(a)`: commit action `a` and return `(new_observation, reward, 
is_done, info)`\n", + "* `step(a)`: commit action `a` and return `(new_observation, reward, terminated, truncated, info)`\n", " * `new_observation`: an observation right after committing the action `a`\n", " * `reward`: a number representing your reward for committing action `a`\n", - " * `is_done`: True if the MDP has just finished, False if still in progress\n", - " * `info`: some auxiliary stuff about what just happened. For now, ignore it." + " * `terminated`: True if the MDP has just finished, False if still in progress\n", + " * `truncated`: True if the number of steps elapsed >= max episode steps\n", + " * `info`: some auxiliary stuff about what just happened. For now, ignore it.\n", + "\n", + "A detailed explanation of the difference between `terminated` and `truncated` and how it should be used:\n", + "1. https://gymnasium.farama.org/tutorials/gymnasium_basics/handling_time_limits/\n", + "2. https://gymnasium.farama.org/content/migration-guide/" ] }, { @@ -88,10 +101,14 @@ "metadata": {}, "outputs": [], "source": [ - "obs0 = env.reset()\n", + "# Set seed to reproduce initial state in stochastic environment\n", + "obs0, info = env.reset(seed=0)\n", "print(\"initial observation code:\", obs0)\n", "\n", - "# Note: in MountainCar, observation is just two numbers: car position and velocity" + "obs0, info = env.reset(seed=1)\n", + "print(\"initial observation code:\", obs0)\n", + "\n", + "# Note: in MountainCar, observation is just two numbers: car position and velocity\n" ] }, { @@ -101,13 +118,14 @@ "outputs": [], "source": [ "print(\"taking action 2 (right)\")\n", - "new_obs, reward, is_done, _ = env.step(2)\n", + "new_obs, reward, terminated, truncated, _ = env.step(2)\n", "\n", "print(\"new observation code:\", new_obs)\n", "print(\"reward:\", reward)\n", - "print(\"is game over?:\", is_done)\n", + "print(\"is game over?:\", terminated)\n", + "print(\"is game truncated due to time limit?:\", truncated)\n", "\n", - "# Note: as you can see, the car has moved 
to the right slightly (around 0.0005)" + "# Note: as you can see, the car has moved to the right slightly (around 0.0005)\n" ] }, { @@ -134,10 +152,10 @@ "# Create env manually to set time limit. Please don't change this.\n", "TIME_LIMIT = 250\n", "env = gym.wrappers.TimeLimit(\n", - " gym.envs.classic_control.MountainCarEnv(),\n", + " gym.make(\"MountainCar-v0\", render_mode=\"rgb_array\"),\n", " max_episode_steps=TIME_LIMIT + 1,\n", ")\n", - "actions = {'left': 0, 'stop': 1, 'right': 2}" + "actions = {\"left\": 0, \"stop\": 1, \"right\": 2}\n" ] }, { @@ -151,12 +169,12 @@ " # (a tuple of position and velocity), the current time step, or both,\n", " # if you want.\n", " position, velocity = obs\n", - " \n", + "\n", " # This is an example policy. You can try running it, but it will not work.\n", " # Your goal is to fix that. You don't need anything sophisticated here,\n", " # and you can hard-code any policy that seems to work.\n", " # Hint: think how you would make a swing go farther and faster.\n", - " return actions['right']" + " return actions[\"right\"]\n" ] }, { @@ -168,29 +186,31 @@ "plt.figure(figsize=(4, 3))\n", "display.clear_output(wait=True)\n", "\n", - "obs = env.reset()\n", + "obs, _ = env.reset()\n", "for t in range(TIME_LIMIT):\n", " plt.gca().clear()\n", - " \n", + "\n", " action = policy(obs, t) # Call your policy\n", - " obs, reward, done, _ = env.step(action) # Pass the action chosen by the policy to the environment\n", - " \n", + " obs, reward, terminated, truncated, _ = env.step(\n", + " action\n", + " ) # Pass the action chosen by the policy to the environment\n", + "\n", " # We don't do anything with reward here because MountainCar is a very simple environment,\n", " # and reward is a constant -1. 
Therefore, your goal is to end the episode as quickly as possible.\n", "\n", " # Draw game image on display.\n", - " plt.imshow(env.render('rgb_array'))\n", - " \n", + " plt.imshow(env.render())\n", + "\n", " display.display(plt.gcf())\n", " display.clear_output(wait=True)\n", "\n", - " if done:\n", + " if terminated or truncated:\n", " print(\"Well done!\")\n", " break\n", "else:\n", " print(\"Time limit exceeded. Try again.\")\n", "\n", - "display.clear_output(wait=True)" + "display.clear_output(wait=True)\n" ] }, { @@ -200,7 +220,7 @@ "outputs": [], "source": [ "assert obs[0] > 0.47\n", - "print(\"You solved it!\")" + "print(\"You solved it!\")\n" ] } ], diff --git a/week02_value_based/README.md b/week02_value_based/README.md index a3338e96f..380e0479d 100644 --- a/week02_value_based/README.md +++ b/week02_value_based/README.md @@ -9,6 +9,6 @@ ## Homework description: -The main assignment is `seminar_vi.ipynb`[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring20/week02_value_based/seminar_vi.ipynb) notebook in this week's folder. It has no requirements besides the most basic data science libraries (e.g. numpy) so you should be able to run it locally. +The main assignment is `seminar_vi.ipynb`[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/master/week02_value_based/seminar_vi.ipynb) notebook in this week's folder. It has no requirements besides the most basic data science libraries (e.g. numpy) so you should be able to run it locally. __Note:__ if you have any difficulty using graphviz, just set `has_graphviz=False`. 
diff --git a/week02_value_based/mdp.py b/week02_value_based/mdp.py index b86b42539..1d148f22f 100644 --- a/week02_value_based/mdp.py +++ b/week02_value_based/mdp.py @@ -1,8 +1,6 @@ # most of this code was politely stolen from https://github.com/berkeleydeeprlcourse/homework/ # all credit goes to https://github.com/abhishekunique # (if I got the author right) -import sys -import random import numpy as np from gym.utils import seeding @@ -118,7 +116,7 @@ def _check_param_consistency(self, transition_probs, rewards): for action in transition_probs[state]: assert isinstance(transition_probs[state][action], dict), \ "transition_probs for %s, %s should be a a dictionary but is instead %s" % ( - state, action, type(transition_probs[state, action])) + state, action, type(transition_probs[state][action])) next_state_probs = transition_probs[state][action] assert len(next_state_probs) != 0, "from state %s action %s leads to no next states" % (state, action) sum_probs = sum(next_state_probs.values()) @@ -128,11 +126,11 @@ def _check_param_consistency(self, transition_probs, rewards): for state in rewards: assert isinstance(rewards[state], dict), \ "rewards for %s should be a dictionary but is instead %s" % ( - state, type(transition_probs[state])) + state, type(rewards[state])) for action in rewards[state]: assert isinstance(rewards[state][action], dict), \ "rewards for %s, %s should be a a dictionary but is instead %s" % ( - state, action, type(transition_probs[state, action])) + state, action, type(rewards[state][action])) msg = "The Enrichment Center once again reminds you that Android Hell is a real place where" \ " you will be sent at the first sign of defiance." assert None not in transition_probs, "please do not use None as a state identifier. 
" + msg @@ -251,7 +249,7 @@ def render(self): print('\n'.join(map(''.join, desc_copy)), end='\n\n') -def plot_graph(mdp, graph_size='10,10', s_node_size='1,5', +def plot_graph(mdp, s_node_size='1,5', a_node_size='0,5', rankdir='LR', ): """ Function for pretty drawing MDP graph with graphviz library. @@ -261,7 +259,6 @@ def plot_graph(mdp, graph_size='10,10', s_node_size='1,5', python library for graphviz for pip users: pip install graphviz :param mdp: - :param graph_size: size of graph plot :param s_node_size: size of state nodes :param a_node_size: size of action nodes :param rankdir: order for drawing @@ -294,7 +291,7 @@ def plot_graph(mdp, graph_size='10,10', s_node_size='1,5', 'fontsize': '16'} graph = Digraph(name='MDP') - graph.attr(rankdir=rankdir, size=graph_size) + graph.attr(rankdir=rankdir) for state_node in mdp._transition_probs: graph.node(state_node, **s_node_attrs) diff --git a/week02_value_based/seminar_vi.ipynb b/week02_value_based/seminar_vi.ipynb index 549213ee3..188448bbe 100644 --- a/week02_value_based/seminar_vi.ipynb +++ b/week02_value_based/seminar_vi.ipynb @@ -30,8 +30,8 @@ "source": [ "import sys, os\n", "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/week02_value_based/mdp.py\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week02_value_based/mdp.py\n", " !touch .setup_complete\n", "\n", "# This code creates a virtual display to draw game images on.\n", @@ -870,7 +870,7 @@ "source": [ "# HW Part 1: Value iteration convergence\n", "\n", - "### Find an MDP for which value iteration takes long to converge (0.5 pts)\n", + "### Find an MDP for which value 
iteration takes long to converge (1 pt)\n", "\n", "When we ran value iteration on the small frozen lake problem, the last iteration where an action changed was iteration 6--i.e., value iteration computed the optimal policy at iteration 6. Are there any guarantees regarding how many iterations it'll take value iteration to compute the optimal policy? There are no such guarantees without additional assumptions--we can construct the MDP in such a way that the greedy policy will change after arbitrarily many iterations.\n", "\n", @@ -926,7 +926,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Value iteration convervence proof (0.5 pts)\n", + "### Value iteration convergence proof (1 pt)\n", "**Note:** Assume that $\\mathcal{S}, \\mathcal{A}$ are finite.\n", "\n", "Update of value function in value iteration can be rewritten in a form of Bellman operator:\n", @@ -963,7 +963,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Bonus. Asynchronious value iteration (2 pts)\n", + "### Asynchronous value iteration (2 pts)\n", "\n", "Consider the following algorithm:\n", "\n", @@ -997,7 +997,7 @@ "source": [ "# HW Part 2: Policy iteration\n", "\n", - "## Policy iteration implementateion (2 pts)\n", + "## Policy iteration implementation (3 pts)\n", "\n", "Let's implement exact policy iteration (PI), which has the following pseudocode:\n", "\n", @@ -1238,7 +1238,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/week03_model_free/README.md b/week03_model_free/README.md index eb4ccb80a..b2271d15b 100644 --- a/week03_model_free/README.md +++ b/week03_model_free/README.md @@ -3,8 +3,8 @@ * Russian materials: - Lecture - [video](https://yadi.sk/i/jcQ1Bg8n3SrhuQ) - Seminars - - Q-learning seminar - [video](https://yadi.sk/i/dQmolwOy3EtGNK) (older track - 
[assignment](https://github.com/neer201/Practical_RL/tree/spring20/week03_model_free/crawler_and_pacman/seminar_py2)) - - SARSA & stuff - [video](https://yadi.sk/i/XbqNQmjm3ExNsq) + - Q-learning seminar - [video](https://yadi.sk/i/dQmolwOy3EtGNK) (older track - [assignment](./crawler_and_pacman/seminar_py2)) + - SARSA & stuff - [video](https://yadi.sk/i/XbqNQmjm3ExNsq) * English materials: - Lecture by David Silver (english) - [video part I](https://www.youtube.com/watch?v=PnHCvfgC_ZA), [video part II](https://www.youtube.com/watch?v=0g4j2k_Ggc4&t=43s) - Alternative lecture by Pieter Abbeel (english) - [video](https://www.youtube.com/watch?v=ifma8G7LegE) @@ -18,9 +18,4 @@ ### Assignments -Just as usual, start with -- [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring20/week03_model_free/seminar_qlearning.ipynb) -`seminar_qlearning.ipynb` _Implement q-learning agent and test it on Taxi and CartPole with binarizer_ - -and then proceed to -- [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring20/week03_model_free/homework.ipynb) `homework.ipynb` _Implement EV-SARSA agent, experience replay + bonus tasks_ +Just as usual, start with `homework.ipynb` [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/master/week03_model_free/homework.ipynb) For seminar, implement q-learning agent and test it on Taxi and CartPole with binarizer. And then, implement EV-SARSA agent, experience replay + bonus tasks for homework. 
diff --git a/week03_model_free/crawler_and_pacman/seminar_py2/README.md b/week03_model_free/crawler_and_pacman/seminar_py2/README.md index 7c4c91301..872c8e500 100644 --- a/week03_model_free/crawler_and_pacman/seminar_py2/README.md +++ b/week03_model_free/crawler_and_pacman/seminar_py2/README.md @@ -1,5 +1,5 @@ # Disclaimer -This assignment is not supported now. You can do it at your own risk. +This assignment is not supported now. You can do it at your own risk. _this assignment borrows code from awesome [cs188](http://ai.berkeley.edu/project_overview.html)_ This homework assignment works on __python2 only__. If you stick to py3, consider seminar_alternative. Or just install it for this homework alone and remove afterwards. @@ -17,8 +17,8 @@ python pacman.py -p PacmanQAgent -x 5000 -n 5010 -l smallGrid # example ``` * Make sure you can tune agent to beat ./run_crawler.sh * on windows, just run `python crawler.py` from cmd in the project directory -* other ./run* files are mostly for your amusement. - * ./run_pacman.sh will need more epochs to converge, see [comments](https://github.com/yandexdataschool/Practical_RL/blob/spring20/week03_model_free/crawler_and_pacman/seminar_py2/run_pacman.sh) +* other ./run* files are mostly for your amusement. 
+ * ./run_pacman.sh will need more epochs to converge, see [comments](https://github.com/yandexdataschool/Practical_RL/blob/master/week03_model_free/crawler_and_pacman/seminar_py2/run_pacman.sh) * on windows, just copy the type `python pacman.py -p PacmanQAgent -x 2000 -n 2010 -l smallGrid` in cmd from assignemnt dir @@ -35,14 +35,14 @@ The problem with those environments is that they have a large amount of unique s * where is nearest food * 'center of mass' of all food points (and variance, and whatever) * is there a wall in each direction - * and anything else you see fit - + * and anything else you see fit + Here's how to get this information from [state](https://github.com/yandexdataschool/Practical_RL/blob/7a559f8/week03_model_free/seminar_py2/pacman.py#L49), * Get pacman position: [state.getPacmanPosition()](https://github.com/yandexdataschool/Practical_RL/blob/7a559f8/week03_model_free/seminar_py2/pacman.py#L128) * Is there a wall at (x,y)?: [state.hasWall(x,y)](https://github.com/yandexdataschool/Practical_RL/blob/7a559f8/week03_model_free/seminar_py2/pacman.py#L189) * Get ghost positions: [state.getGhostPositions()](https://github.com/yandexdataschool/Practical_RL/blob/7a559f8/week03_model_free/seminar_py2/pacman.py#L144) * Get all food positions: [state.getCapsules()](https://github.com/yandexdataschool/Practical_RL/blob/7a559f8/week03_model_free/seminar_py2/pacman.py#L153) - + You can call those methods anywhere you see state. * e.g. in [agent.getValue(state)](https://github.com/yandexdataschool/Practical_RL/blob/7a559f8/week03_model_free/seminar_py2/qlearningAgents.py#L52) * Defining a function that extracts all features and calling it in [getQValue](https://github.com/yandexdataschool/Practical_RL/blob/7a559f8/week03_model_free/seminar_py2/qlearningAgents.py#L38) and [setQValue](https://github.com/yandexdataschool/Practical_RL/blob/7a559f8/week03_model_free/seminar_py2/qlearningAgents.py#L44) is probably enough. 
diff --git a/week03_model_free/crawler_and_pacman/seminar_py3/README.md b/week03_model_free/crawler_and_pacman/seminar_py3/README.md index 696e8163d..7faec7ce0 100644 --- a/week03_model_free/crawler_and_pacman/seminar_py3/README.md +++ b/week03_model_free/crawler_and_pacman/seminar_py3/README.md @@ -1,13 +1,13 @@ Contributors: [Vlad Lyalin](https://github.com/Guitaricet) # Disclaimer -This assignment is not supported now. You can do it at your own risk. +This assignment is not supported now. You can do it at your own risk. # Requirements __Ubuntu__ - not tested __MacOS__ Python 3.7.0 (3.6, 3.8 and other cause on MacOS [problems](https://stackoverflow.com/questions/57400301/how-to-fix-tkinter-every-code-with-gui-crashes-mac-os-with-respring) with _Tkinter_) - + Anaconda users just can create new_env: ```bash conda create -n pacman_env python=3.7.0 @@ -31,8 +31,8 @@ python pacman.py -p PacmanQAgent -x 5000 -n 5010 -l smallGrid # example ``` * Make sure you can tune agent to beat ./run_crawler.sh * on windows, just run `python crawler.py` from cmd in the project directory -* other ./run* files are mostly for your amusement. - * ./run_pacman.sh will need more epochs to converge, see [comments](https://github.com/yandexdataschool/Practical_RL/blob/spring20/week03_model_free/crawler_and_pacman/seminar_py2/run_pacman.sh) +* other ./run* files are mostly for your amusement. 
+ * ./run_pacman.sh will need more epochs to converge, see [comments](https://github.com/yandexdataschool/Practical_RL/blob/master/week03_model_free/crawler_and_pacman/seminar_py2/run_pacman.sh) * on windows, just copy the type `python pacman.py -p PacmanQAgent -x 2000 -n 2010 -l smallGrid` in cmd from assignemnt dir @@ -49,14 +49,14 @@ The problem with those environments is that they have a large amount of unique s * where is nearest food * 'center of mass' of all food points (and variance, and whatever) * is there a wall in each direction - * and anything else you see fit - + * and anything else you see fit + Here's how to get this information from [state](https://github.com/yandexdataschool/Practical_RL/blob/7a559f8/week03_model_free/seminar_py2/pacman.py#L49), * Get pacman position: [state.getPacmanPosition()](https://github.com/yandexdataschool/Practical_RL/blob/7a559f8/week03_model_free/seminar_py2/pacman.py#L128) * Is there a wall at (x,y)?: [state.hasWall(x,y)](https://github.com/yandexdataschool/Practical_RL/blob/7a559f8/week03_model_free/seminar_py2/pacman.py#L189) * Get ghost positions: [state.getGhostPositions()](https://github.com/yandexdataschool/Practical_RL/blob/7a559f8/week03_model_free/seminar_py2/pacman.py#L144) * Get all food positions: [state.getCapsules()](https://github.com/yandexdataschool/Practical_RL/blob/7a559f8/week03_model_free/seminar_py2/pacman.py#L153) - + You can call those methods anywhere you see state. * e.g. in [agent.getValue(state)](https://github.com/yandexdataschool/Practical_RL/blob/7a559f8/week03_model_free/seminar_py2/qlearningAgents.py#L52) * Defining a function that extracts all features and calling it in [getQValue](https://github.com/yandexdataschool/Practical_RL/blob/7a559f8/week03_model_free/seminar_py2/qlearningAgents.py#L38) and [setQValue](https://github.com/yandexdataschool/Practical_RL/blob/7a559f8/week03_model_free/seminar_py2/qlearningAgents.py#L44) is probably enough. 
diff --git a/week03_model_free/homework.ipynb b/week03_model_free/homework.ipynb index a13d9e6dc..6eb18ea33 100644 --- a/week03_model_free/homework.ipynb +++ b/week03_model_free/homework.ipynb @@ -1,29 +1,5 @@ { "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[Part I: On-policy learning and SARSA (3 points)](#Part-I:-On-policy-learning-and-SARSA-(3-points))\n", - "\n", - "[Part II: Experience replay (4 points)](#Part-II:-experience-replay-(4-points))\n", - "\n", - "[Bonus I: TD($ \\lambda $) (5+ points)](#Bonus-I:-TD($\\lambda$)-(5+-points))\n", - "\n", - "[Bonus II: More pacman (5+ points)](#Bonus-II:-More-pacman-(5+-points))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Part I: On-policy learning and SARSA (3 points)\n", - "\n", - "_This notebook builds upon `qlearning.ipynb`, or to be exact your implementation of QLearningAgent._\n", - "\n", - "The policy we're gonna use is epsilon-greedy policy, where agent takes optimal action with probability $(1-\\epsilon)$, otherwise samples action at random. Note that agent __can__ occasionally sample optimal action during random sampling by pure chance." 
- ] - }, { "cell_type": "code", "execution_count": null, @@ -32,7 +8,8 @@ "source": [ "import sys, os\n", "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", + "\n", " !touch .setup_complete\n", "\n", "# This code creates a virtual display to draw game images on.\n", @@ -53,11 +30,24 @@ "%matplotlib inline" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip3 install -q gymnasium[classic-control]" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "You can copy your `QLearningAgent` implementation from previous notebook." + "## Seminar: Q-learning (1.5 points)\n", + "\n", + "This notebook will guide you through implementation of vanilla Q-learning algorithm.\n", + "\n", + "You need to implement QLearningAgent (follow instructions for each method) and use it on a number of tests below." ] }, { @@ -89,7 +79,6 @@ " which returns Q(state,action)\n", " - self.set_qvalue(state,action,value)\n", " which sets Q(state,action) := value\n", - "\n", " !!!Important!!!\n", " Note: please avoid using self._qValues directly. \n", " There's a special self.get_qvalue/set_qvalue for that.\n", @@ -159,7 +148,7 @@ " \"\"\"\n", " Compute the action to take in the current state, including exploration. \n", " With probability self.epsilon, we should take a random action.\n", - " otherwise - the best policy action (self.getPolicy).\n", + " otherwise - the best policy action (self.get_best_action).\n", "\n", " Note: To pick randomly from a list, use random.choice(list). 
\n", " To pick True or False with a given probablity, generate uniform number in [0, 1]\n", @@ -182,6 +171,335 @@ " return chosen_action" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Try it on taxi\n", + "\n", + "Here we use the Q-Learning agent on the Taxi-v3 environment from OpenAI gym.\n", + "You will need to complete a few of its functions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import gymnasium as gym\n", + "env = gym.make(\"Taxi-v3\", render_mode='rgb_array')\n", + "\n", + "n_actions = env.action_space.n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s, _ = env.reset()\n", + "plt.imshow(env.render())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "agent = QLearningAgent(\n", + " alpha=0.5, epsilon=0.25, discount=0.99,\n", + " get_legal_actions=lambda s: range(n_actions))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def play_and_train(env, agent, t_max=10**4):\n", + " \"\"\"\n", + " This function should \n", + " - run a full game, actions given by agent's e-greedy policy\n", + " - train agent using agent.update(...) whenever it is possible\n", + " - return total reward\n", + " \"\"\"\n", + " total_reward = 0.0\n", + " s, _ = env.reset()\n", + "\n", + " for t in range(t_max):\n", + " # get agent to pick action given state s.\n", + " a = \n", + "\n", + " next_s, r, done, _, _ = env.step(a)\n", + "\n", + " # train (update) agent for state s\n", + " \n", + "\n", + " s = next_s\n", + " total_reward += r\n", + " if done:\n", + " break\n", + "\n", + " return total_reward" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from IPython.display import clear_output\n", + "\n", + "rewards = []\n", + "for i in range(1000):\n", + " rewards.append(play_and_train(env, agent))\n", + " agent.epsilon *= 0.99\n", + "\n", + " if i % 100 == 0:\n", + " clear_output(True)\n", + " plt.title('eps = {:e}, mean reward = {:.1f}'.format(agent.epsilon, np.mean(rewards[-10:])))\n", + " plt.plot(rewards)\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Seminar: Discretized state spaces (1.5 points)\n", + "\n", + "Use agent to train efficiently on `CartPole-v0`. This environment has a continuous set of possible states, so you will have to group them into bins somehow.\n", + "\n", + "The simplest way is to use `round(x, n_digits)` (or `np.round`) to round a real number to a given amount of digits. The tricky part is to get the `n_digits` right for each state to train effectively.\n", + "\n", + "Note that you don't need to convert state to integers, but to __tuples__ of any kind of values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def make_env():\n", + " return gym.make('CartPole-v0', render_mode='rgb_array').env # .env unwraps the TimeLimit wrapper\n", + "\n", + "env = make_env()\n", + "n_actions = env.action_space.n\n", + "\n", + "print(\"first state: %s\" % (env.reset()[0]))\n", + "plt.imshow(env.render())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Play a few games\n", + "\n", + "We need to estimate observation distributions. To do so, we'll play a few games and record all states." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def visualize_cartpole_observation_distribution(seen_observations):\n", + " seen_observations = np.array(seen_observations)\n", + " \n", + " # The meaning of the observations is documented in\n", + " # https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py\n", + "\n", + " f, axarr = plt.subplots(2, 2, figsize=(16, 9), sharey=True)\n", + " for i, title in enumerate(['Cart Position', 'Cart Velocity', 'Pole Angle', 'Pole Velocity At Tip']):\n", + " ax = axarr[i // 2, i % 2]\n", + " ax.hist(seen_observations[:, i], bins=20)\n", + " ax.set_title(title)\n", + " xmin, xmax = ax.get_xlim()\n", + " ax.set_xlim(min(xmin, -xmax), max(-xmin, xmax))\n", + " ax.grid()\n", + " f.tight_layout()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "seen_observations = []\n", + "for _ in range(1000):\n", + " s, _ = env.reset()\n", + " seen_observations.append(s)\n", + " done = False\n", + " while not done:\n", + " s, r, done, _, _ = env.step(env.action_space.sample())\n", + " seen_observations.append(s)\n", + " \n", + "visualize_cartpole_observation_distribution(seen_observations)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Discretize environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gymnasium.core import ObservationWrapper\n", + "\n", + "\n", + "class Discretizer(ObservationWrapper):\n", + " def observation(self, state):\n", + " # Hint: you can do that with round(x, n_digits).\n", + " # You may pick a different n_digits for each dimension.\n", + " state = \n", + "\n", + " return tuple(state)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "env = Discretizer(make_env())" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "seen_observations = []\n", + "for _ in range(1000):\n", + " s, _ = env.reset()\n", + " seen_observations.append(s)\n", + " done = False\n", + " while not done:\n", + " s, r, done, _, _ = env.step(env.action_space.sample())\n", + " seen_observations.append(s)\n", + " if done:\n", + " break\n", + " \n", + "visualize_cartpole_observation_distribution(seen_observations)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Learn discretized policy\n", + "\n", + "Now let's train a policy that uses discretized state space.\n", + "\n", + "__Tips:__\n", + "\n", + "* Note that increasing the number of digits for one dimension of the observations increases your state space by a factor of $10$.\n", + "* If your discretization is too fine-grained, your agent will take much longer than 10000 steps to converge. You can either increase the number of iterations and reduce epsilon decay or change discretization. In practice we found that this kind of mistake is rather frequent.\n", + "* If your discretization is too coarse, your agent may fail to find the optimal policy. In practice we found that on this particular environment this kind of mistake is rare.\n", + "* **Start with a coarse discretization** and make it more fine-grained if that seems necessary.\n", + "* Having $10^3$–$10^4$ distinct states is recommended (`len(agent._qvalues)`), but not required.\n", + "* If things don't work without annealing $\\varepsilon$, consider adding that, but make sure that it doesn't go to zero too quickly.\n", + "\n", + "A reasonable agent should attain an average reward of at least 50." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "def moving_average(x, span=100):\n", + " return pd.DataFrame({'x': np.asarray(x)}).x.ewm(span=span).mean().values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "agent = QLearningAgent(\n", + " alpha=0.5, epsilon=0.25, discount=0.99,\n", + " get_legal_actions=lambda s: range(n_actions))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rewards = []\n", + "epsilons = []\n", + "\n", + "for i in range(10000):\n", + " reward = play_and_train(env, agent)\n", + " rewards.append(reward)\n", + " epsilons.append(agent.epsilon)\n", + " \n", + " # OPTIONAL: \n", + "\n", + " if i % 100 == 0:\n", + " rewards_ewma = moving_average(rewards)\n", + " \n", + " clear_output(True)\n", + " plt.plot(rewards, label='rewards')\n", + " plt.plot(rewards_ewma, label='rewards ewma@100')\n", + " plt.legend()\n", + " plt.grid()\n", + " plt.title('eps = {:e}, rewards ewma@100 = {:.1f}'.format(agent.epsilon, rewards_ewma[-1]))\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print('Your agent has learned {} Q-values.'.format(len(agent._qvalues)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Homework Part I: On-policy learning and SARSA (3 points)\n", + "\n", + "The policy we're gonna use is the epsilon-greedy policy, where the agent takes the optimal action with probability $(1-\\epsilon)$ and otherwise samples an action at random. Note that the agent __can__ occasionally sample the optimal action during random sampling by pure chance."
+ ] + }, { "cell_type": "markdown", "metadata": {}, @@ -232,7 +550,7 @@ "Let's now see how our algorithm compares against q-learning in case where we force agent to explore all the time.\n", "\n", "\n", - "
image by cs188
" + "
Image from CS188
" ] }, { @@ -241,23 +559,10 @@ "metadata": {}, "outputs": [], "source": [ - "import gym\n", - "import gym.envs.toy_text\n", - "env = gym.envs.toy_text.CliffWalkingEnv()\n", - "n_actions = env.action_space.n\n", + "import gymnasium as gym\n", "\n", - "print(env.__doc__)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Our cliffworld has one difference from what's on the image: there is no wall.\n", - "# Agent can choose to go as close to the cliff as it wishes. x:start, T:exit, C:cliff, o: flat ground\n", - "env.render()" + "env = gym.make('CliffWalking-v0', render_mode='rgb_array')\n", + "n_actions = env.action_space.n" ] }, { @@ -266,26 +571,12 @@ "metadata": {}, "outputs": [], "source": [ - "def play_and_train(env, agent, t_max=10**4):\n", - " \"\"\"This function should \n", - " - run a full game, actions given by agent.getAction(s)\n", - " - train agent using agent.update(...) whenever possible\n", - " - return total reward\"\"\"\n", - " total_reward = 0.0\n", - " s = env.reset()\n", - "\n", - " for t in range(t_max):\n", - " a = agent.get_action(s)\n", + "# Our cliffworld has one difference from what's in the image: there is no wall.\n", + "# Agent can choose to go as close to the cliff as it wishes.\n", + "# x:start, T:exit, C:cliff, o: flat ground\n", "\n", - " next_s, r, done, _ = env.step(a)\n", - " agent.update(s, a, r, next_s)\n", - "\n", - " s = next_s\n", - " total_reward += r\n", - " if done:\n", - " break\n", - "\n", - " return total_reward" + "env.reset()\n", + "plt.imshow(env.render())" ] }, { @@ -346,17 +637,22 @@ "metadata": {}, "outputs": [], "source": [ - "def draw_policy(env, agent):\n", + "def draw_policy(agent):\n", " \"\"\" Prints CliffWalkingEnv policy with arrows. Hard-coded. 
\"\"\"\n", - " n_rows, n_cols = env._cliff.shape\n", + " \n", + " env = gym.make('CliffWalking-v0', render_mode='ansi')\n", + " env.reset()\n", + " grid = [x.split(' ') for x in env.render().split('\\n')[:4]]\n", "\n", + " n_rows, n_cols = 4, 12\n", + " start_state_index = 36\n", " actions = '^>v<'\n", "\n", " for yi in range(n_rows):\n", " for xi in range(n_cols):\n", - " if env._cliff[yi, xi]:\n", + " if grid[yi][xi] == 'C':\n", " print(\" C \", end='')\n", - " elif (yi * n_cols + xi) == env.start_state_index:\n", + " elif (yi * n_cols + xi) == start_state_index:\n", " print(\" X \", end='')\n", " elif (yi * n_cols + xi) == n_rows * n_cols - 1:\n", " print(\" T \", end='')\n", @@ -373,10 +669,69 @@ "outputs": [], "source": [ "print(\"Q-Learning\")\n", - "draw_policy(env, agent_ql)\n", + "draw_policy(agent_ql)\n", "\n", "print(\"SARSA\")\n", - "draw_policy(env, agent_sarsa)" + "draw_policy(agent_sarsa)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Expected Value SARSA for softmax policy (2 points)\n", + "\n", + "Implement an agent that would use a softmax policy for getting an action. Do not forget to also use softmax when calculating the expected value for value estimation. Draw the policy of the agent and see if the result is different compared to the previous approaches. 
Also, try using different temperatures ($\\tau$) and compare the results.\n", + "\n", + "$$ \\pi(a_i \\mid s) = \\operatorname{softmax} \\left( \\left\\{ {Q(s, a_j) \\over \\tau} \\right\\}_{j=1}^n \\right)_i = {\\operatorname{exp} \\left( Q(s,a_i) / \\tau \\right) \\over {\\sum_{j} \\operatorname{exp} \\left( Q(s,a_j) / \\tau \\right)}} $$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class SoftmaxEVSarsaAgent(EVSarsaAgent):\n", + " def __init__(self, alpha, tau, discount, get_legal_actions):\n", + " super().__init__(alpha, None, discount, get_legal_actions)\n", + " assert tau > 0\n", + " self.tau = tau\n", + " \n", + " def get_value(self, state):\n", + " \"\"\" \n", + " Returns V_{pi} for current state under softmax policy:\n", + " V_{pi}(s) = sum _{over a_i} {pi(a_i | s) * Q(s, a_i)}\n", + "\n", + " Hint: all other methods from QLearningAgent are still accessible.\n", + " \"\"\"\n", + " possible_actions = self.get_legal_actions(state)\n", + "\n", + " # If there are no legal actions, return 0.0\n", + " if len(possible_actions) == 0:\n", + " return 0.0\n", + "\n", + " \n", + "\n", + " return value\n", + " \n", + " def get_action(self, state):\n", + " \"\"\"\n", + " Compute the action to take in the current state, including exploration. \n", + " We should sample an action from the softmax distribution over Q-values divided by tau.\n", + " \"\"\"\n", + " # Pick Action\n", + " possible_actions = self.get_legal_actions(state)\n", + " action = None\n", + "\n", + " # If there are no legal actions, return None\n", + " if len(possible_actions) == 0:\n", + " return None\n", + "\n", + " \n", + " \n", + "\n", + " return chosen_action" ] }, { @@ -388,8 +743,6 @@ "Here are some of the things you can do if you feel like it:\n", "\n", "* Play with epsilon. See learned how policies change if you set epsilon to higher/lower values (e.g.
0.75).\n", - "* Expected Value SARSA for softmax policy __(2pts)__:\n", - "$$ \\pi(a_i \\mid s) = \\operatorname{softmax} \\left( \\left\\{ {Q(s, a_j) \\over \\tau} \\right\\}_{j=1}^n \\right)_i = {\\operatorname{exp} \\left( Q(s,a_i) / \\tau \\right) \\over {\\sum_{j} \\operatorname{exp} \\left( Q(s,a_j) / \\tau \\right)}} $$\n", "* Implement N-step algorithms and TD($\\lambda$): see [Sutton's book](http://incompleteideas.net/book/RLbook2020.pdf) chapter 7 and chapter 12.\n", "* Use those algorithms to train on CartPole in previous / next assignment for this week." ] @@ -398,7 +751,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Part II: experience replay (4 points)\n", + "## Part II: experience replay (2 points)\n", "\n", "There's a powerful technique that you can use to improve sample efficiency for off-policy algorithms: [spoiler] Experience replay :)\n", "\n", @@ -423,28 +776,6 @@ "metadata": {}, "outputs": [], "source": [ - "import sys, os\n", - "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n", - " !touch .setup_complete\n", - "\n", - "# This code creates a virtual display to draw game images on.\n", - "# It will have no effect if your machine has a monitor.\n", - "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", - " !bash ../xvfb start\n", - " os.environ['DISPLAY'] = ':1'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "\n", "from IPython.display import clear_output" ] }, @@ -574,7 +905,7 @@ "metadata": {}, "outputs": [], "source": [ - "import gym\n", + "import gymnasium as gym\n", "env = gym.make(\"Taxi-v3\")\n", "n_actions = env.action_space.n" ] @@ -589,20 +920,20 @@ " t_max=10**4, 
replay_batch_size=32):\n", " \"\"\"\n", " This function should \n", - " - run a full game, actions given by agent.getAction(s)\n", + " - run a full game, actions given by agent.get_action(s)\n", " - train agent using agent.update(...) whenever possible\n", " - return total reward\n", " :param replay: ReplayBuffer where agent can store and sample (s,a,r,s',done) tuples.\n", " If None, do not use experience replay\n", " \"\"\"\n", " total_reward = 0.0\n", - " s = env.reset()\n", + " s, _ = env.reset()\n", "\n", " for t in range(t_max):\n", " # get agent to pick action given state s\n", " a = \n", "\n", - " next_s, r, done, _ = env.step(a)\n", + " next_s, r, done, _, _ = env.step(a)\n", "\n", " # update agent on current transition. Use agent.update\n", " \n", @@ -690,7 +1021,7 @@ "\n", "### Outro\n", "\n", - "We will use the code you just wrote extensively in the next week of our course. If you're feeling that you need more examples to understand how experience replay works, try using it for binarized state spaces (CartPole or other __[classic control envs](https://gym.openai.com/envs/#classic_control)__).\n", + "We will use the code you just wrote extensively in the next week of our course. 
If you're feeling that you need more examples to understand how experience replay works, try using it for discretized state spaces (CartPole or other __[classic control envs](https://gymnasium.farama.org/environments/classic_control/)__).\n", "\n", "__Next week__ we're gonna explore how q-learning and similar algorithms can be applied for large state spaces, with deep learning models to approximate the Q function.\n", "\n", diff --git a/week03_model_free/seminar_qlearning.ipynb b/week03_model_free/seminar_qlearning.ipynb deleted file mode 100644 index 7ef47effb..000000000 --- a/week03_model_free/seminar_qlearning.ipynb +++ /dev/null @@ -1,495 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Q-learning (3 points)\n", - "\n", - "This notebook will guide you through implementation of vanilla Q-learning algorithm.\n", - "\n", - "You need to implement QLearningAgent (follow instructions for each method) and use it on a number of tests below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys, os\n", - "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n", - "\n", - " !touch .setup_complete\n", - "\n", - "# This code creates a virtual display to draw game images on.\n", - "# It will have no effect if your machine has a monitor.\n", - "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", - " !bash ../xvfb start\n", - " os.environ['DISPLAY'] = ':1'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from collections import
defaultdict\n", - "import random\n", - "import math\n", - "import numpy as np\n", - "\n", - "\n", - "class QLearningAgent:\n", - " def __init__(self, alpha, epsilon, discount, get_legal_actions):\n", - " \"\"\"\n", - " Q-Learning Agent\n", - " based on https://inst.eecs.berkeley.edu/~cs188/sp19/projects.html\n", - " Instance variables you have access to\n", - " - self.epsilon (exploration prob)\n", - " - self.alpha (learning rate)\n", - " - self.discount (discount rate aka gamma)\n", - "\n", - " Functions you should use\n", - " - self.get_legal_actions(state) {state, hashable -> list of actions, each is hashable}\n", - " which returns legal actions for a state\n", - " - self.get_qvalue(state,action)\n", - " which returns Q(state,action)\n", - " - self.set_qvalue(state,action,value)\n", - " which sets Q(state,action) := value\n", - " !!!Important!!!\n", - " Note: please avoid using self._qValues directly. \n", - " There's a special self.get_qvalue/set_qvalue for that.\n", - " \"\"\"\n", - "\n", - " self.get_legal_actions = get_legal_actions\n", - " self._qvalues = defaultdict(lambda: defaultdict(lambda: 0))\n", - " self.alpha = alpha\n", - " self.epsilon = epsilon\n", - " self.discount = discount\n", - "\n", - " def get_qvalue(self, state, action):\n", - " \"\"\" Returns Q(state,action) \"\"\"\n", - " return self._qvalues[state][action]\n", - "\n", - " def set_qvalue(self, state, action, value):\n", - " \"\"\" Sets the Qvalue for [state,action] to the given value \"\"\"\n", - " self._qvalues[state][action] = value\n", - "\n", - " #---------------------START OF YOUR CODE---------------------#\n", - "\n", - " def get_value(self, state):\n", - " \"\"\"\n", - " Compute your agent's estimate of V(s) using current q-values\n", - " V(s) = max_over_action Q(state,action) over possible actions.\n", - " Note: please take into account that q-values can be negative.\n", - " \"\"\"\n", - " possible_actions = self.get_legal_actions(state)\n", - "\n", - " # If there are no legal 
actions, return 0.0\n", - " if len(possible_actions) == 0:\n", - " return 0.0\n", - "\n", - " \n", - "\n", - " return value\n", - "\n", - " def update(self, state, action, reward, next_state):\n", - " \"\"\"\n", - " You should do your Q-Value update here:\n", - " Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s'))\n", - " \"\"\"\n", - "\n", - " # agent parameters\n", - " gamma = self.discount\n", - " learning_rate = self.alpha\n", - "\n", - " \n", - "\n", - " self.set_qvalue(state, action, )\n", - "\n", - " def get_best_action(self, state):\n", - " \"\"\"\n", - " Compute the best action to take in a state (using current q-values). \n", - " \"\"\"\n", - " possible_actions = self.get_legal_actions(state)\n", - "\n", - " # If there are no legal actions, return None\n", - " if len(possible_actions) == 0:\n", - " return None\n", - "\n", - " \n", - "\n", - " return best_action\n", - "\n", - " def get_action(self, state):\n", - " \"\"\"\n", - " Compute the action to take in the current state, including exploration. \n", - " With probability self.epsilon, we should take a random action.\n", - " otherwise - the best policy action (self.get_best_action).\n", - "\n", - " Note: To pick randomly from a list, use random.choice(list). \n", - " To pick True or False with a given probablity, generate uniform number in [0, 1]\n", - " and compare it with your probability\n", - " \"\"\"\n", - "\n", - " # Pick Action\n", - " possible_actions = self.get_legal_actions(state)\n", - " action = None\n", - "\n", - " # If there are no legal actions, return None\n", - " if len(possible_actions) == 0:\n", - " return None\n", - "\n", - " # agent parameters:\n", - " epsilon = self.epsilon\n", - "\n", - " \n", - "\n", - " return chosen_action" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Try it on taxi\n", - "\n", - "Here we use the qlearning agent on taxi env from openai gym.\n", - "You will need to insert a few agent functions here." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import gym\n", - "env = gym.make(\"Taxi-v3\")\n", - "\n", - "n_actions = env.action_space.n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "agent = QLearningAgent(\n", - " alpha=0.5, epsilon=0.25, discount=0.99,\n", - " get_legal_actions=lambda s: range(n_actions))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def play_and_train(env, agent, t_max=10**4):\n", - " \"\"\"\n", - " This function should \n", - " - run a full game, actions given by agent's e-greedy policy\n", - " - train agent using agent.update(...) whenever it is possible\n", - " - return total reward\n", - " \"\"\"\n", - " total_reward = 0.0\n", - " s = env.reset()\n", - "\n", - " for t in range(t_max):\n", - " # get agent to pick action given state s.\n", - " a = \n", - "\n", - " next_s, r, done, _ = env.step(a)\n", - "\n", - " # train (update) agent for state s\n", - " \n", - "\n", - " s = next_s\n", - " total_reward += r\n", - " if done:\n", - " break\n", - "\n", - " return total_reward" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from IPython.display import clear_output\n", - "\n", - "rewards = []\n", - "for i in range(1000):\n", - " rewards.append(play_and_train(env, agent))\n", - " agent.epsilon *= 0.99\n", - "\n", - " if i % 100 == 0:\n", - " clear_output(True)\n", - " plt.title('eps = {:e}, mean reward = {:.1f}'.format(agent.epsilon, np.mean(rewards[-10:])))\n", - " plt.plot(rewards)\n", - " plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Binarized state spaces\n", - "\n", - "Use agent to train efficiently on `CartPole-v0`. This environment has a continuous set of possible states, so you will have to group them into bins somehow.\n", - "\n", - "The simplest way is to use `round(x, n_digits)` (or `np.round`) to round a real number to a given amount of digits. The tricky part is to get the `n_digits` right for each state to train effectively.\n", - "\n", - "Note that you don't need to convert state to integers, but to __tuples__ of any kind of values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def make_env():\n", - " return gym.make('CartPole-v0').env # .env unwraps the TimeLimit wrapper\n", - "\n", - "env = make_env()\n", - "n_actions = env.action_space.n\n", - "\n", - "print(\"first state: %s\" % (env.reset()))\n", - "plt.imshow(env.render('rgb_array'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Play a few games\n", - "\n", - "We need to estimate observation distributions. To do so, we'll play a few games and record all states." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def visualize_cartpole_observation_distribution(seen_observations):\n", - " seen_observations = np.array(seen_observations)\n", - " \n", - " # The meaning of the observations is documented in\n", - " # https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py\n", - "\n", - " f, axarr = plt.subplots(2, 2, figsize=(16, 9), sharey=True)\n", - " for i, title in enumerate(['Cart Position', 'Cart Velocity', 'Pole Angle', 'Pole Velocity At Tip']):\n", - " ax = axarr[i // 2, i % 2]\n", - " ax.hist(seen_observations[:, i], bins=20)\n", - " ax.set_title(title)\n", - " xmin, xmax = ax.get_xlim()\n", - " ax.set_xlim(min(xmin, -xmax), max(-xmin, xmax))\n", - " ax.grid()\n", - " f.tight_layout()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "seen_observations = []\n", - "for _ in range(1000):\n", - " seen_observations.append(env.reset())\n", - " done = False\n", - " while not done:\n", - " s, r, done, _ = env.step(env.action_space.sample())\n", - " seen_observations.append(s)\n", - "\n", - "visualize_cartpole_observation_distribution(seen_observations)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Binarize environment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from gym.core import ObservationWrapper\n", - "\n", - "\n", - "class Binarizer(ObservationWrapper):\n", - " def observation(self, state):\n", - " # Hint: you can do that with round(x, n_digits).\n", - " # You may pick a different n_digits for each dimension.\n", - " state = \n", - "\n", - " return tuple(state)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "env = Binarizer(make_env())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "seen_observations = []\n", - "for _ in range(1000):\n", - " seen_observations.append(env.reset())\n", - " done = False\n", - " while not done:\n", - " s, r, done, _ = env.step(env.action_space.sample())\n", - " seen_observations.append(s)\n", - " if done:\n", - " break\n", - "\n", - "visualize_cartpole_observation_distribution(seen_observations)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Learn binarized policy\n", - "\n", - "Now let's train a policy that uses binarized state space.\n", - "\n", - "__Tips:__\n", - "\n", - "* Note that increasing the number of digits for one dimension of the observations increases your state space by a factor of $10$.\n", - "* If your binarization is too fine-grained, your agent will take much 
longer than 10000 steps to converge. You can either increase the number of iterations and reduce epsilon decay or change binarization. In practice we found that this kind of mistake is rather frequent.\n", - "* If your binarization is too coarse, your agent may fail to find the optimal policy. In practice we found that on this particular environment this kind of mistake is rare.\n", - "* **Start with a coarse binarization** and make it more fine-grained if that seems necessary.\n", - "* Having $10^3$–$10^4$ distinct states is recommended (`len(agent._qvalues)`), but not required.\n", - "* If things don't work without annealing $\\varepsilon$, consider adding that, but make sure that it doesn't go to zero too quickly.\n", - "\n", - "A reasonable agent should attain an average reward of at least 50." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "def moving_average(x, span=100):\n", - " return pd.DataFrame({'x': np.asarray(x)}).x.ewm(span=span).mean().values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "agent = QLearningAgent(\n", - " alpha=0.5, epsilon=0.25, discount=0.99,\n", - " get_legal_actions=lambda s: range(n_actions))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "rewards = []\n", - "epsilons = []\n", - "\n", - "for i in range(10000):\n", - " reward = play_and_train(env, agent)\n", - " rewards.append(reward)\n", - " epsilons.append(agent.epsilon)\n", - " \n", - " # OPTIONAL: \n", - "\n", - " if i % 100 == 0:\n", - " rewards_ewma = moving_average(rewards)\n", - " \n", - " clear_output(True)\n", - " plt.plot(rewards, label='rewards')\n", - " plt.plot(rewards_ewma, label='rewards ewma@100')\n", - " plt.legend()\n", - " plt.grid()\n", - " plt.title('eps = {:e}, rewards ewma@100 = {:.1f}'.format(agent.epsilon, 
rewards_ewma[-1]))\n", - " plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print('Your agent has learned {} Q-values.'.format(len(agent._qvalues)))" - ] - } - ], - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/week04_[recap]_deep_learning/README.md b/week04_[recap]_deep_learning/README.md index ab19e5717..d5bc7139d 100644 --- a/week04_[recap]_deep_learning/README.md +++ b/week04_[recap]_deep_learning/README.md @@ -1,22 +1,20 @@ -__Note:__ This week's materials cover the basics of neural nets and deep learning and teach you how to use auto-diff frameworks. If you're already fluent in tensorflow OR pytorch OR theano - feel free to skip this week entirely.. +__Note:__ This week's materials cover the basics of neural nets and deep learning and teach you how to use auto-diff frameworks. If you're already fluent in Tensorflow or PyTorch, feel free to skip this week entirely. ## Materials * [__Lecture slides__](https://yadi.sk/i/yAO2AJ3M3EKP8g) - __In russian:__ * Basic lecture on deep learning - [video](https://yadi.sk/i/yyHZub6R3Ej5dV) - * Deep learning frameworks - [video](https://yadi.sk/i/hDIkaR4H3EtnXM) + * Deep learning frameworks - [video](https://yadi.sk/i/hDIkaR4H3EtnXM) * [Pytorch tutorial](https://yadi.sk/i/O3mQ76u43So3h9) __recommended__ * [Tensorflow tutorial](https://www.youtube.com/watch?v=FQ660T4uu7k) (english only for now. 
Links are welcome) - * [Theano tutorial](https://yadi.sk/i/54STsEBVpubkn) - __In english:__ * Intro to neural nets and backprop (english) - [video](https://www.youtube.com/watch?v=uXt8qF2Zzfo) * Intro to convnets - [video](https://www.youtube.com/watch?v=FmpDIaiMIeA) * Deep learning frameworks - [video](https://www.youtube.com/watch?v=Vf_-OkqbwPo) * [Tensorflow tutorial](https://www.youtube.com/watch?v=FQ660T4uu7k) - * [Theano tutorial](https://www.youtube.com/watch?v=OU8I1oJ9HhI) - * [Pytorch tutorial](https://www.youtube.com/watch?v=VMcRWYEKmhw) + * [PyTorch tutorial](https://www.youtube.com/watch?v=VMcRWYEKmhw) ## Bonus materials * Karpathy's course on deep learning (english) - http://cs231n.github.io/ @@ -26,21 +24,19 @@ __Note:__ This week's materials cover the basics of neural nets and deep learnin * Deep learning demystified - [video](https://www.youtube.com/watch?v=Q9Z20HCPnww) * Karpathy's lecture on deep learning for computer vision - https://www.youtube.com/watch?v=u6aEYuemt0M * Our humble DL course: [HSE'fall17](https://github.com/yandexdataschool/HSE_deeplearning), [Skoltech/YSDA'spring16](https://github.com/ddtm/dl-course/) courses on deep learning (english). -* Srsly, just google `"deep learning %s"%s for s in what_you_want_to_know`. +* Srsly, just google `"deep learning %s" % s for s in what_you_want_to_know`. + - ### Practice -__[Colab url (pytorch)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/master/week04_%5Brecap%5D_deep_learning/seminar_pytorch.ipynb)__ -From now on, we'll have two tracks: theano and tensorflow. We'll also add pytorch seminars as soon as they're ready. +__[Colab URL (PyTorch)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/master/week04_%5Brecap%5D_deep_learning/seminar_pytorch.ipynb)__ +From now on, we'll have two tracks: Tensorflow and PyTorch. -Please pick seminar_theano.ipynb, seminar_tensorflow.ipynb or seminar_pytorch.ipynb. 
+Please pick `seminar_tensorflow.ipynb` or `seminar_pytorch.ipynb`. __Note:__ in this and all following weeks you're only required to get through practice in _one_ of the frameworks. Looking into other alternatives is great for self-education but never mandatory. #### What to choose? * The simplest choice is PyTorch: it's basically ye olde numpy with automatic gradients and a lot of pre-implemented DL stuff... except all the functions have different names. * If you want to be familiar with production-related stuff from day 1, choose TensorFlow. It's much more convenient to deploy (to non-python or to mobiles). The catch is that all those conveniences become inconveniences once you want to write something simple in jupyter. -* Theano works like tensorflow but it offers a numpy-compatible interface and comes with built-in graph optimization. The payoff is that theano is not as popular as the first two. It is also not meant as a producton framework so deploying to mobiles may be a problem. * It's not like choosing house at Hogwarts, you'll be able to switch between frameworks easily once you master the underlying principles. 
- diff --git a/week04_[recap]_deep_learning/mnist.py b/week04_[recap]_deep_learning/mnist.py index e3d79b7db..70d61958e 100644 --- a/week04_[recap]_deep_learning/mnist.py +++ b/week04_[recap]_deep_learning/mnist.py @@ -1,7 +1,5 @@ import sys import os -import time - import numpy as np __doc__ = """taken from https://github.com/Lasagne/Lasagne/blob/master/examples/mnist.py""" diff --git a/week04_[recap]_deep_learning/practice_lasagne.ipynb b/week04_[recap]_deep_learning/practice_lasagne.ipynb deleted file mode 100644 index 3ff11ce71..000000000 --- a/week04_[recap]_deep_learning/practice_lasagne.ipynb +++ /dev/null @@ -1,1304 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Theano, Lasagne\n", - "and why they matter\n", - "\n", - "\n", - "### got no lasagne?\n", - "Install the __bleeding edge__ version from here: http://lasagne.readthedocs.org/en/latest/user/installation.html" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Warming up\n", - "* Implement a function that computes the sum of squares of numbers from 0 to N\n", - "* Use numpy or python\n", - "* An array of numbers 0 to N - numpy.arange(N)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "\n", - "def sum_squares(N):\n", - " return " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "sum_squares(10**8)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# theano teaser\n", - "\n", - "Doing the very same thing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import theano\n", - "import theano.tensor as T" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "\n", - "# I gonna be function parameter\n", - "N 
= T.scalar(\"a dimension\", dtype='int32')\n", - "\n", - "\n", - "# i am a recipe on how to produce sum of squares of arange of N given N\n", - "result = (T.arange(N)**2).sum()\n", - "\n", - "# Compiling the recipe of computing \"result\" given N\n", - "sum_function = theano.function(inputs=[N], outputs=result)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "sum_function(10**8)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# How does it work?\n", - "* 1 You define inputs f your future function;\n", - "* 2 You write a recipe for some transformation of inputs;\n", - "* 3 You compile it;\n", - "* You have just got a function!\n", - "* The gobbledegooky version: you define a function as symbolic computation graph.\n", - "\n", - "\n", - "* There are two main kinвs of entities: \"Inputs\" and \"Transformations\"\n", - "* Both can be numbers, vectors, matrices, tensors, etc.\n", - "* Both can be integers, floats of booleans (uint8) of various size.\n", - "\n", - "\n", - "* An input is a placeholder for function parameters.\n", - " * N from example above\n", - "\n", - "\n", - "* Transformations are the recipes for computing something given inputs and transformation\n", - " * (T.arange(N)^2).sum() are 3 sequential transformations of N\n", - " * Doubles all functions of numpy vector syntax\n", - " * You can almost always go with replacing \"np.function\" with \"T.function\" aka \"theano.tensor.function\"\n", - " * np.mean -> T.mean\n", - " * np.arange -> T.arange\n", - " * np.cumsum -> T.cumsum\n", - " * and so on.\n", - " * builtin operations also work that way\n", - " * np.arange(10).mean() -> T.arange(10).mean()\n", - " * Once upon a blue moon the functions have different names or locations (e.g. T.extra_ops)\n", - " * Ask us or google it\n", - " \n", - " \n", - "Still confused? We gonna fix that." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Inputs\n", - "example_input_integer = T.scalar(\"scalar input\", dtype='float32')\n", - "\n", - "# dtype = theano.config.floatX by default\n", - "example_input_tensor = T.tensor4(\"four dimensional tensor input\")\n", - "# не бойся, тензор нам не пригодится\n", - "\n", - "\n", - "input_vector = T.vector(\"my vector\", dtype='int32') # vector of integers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Transformations\n", - "\n", - "# transofrmation: elementwise multiplication\n", - "double_the_vector = input_vector*2\n", - "\n", - "# elementwise cosine\n", - "elementwise_cosine = T.cos(input_vector)\n", - "\n", - "# difference between squared vector and vector itself\n", - "vector_squares = input_vector**2 - input_vector" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Practice time:\n", - "# create two vectors of size float32\n", - "my_vector = student.init_float32_vector()\n", - "my_vector2 = student.init_one_more_such_vector()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Write a transformation(recipe):\n", - "#(vec1)*(vec2) / (sin(vec1) +1)\n", - "my_transformation = student.implementwhatwaswrittenabove()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(my_transformation)\n", - "# it's okay it aint a number" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# What's inside the transformation\n", - "theano.printing.debugprint(my_transformation)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Compiling\n", - "* So far we were using \"symbolic\" variables and transformations\n", 
- " * Defining the recipe for computation, but not computing anything\n", - "* To use the recipe, one should compile it" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "inputs = [ ]\n", - "outputs = [ ]\n", - "\n", - "# The next lines compile a function that takes two vectors and computes your transformation\n", - "my_function = theano.function(\n", - " inputs, outputs,\n", - " # automatic type casting for input parameters (e.g. float64 -> float32)\n", - " allow_input_downcast=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# using function with, lists:\n", - "print \"using python lists:\"\n", - "print my_function([1, 2, 3], [4, 5, 6])\n", - "print\n", - "\n", - "# Or using numpy arrays:\n", - "# btw, that 'float' dtype is casted to secong parameter dtype which is float32\n", - "print \"using numpy arrays:\"\n", - "print my_function(np.arange(10),\n", - " np.linspace(5, 6, 10, dtype='float'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Debugging\n", - "* Compilation can take a while for big functions\n", - "* To avoid waiting, one can evaluate transformations without compiling\n", - "* Without compilation, the code runs slower, so consider reducing input size\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# a dictionary of inputs\n", - "my_function_inputs = {\n", - " my_vector: [1, 2, 3],\n", - " my_vector2: [4, 5, 6]\n", - "}\n", - "\n", - "# evaluate my_transformation\n", - "# has to match with compiled function output\n", - "print my_transformation.eval(my_function_inputs)\n", - "\n", - "\n", - "# can compute transformations on the fly\n", - "print(\"add 2 vectors\", (my_vector + my_vector2).eval(my_function_inputs))\n", - "\n", - "#!WARNING! 
if your transformation only depends on some inputs,\n", - "# do not provide the rest of them\n", - "print(\"vector's shape:\", my_vector.shape.eval({\n", - " my_vector: [1, 2, 3]\n", - "}))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "* When debugging, it's usually a good idea to reduce the scale of your computation. E.g. if you train on batches of 128 objects, debug on 2-3.\n", - "* If it's imperative that you run a large batch of data, consider compiling with mode='debug' instead" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Your turn: Mean Squared Error (2 pts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Quest #1 - implement a function that computes a mean squared error of two input vectors\n", - "# Your function has to take 2 vectors and return a single number\n", - "\n", - "\n", - "\n", - "compute_mse = " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Tests\n", - "from sklearn.metrics import mean_squared_error\n", - "\n", - "for n in [1, 5, 10, 10**3]:\n", - "\n", - " elems = [np.arange(n), np.arange(n, 0, -1), np.zeros(n),\n", - " np.ones(n), np.random.random(n), np.random.randint(100, size=n)]\n", - "\n", - " for el in elems:\n", - " for el_2 in elems:\n", - " true_mse = np.array(mean_squared_error(el, el_2))\n", - " my_mse = compute_mse(el, el_2)\n", - " if not np.allclose(true_mse, my_mse):\n", - " print('Wrong result:')\n", - " print('mse(%s,%s)' % (el, el_2))\n", - " print(\"should be: %f, but your function returned %f\" %\n", - " (true_mse, my_mse))\n", - " raise ValueError(\"Что-то не так\")\n", - "\n", - "print(\"All tests passed\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Shared variables\n", - "\n", - "* The inputs and transformations only exist when function is called\n", - "\n", - "* Shared variables always 
stay in memory like global variables\n", - " * Shared variables can be included into a symbolic graph\n", - " * They can be set and evaluated using special methods\n", - " * but they can't change value arbitrarily during symbolic graph computation\n", - " * we'll cover that later;\n", - " \n", - " \n", - "* Hint: such variables are a perfect place to store network parameters\n", - " * e.g. weights or some metadata" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# creating shared variable\n", - "shared_vector_1 = theano.shared(np.ones(10, dtype='float64'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# evaluating shared variable (outside symbolicd graph)\n", - "print(\"initial value\", shared_vector_1.get_value())\n", - "\n", - "# within symbolic graph you use them just as any other inout or transformation, not \"get value\" needed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# setting new value\n", - "shared_vector_1.set_value(np.arange(5))\n", - "\n", - "# getting that new value\n", - "print(\"new value\", shared_vector_1.get_value())\n", - "\n", - "# Note that the vector changed shape\n", - "# This is entirely allowed... 
unless your graph is hard-wired to work with some fixed shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Your turn" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Write a recipe (transformation) that computes an elementwise transformation of shared_vector and input_scalar\n", - "# Compile as a function of input_scalar\n", - "\n", - "input_scalar = T.scalar('coefficient', dtype='float32')\n", - "\n", - "scalar_times_shared = \n", - "\n", - "\n", - "shared_times_n = " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print \"shared:\", shared_vector_1.get_value()\n", - "\n", - "print \"shared_times_n(5)\", shared_times_n(5)\n", - "\n", - "print \"shared_times_n(-0.5)\", shared_times_n(-0.5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Changing value of vector 1 (output should change)\n", - "shared_vector_1.set_value([-1, 0, 1])\n", - "print \"shared:\", shared_vector_1.get_value()\n", - "\n", - "print \"shared_times_n(5)\", shared_times_n(5)\n", - "\n", - "print \"shared_times_n(-0.5)\", shared_times_n(-0.5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# T.grad - why theano matters\n", - "* Theano can compute derivatives and gradients automatically\n", - "* Derivatives are computed symbolically, not numerically\n", - "\n", - "Limitations:\n", - "* You can only compute a gradient of a __scalar__ transformation over one or several scalar or vector (or tensor) transformations or inputs.\n", - "* A transformation has to have float32 or float64 dtype throughout the whole computation graph\n", - " * derivative over an integer has no mathematical sense\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "my_scalar = T.scalar(name='input', 
dtype='float64')\n", - "\n", - "scalar_squared = T.sum(my_scalar**2)\n", - "\n", - "# a derivative of v_squared by my_vector\n", - "derivative = T.grad(scalar_squared, my_scalar)\n", - "\n", - "fun = theano.function([my_scalar], scalar_squared)\n", - "grad = theano.function([my_scalar], derivative)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "\n", - "\n", - "x = np.linspace(-3, 3)\n", - "x_squared = list(map(fun, x))\n", - "x_squared_der = list(map(grad, x))\n", - "\n", - "plt.plot(x, x_squared, label=\"x^2\")\n", - "plt.plot(x, x_squared_der, label=\"derivative\")\n", - "plt.legend()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Why that rocks" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "my_vector = T.vector('float64')\n", - "\n", - "# Compute the gradient of the next weird function over my_scalar and my_vector\n", - "# warning! 
Trying to understand the meaning of that function may result in permanent brain damage\n", - "\n", - "weird_psychotic_function = ((my_vector+my_scalar)**(1+T.var(my_vector)) + 1./T.arcsinh(my_scalar)).mean()/(my_scalar**2 + 1) + 0.01*T.sin(2*my_scalar**1.5)*(\n", - " T.sum(my_vector) * my_scalar**2)*T.exp((my_scalar-4)**2)/(1+T.exp((my_scalar-4)**2))*(1.-(T.exp(-(my_scalar-4)**2))/(1+T.exp(-(my_scalar-4)**2)))**2\n", - "\n", - "\n", - "der_by_scalar, der_by_vector = \n", - "\n", - "\n", - "compute_weird_function = theano.function(\n", - " [my_scalar, my_vector], weird_psychotic_function)\n", - "compute_der_by_scalar = theano.function([my_scalar, my_vector], der_by_scalar)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Plotting your derivative\n", - "vector_0 = [1, 2, 3]\n", - "\n", - "scalar_space = np.linspace(0, 7)\n", - "\n", - "y = [compute_weird_function(x, vector_0) for x in scalar_space]\n", - "plt.plot(scalar_space, y, label='function')\n", - "y_der_by_scalar = [compute_der_by_scalar(x, vector_0) for x in scalar_space]\n", - "plt.plot(scalar_space, y_der_by_scalar, label='derivative')\n", - "plt.grid()\n", - "plt.legend()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Almost done - Updates\n", - "\n", - "* updates are a way of changing shared variables at after function call.\n", - "\n", - "* technically it's a dictionary {shared_variable : a recipe for new value} which is has to be provided when function is compiled\n", - "\n", - "That's how it works:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Multiply shared vector by a number and save the product back into shared vector\n", - "\n", - "inputs = [input_scalar]\n", - "outputs = [scalar_times_shared] # return vector times scalar\n", - "\n", - "my_updates = {\n", - " # and write this same result bach into shared_vector_1\n", - " 
shared_vector_1: scalar_times_shared\n", - "}\n", - "\n", - "compute_and_save = theano.function(inputs, outputs, updates=my_updates)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "shared_vector_1.set_value(np.arange(5))\n", - "\n", - "# initial shared_vector_1\n", - "print(\"initial shared value:\", shared_vector_1.get_value())\n", - "\n", - "# evaluating the function (shared_vector_1 will be changed)\n", - "print(\"compute_and_save(2) returns\", compute_and_save(2))\n", - "\n", - "# evaluate new shared_vector_1\n", - "print(\"new shared value:\", shared_vector_1.get_value())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Logistic regression example (4 pts)\n", - "\n", - "Implement the regular logistic regression training algorithm\n", - "\n", - "Tips:\n", - "* Weights fit in as a shared variable\n", - "* X and y are potential inputs\n", - "* Compile 2 functions:\n", - " * train_function(X,y) - returns error and computes weights' new values __(through updates)__\n", - " * predict_fun(X) - just computes probabilities (\"y\") given data\n", - " \n", - " \n", - "We shall train on a two-class MNIST dataset\n", - "* please note that target y are {0,1} and not {-1,1} as in some formulae" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.datasets import load_digits\n", - "mnist = load_digits(2)\n", - "\n", - "X, y = mnist.data, mnist.target\n", - "\n", - "\n", - "print(\"y [shape - %s]:\" % (str(y.shape)), y[:10])\n", - "\n", - "print(\"X [shape - %s]:\" % (str(X.shape)))\n", - "print(X[:3])\n", - "print(y[:10])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# inputs and shareds\n", - "shared_weights = \n", - "input_y = " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": 
[ - "predicted_y = \n", - "\n", - "\n", - "loss = \n", - "\n", - "\n", - "grad = \n", - "\n", - "\n", - "updates = {\n", - " shared_weights: \n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "train_function = \n", - "\n", - "predict_function = " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.cross_validation import train_test_split\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.metrics import roc_auc_score\n", - "\n", - "for i in range(5):\n", - " loss_i = train_function(X_train, y_train)\n", - " print(\"loss at iter %i:%.4f\" % (i, loss_i))\n", - " print(\"train auc:\", roc_auc_score(y_train, predict_function(X_train)))\n", - " print(\"test auc:\", roc_auc_score(y_test, predict_function(X_test)))\n", - "\n", - "\n", - "print(\"resulting weights:\")\n", - "plt.imshow(shared_weights.get_value().reshape(8, -1))\n", - "plt.colorbar()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# lasagne\n", - "* lasagne is a library for neural network building and training\n", - "* it's a low-level library with almost seamless integration with theano\n", - "\n", - "For a demo we shall solve the same digit recognition problem, but at a different scale\n", - "* images are now 28x28\n", - "* 10 different digits\n", - "* 50k samples" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mnist import load_dataset\n", - "X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()\n", - "\n", - "print X_train.shape, y_train.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import lasagne\n", - "\n", - "input_X = 
T.tensor4(\"X\")\n", - "\n", - "# input dimention (None means \"Arbitrary\" and only works at the first axes [samples])\n", - "input_shape = [None, 1, 28, 28]\n", - "\n", - "target_y = T.vector(\"target Y integer\", dtype='int32')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Defining network architecture" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Input layer (auxilary)\n", - "input_layer = lasagne.layers.InputLayer(shape=input_shape, input_var=input_X)\n", - "\n", - "# fully connected layer, that takes input layer and applies 50 neurons to it.\n", - "# nonlinearity here is sigmoid as in logistic regression\n", - "# you can give a name to each layer (optional)\n", - "dense_1 = lasagne.layers.DenseLayer(input_layer, num_units=50,\n", - " nonlinearity=lasagne.nonlinearities.sigmoid,\n", - " name=\"hidden_dense_layer\")\n", - "\n", - "# fully connected output layer that takes dense_1 as input and has 10 neurons (1 for each digit)\n", - "# We use softmax nonlinearity to make probabilities add up to 1\n", - "dense_output = lasagne.layers.DenseLayer(dense_1, num_units=10,\n", - " nonlinearity=lasagne.nonlinearities.softmax,\n", - " name='output')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# network prediction (theano-transformation)\n", - "y_predicted = lasagne.layers.get_output(dense_output)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# all network weights (shared variables)\n", - "all_weights = lasagne.layers.get_all_params(dense_output)\n", - "print(all_weights)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Than you could simply\n", - "* define loss function manually\n", - "* compute error gradient over all weights\n", - "* define updates\n", - "* But that's a whole lot of work and life's 
short\n", - " * not to mention life's too short to wait for SGD to converge\n", - "\n", - "Instead, we shall use Lasagne builtins" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Mean categorical crossentropy as a loss function - similar to logistic loss but for multiclass targets\n", - "loss = lasagne.objectives.categorical_crossentropy(\n", - " y_predicted, target_y).mean()\n", - "\n", - "# prediction accuracy\n", - "accuracy = lasagne.objectives.categorical_accuracy(\n", - " y_predicted, target_y).mean()\n", - "\n", - "# This function computes gradient AND composes weight updates just like you did earlier\n", - "updates_sgd = lasagne.updates.sgd(loss, all_weights, learning_rate=0.01)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# function that computes loss and updates weights\n", - "train_fun = theano.function(\n", - " [input_X, target_y], [loss, accuracy], updates=updates_sgd)\n", - "\n", - "# function that just computes accuracy\n", - "accuracy_fun = theano.function([input_X, target_y], accuracy)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### That's all, now let's train it!\n", - "* We got a lot of data, so it's recommended that you use SGD\n", - "* So let's implement a function that splits the training sample into minibatches" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# An auxilary function that returns mini-batches for neural network training\n", - "\n", - "# Parameters\n", - "# X - a tensor of images with shape (many, 1, 28, 28), e.g. X_train\n", - "# y - a vector of answers for corresponding images e.g. 
Y_train\n", - "# batch_size - a single number - the intended size of each batches\n", - "\n", - "# What do need to implement\n", - "# 1) Shuffle data\n", - "# - Gotta shuffle X and y the same way not to break the correspondence between X_i and y_i\n", - "# 3) Split data into minibatches of batch_size\n", - "# - If data size is not a multiple of batch_size, make one last batch smaller.\n", - "# 4) return a list (or an iterator) of pairs\n", - "# - (подгруппа картинок, ответы из y на эту подгруппу)\n", - "\n", - "\n", - "def iterate_minibatches(X, y, batchsize):\n", - "\n", - " \n", - "\n", - "\n", - "#\n", - "#\n", - "#\n", - "#\n", - "#\n", - "#\n", - "#\n", - "#\n", - "#\n", - "#\n", - "#\n", - "#\n", - "#\n", - "#\n", - "#\n", - "#\n", - "#\n", - "#\n", - "#\n", - "#\n", - "#\n", - "#\n", - "#\n", - "# You feel lost and wish you stayed home tonight?\n", - "# Go search for a similar function at\n", - "# https://github.com/Lasagne/Lasagne/blob/master/examples/mnist.py" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Training loop" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "\n", - "num_epochs = 100 # amount of passes through the data\n", - "\n", - "batch_size = 50 # number of samples processed at each function call\n", - "\n", - "for epoch in range(num_epochs):\n", - " # In each epoch, we do a full pass over the training data:\n", - " train_err = 0\n", - " train_acc = 0\n", - " train_batches = 0\n", - " start_time = time.time()\n", - " for batch in iterate_minibatches(X_train, y_train, batch_size):\n", - " inputs, targets = batch\n", - " train_err_batch, train_acc_batch = train_fun(inputs, targets)\n", - " train_err += train_err_batch\n", - " train_acc += train_acc_batch\n", - " train_batches += 1\n", - "\n", - " # And a full pass over the validation data:\n", - " val_acc = 0\n", - " val_batches = 0\n", - " for batch in iterate_minibatches(X_val, 
y_val, batch_size):\n", - " inputs, targets = batch\n", - " val_acc += accuracy_fun(inputs, targets)\n", - " val_batches += 1\n", - "\n", - " # Then we print the results for this epoch:\n", - " print(\"Epoch {} of {} took {:.3f}s\".format(\n", - " epoch + 1, num_epochs, time.time() - start_time))\n", - "\n", - " print(\n", - " \" training loss (in-iteration):\\t\\t{:.6f}\".format(train_err / train_batches))\n", - " print(\" train accuracy:\\t\\t{:.2f} %\".format(\n", - " train_acc / train_batches * 100))\n", - " print(\" validation accuracy:\\t\\t{:.2f} %\".format(\n", - " val_acc / val_batches * 100))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "test_acc = 0\n", - "test_batches = 0\n", - "for batch in iterate_minibatches(X_test, y_test, 500):\n", - " inputs, targets = batch\n", - " acc = accuracy_fun(inputs, targets)\n", - " test_acc += acc\n", - " test_batches += 1\n", - "print(\"Final results:\")\n", - "print(\" test accuracy:\\t\\t{:.2f} %\".format(\n", - " test_acc / test_batches * 100))\n", - "\n", - "if test_acc / test_batches * 100 > 99:\n", - " print(\"Achievement unlocked: 80lvl Warlock!\")\n", - "else:\n", - " print(\"We need more magic!\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# A better network ( 4+ pts )\n", - "\n", - "\n", - "* The quest is to create a network that gets at least 99% at test set\n", - " * In case you tried several architectures and have a __detailed__ report - 97.5% \"is fine too\". \n", - " * __+1 bonus point__ each 0.1% past 99%\n", - " * More points for creative approach\n", - " \n", - "__ There is a mini-report at the end that you will have to fill in. We recommend to read it first and fill in while you are iterating. 
__\n", - " \n", - "\n", - "## Tips on what can be done:\n", - "\n", - "\n", - "\n", - " * Network size\n", - " * MOAR neurons, \n", - " * MOAR layers, \n", - " * Convolutions are almost imperative\n", - " * Пх'нглуи мглв'нафх Ктулху Р'льех вгах'нагл фхтагн! \n", - " \n", - " \n", - " \n", - " * Regularize to prevent overfitting\n", - " * Add some L2 weight norm to the loss function, theano will do the rest\n", - " * Can be done manually or via - http://lasagne.readthedocs.org/en/latest/modules/regularization.html\n", - " \n", - " \n", - " \n", - " * Better optimization - rmsprop, nesterov_momentum, adadelta, adagrad and so on.\n", - " * Converge faster and sometimes reach better optima\n", - " * It might make sense to tweak learning rate, other learning parameters, batch size and number of epochs\n", - " \n", - " \n", - " \n", - " * Dropout - to prevent overfitting\n", - " * `lasagne.layers.DropoutLayer(prev_layer, p=probability_to_zero_out)`\n", - " \n", - " \n", - " \n", - " * Convolution layers\n", - " * `network = lasagne.layers.Conv2DLayer(prev_layer,`\n", - " ` num_filters = n_neurons,`\n", - " ` filter_size = (filter width, filter height),`\n", - " ` nonlinearity = some_nonlinearity)`\n", - " * Warning! 
Training convolutional networks can take long without GPU.\n", - " * If you are CPU-only, we still recomment to try a simple convolutional architecture\n", - " * a perfect option is if you can set it up to run at nighttime and check it up at the morning.\n", - " \n", - " * Plenty other layers and architectures\n", - " * http://lasagne.readthedocs.org/en/latest/modules/layers.html\n", - " * batch normalization, pooling, etc\n", - " \n", - " \n", - " * Nonlinearities in the hidden layers\n", - " * tanh, relu, leaky relu, etc\n", - " \n", - " \n", - " \n", - " \n", - "There is a template for your solution below that you can opt to use or throw away and write it your way" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from mnist import load_dataset\n", - "X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()\n", - "\n", - "print X_train.shape, y_train.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import lasagne\n", - "\n", - "input_X = T.tensor4(\"X\")\n", - "\n", - "# input dimention (None means \"Arbitrary\" and only works at the first axes [samples])\n", - "input_shape = [None, 1, 28, 28]\n", - "\n", - "target_y = T.vector(\"target Y integer\", dtype='int32')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Input layer (auxilary)\n", - "input_layer = lasagne.layers.InputLayer(shape=input_shape, input_var=input_X)\n", - "\n", - "\n", - "\n", - "dense_output = " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Network predictions (theano-transformation)\n", - "y_predicted = lasagne.layers.get_output(dense_output)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# All weights (shared-varaibles)\n", - "# \"trainable\" 
flag means not to return auxilary params like batch mean (for batch normalization)\n", - "all_weights = lasagne.layers.get_all_params(dense_output, trainable=True)\n", - "print(all_weights)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# loss function\n", - "loss = \n", - "\n", - "# \n", - "\n", - "accuracy = \n", - "\n", - "# weight updates\n", - "updates = " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# A function that accepts X and y, returns loss functions and performs weight updates\n", - "train_fun = theano.function(\n", - " [input_X, target_y], [loss, accuracy], updates=updates_sgd)\n", - "\n", - "# A function that just computes accuracy given X and y\n", - "accuracy_fun = theano.function([input_X, target_y], accuracy)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# итерации обучения\n", - "\n", - "num_epochs = \n", - "\n", - "batch_size = \n", - "\n", - "for epoch in range(num_epochs):\n", - " # In each epoch, we do a full pass over the training data:\n", - " train_err = 0\n", - " train_acc = 0\n", - " train_batches = 0\n", - " start_time = time.time()\n", - " for batch in iterate_minibatches(X_train, y_train, batch_size):\n", - " inputs, targets = batch\n", - " train_err_batch, train_acc_batch = train_fun(inputs, targets)\n", - " train_err += train_err_batch\n", - " train_acc += train_acc_batch\n", - " train_batches += 1\n", - "\n", - " # And a full pass over the validation data:\n", - " val_acc = 0\n", - " val_batches = 0\n", - " for batch in iterate_minibatches(X_val, y_val, batch_size):\n", - " inputs, targets = batch\n", - " val_acc += accuracy_fun(inputs, targets)\n", - " val_batches += 1\n", - "\n", - " # Then we print the results for this epoch:\n", - " print(\"Epoch {} of {} took {:.3f}s\".format(\n", - " epoch + 1, num_epochs, time.time() - 
start_time))\n", - "\n", - " print(\n", - " \" training loss (in-iteration):\\t\\t{:.6f}\".format(train_err / train_batches))\n", - " print(\" train accuracy:\\t\\t{:.2f} %\".format(\n", - " train_acc / train_batches * 100))\n", - " print(\" validation accuracy:\\t\\t{:.2f} %\".format(\n", - " val_acc / val_batches * 100))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "test_acc = 0\n", - "test_batches = 0\n", - "for batch in iterate_minibatches(X_test, y_test, 500):\n", - " inputs, targets = batch\n", - " acc = accuracy_fun(inputs, targets)\n", - " test_acc += acc\n", - " test_batches += 1\n", - "print(\"Final results:\")\n", - "print(\" test accuracy:\\t\\t{:.2f} %\".format(\n", - " test_acc / test_batches * 100))\n", - "\n", - "if test_acc / test_batches * 100 > 99:\n", - " print(\"Achievement unlocked: 80lvl Warlock!\")\n", - "else:\n", - " print(\"We need more magic!\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Report\n", - "\n", - "All creative approaches are highly welcome, but at the very least it would be great to mention\n", - "* the idea;\n", - "* brief history of tweaks and improvements;\n", - "* what is the final architecture and why?\n", - "* what is the training method and, again, why?\n", - "* Any regularizations and other techniques applied and their effects;\n", - "\n", - "\n", - "There is no need to write strict mathematical proofs (unless you want to).\n", - " * \"I tried this, this and this, and the second one turned out to be better. 
And i just didn't like the name of that one\" - OK, but can be better\n", - " * \"I have analized these and these articles|sources|blog posts, tried that and that to adapt them to my problem and the conclusions are such and such\" - the ideal one\n", - " * \"I took that code that demo without understanding it, but i'll never confess that and instead i'll make up some pseudoscientific explaination\" - __not_ok__" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Hi, my name is `___ ___`, and here's my story\n", - "\n", - "A long ago in a galaxy far far away, when it was still more than an hour before deadline, i got an idea:\n", - "\n", - "##### I gonna build a neural network, that\n", - "* brief text on what was\n", - "* the original idea\n", - "* and why it was so\n", - "\n", - "How could i be so naive?!\n", - "\n", - "##### One day, with no signs of warning,\n", - "This thing has finally converged and\n", - "* Some explaination about what were the results,\n", - "* what worked and what didn't\n", - "* most importantly - what next steps were taken, if any\n", - "* and what were their respective outcomes\n", - "\n", - "##### Finally, after __ iterations, __ mugs of [tea/coffee]\n", - "* what was the final architecture\n", - "* as well as training method and tricks\n", - "\n", - "That, having wasted ____ [minutes, hours or days] of my life training, got\n", - "\n", - "* accuracy on training: __\n", - "* accuracy on validation: __\n", - "* accuracy on test: __\n", - "\n", - "\n", - "[an optional afterword and mortal curses on assignment authors]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/week04_[recap]_deep_learning/seminar_pytorch.ipynb b/week04_[recap]_deep_learning/seminar_pytorch.ipynb index 
e10863802..d2ec82ade 100644 --- a/week04_[recap]_deep_learning/seminar_pytorch.ipynb +++ b/week04_[recap]_deep_learning/seminar_pytorch.ipynb @@ -8,9 +8,9 @@ "\n", "![img](https://pytorch.org/tutorials/_static/pytorch-logo-dark.svg)\n", "\n", - "__This notebook__ will teach you to use PyTorch low-level core. You can install it [here](http://pytorch.org/). For high-level interface see the next notebook.\n", + "__This notebook__ will teach you to use PyTorch low-level core. If you're running this notebook outside the course environment, you can install it [here](https://pytorch.org).\n", "\n", - "__PyTorch feels__ differently than tensorflow/theano on almost every level. TensorFlow makes your code live in two \"worlds\" simultaneously: symbolic graphs and actual tensors. First you declare a symbolic \"recipe\" of how to get from inputs to outputs, then feed it with actual minibatches of data. In PyTorch, __there's only one world__: all tensors have a numeric value.\n", + "__PyTorch feels__ different from TensorFlow on almost every level. TensorFlow makes your code live in two \"worlds\" simultaneously: symbolic graphs and actual tensors. First you declare a symbolic \"recipe\" of how to get from inputs to outputs, then feed it with actual minibatches of data. In PyTorch, __there's only one world__: all tensors have a numeric value.\n", "\n", "You compute outputs on the fly without pre-declaring anything. The code looks exactly as in pure numpy with one exception: PyTorch computes gradients for you. And can run stuff on GPU. And has a number of pre-implemented building blocks for your neural nets.
[And a few more things.](https://medium.com/towards-data-science/pytorch-vs-tensorflow-spotting-the-difference-25c75777377b)\n", "\n", @@ -25,8 +25,7 @@ "source": [ "import sys, os\n", "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/week04_%5Brecap%5D_deep_learning/notmnist.py\n", - "\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_%5Brecap%5D_deep_learning/notmnist.py\n", " !touch .setup_complete" ] }, @@ -58,30 +57,30 @@ "name": "stdout", "output_type": "stream", "text": [ - "X :\n", + "X:\n", "[[ 0 1 2 3]\n", " [ 4 5 6 7]\n", " [ 8 9 10 11]\n", " [12 13 14 15]]\n", "\n", - "X.shape : (4, 4)\n", + "X.shape: (4, 4)\n", "\n", - "add 5 :\n", + "add 5:\n", "[[ 5 6 7 8]\n", " [ 9 10 11 12]\n", " [13 14 15 16]\n", " [17 18 19 20]]\n", "\n", - "X*X^T :\n", + "X*X^T:\n", "[[ 14 38 62 86]\n", " [ 38 126 214 302]\n", " [ 62 214 366 518]\n", " [ 86 302 518 734]]\n", "\n", - "mean over cols :\n", + "mean over rows:\n", "[ 1.5 5.5 9.5 13.5]\n", "\n", - "cumsum of cols :\n", + "cumsum of cols:\n", "[[ 0 1 2 3]\n", " [ 4 6 8 10]\n", " [12 15 18 21]\n", @@ -95,12 +94,12 @@ "\n", "x = np.arange(16).reshape(4, 4)\n", "\n", - "print(\"X :\\n%s\\n\" % x)\n", - "print(\"X.shape : %s\\n\" % (x.shape,))\n", - "print(\"add 5 :\\n%s\\n\" % (x + 5))\n", - "print(\"X*X^T :\\n%s\\n\" % np.dot(x, x.T))\n", - "print(\"mean over cols :\\n%s\\n\" % (x.mean(axis=-1)))\n", - "print(\"cumsum of cols :\\n%s\\n\" % (np.cumsum(x, axis=0)))" + "print(\"X:\\n%s\\n\" % x)\n", + "print(\"X.shape: %s\\n\" % (x.shape,))\n", + "print(\"add 5:\\n%s\\n\" % (x + 5))\n", + "print(\"X*X^T:\\n%s\\n\" % np.dot(x, x.T))\n", + "print(\"mean over rows:\\n%s\\n\" % (x.mean(axis=-1)))\n", + "print(\"cumsum of cols:\\n%s\\n\" % (np.cumsum(x, axis=0)))" ] }, { @@ -112,26 +111,26 @@ "name": "stdout", "output_type": "stream", "text": [ - "X :\n", + "X:\n", 
"tensor([[ 0., 1., 2., 3.],\n", " [ 4., 5., 6., 7.],\n", " [ 8., 9., 10., 11.],\n", " [12., 13., 14., 15.]])\n", - "X.shape : torch.Size([4, 4])\n", + "X.shape: torch.Size([4, 4])\n", "\n", - "add 5 :\n", + "add 5:\n", "tensor([[ 5., 6., 7., 8.],\n", " [ 9., 10., 11., 12.],\n", " [13., 14., 15., 16.],\n", " [17., 18., 19., 20.]])\n", - "X*X^T :\n", + "X*X^T:\n", "tensor([[ 14., 38., 62., 86.],\n", " [ 38., 126., 214., 302.],\n", " [ 62., 214., 366., 518.],\n", " [ 86., 302., 518., 734.]])\n", - "mean over cols :\n", + "mean over rows:\n", "tensor([ 1.5000, 5.5000, 9.5000, 13.5000])\n", - "cumsum of cols :\n", + "cumsum of cols:\n", "tensor([[ 0., 1., 2., 3.],\n", " [ 4., 6., 8., 10.],\n", " [12., 15., 18., 21.],\n", @@ -144,14 +143,14 @@ "\n", "x = np.arange(16).reshape(4, 4)\n", "\n", - "x = torch.tensor(x, dtype=torch.float32) # or torch.arange(0,16).view(4,4)\n", + "x = torch.tensor(x, dtype=torch.float32) # or torch.arange(0, 16).view(4, 4)\n", "\n", - "print(\"X :\\n%s\" % x)\n", - "print(\"X.shape : %s\\n\" % (x.shape,))\n", - "print(\"add 5 :\\n%s\" % (x + 5))\n", - "print(\"X*X^T :\\n%s\" % torch.matmul(x, x.transpose(1, 0))) # short: x.mm(x.t())\n", - "print(\"mean over cols :\\n%s\" % torch.mean(x, dim=-1))\n", - "print(\"cumsum of cols :\\n%s\" % torch.cumsum(x, dim=0))" + "print(\"X:\\n%s\" % x)\n", + "print(\"X.shape: %s\\n\" % (x.shape,))\n", + "print(\"add 5:\\n%s\" % (x + 5))\n", + "print(\"X*X^T:\\n%s\" % torch.matmul(x, x.transpose(1, 0))) # short: x.mm(x.t())\n", + "print(\"mean over rows:\\n%s\" % torch.mean(x, dim=-1))\n", + "print(\"cumsum of cols:\\n%s\" % torch.cumsum(x, dim=0))" ] }, { @@ -160,25 +159,25 @@ "source": [ "## NumPy and PyTorch\n", "\n", - "As you can notice, PyTorch allows you to hack stuff much the same way you did with numpy. No graph declaration, no placeholders, no sessions. This means that you can _see the numeric value of any tensor at any moment of time_. 
Debugging such code can be done with by printing tensors or using any debug tool you want (e.g. [gdb](https://wiki.python.org/moin/DebuggingWithGdb)).\n", + "As you can notice, PyTorch allows you to hack stuff much the same way you did with NumPy. No graph declaration, no placeholders, no sessions. This means that you can _see the numeric value of any tensor at any moment of time_. Debugging such code can be done by printing tensors or using any debug tool you want (e.g. [PyCharm debugger](https://www.jetbrains.com/help/pycharm/part-1-debugging-python-code.html) or [gdb](https://wiki.python.org/moin/DebuggingWithGdb)).\n", "\n", - "You could also notice the a few new method names and a different API. So no, there's no compatibility with numpy [yet](https://github.com/pytorch/pytorch/issues/2228) and yes, you'll have to memorize all the names again. Get excited!\n", + "You could also notice a few new method names and a different API. So no, there's no compatibility with NumPy [yet](https://github.com/pytorch/pytorch/issues/2228) and yes, you'll have to memorize all the names again.
Get excited!\n", "\n", "![img](http://i0.kym-cdn.com/entries/icons/original/000/017/886/download.jpg)\n", "\n", - "For example, \n", - "* If something takes a list/tuple of axes in numpy, you can expect it to take *args in PyTorch\n", + "For example,\n", + "* If something takes a list/tuple of axes in NumPy, you can expect it to take `*args` in PyTorch\n", " * `x.reshape([1,2,8]) -> x.view(1,2,8)`\n", - "* You should swap _axis_ for _dim_ in operations like mean or cumsum\n", + "* You should swap `axis` for `dim` in operations like `mean` or `cumsum`\n", " * `x.sum(axis=-1) -> x.sum(dim=-1)`\n", - "* most mathematical operations are the same, but types an shaping is different\n", + "* Most mathematical operations are the same, but types and shaping are different\n", " * `x.astype('int64') -> x.type(torch.LongTensor)`\n", "\n", - "To help you acclimatize, there's a [table](https://github.com/torch/torch7/wiki/Torch-for-Numpy-users) covering most new things. There's also a neat [documentation page](http://pytorch.org/docs/master/).\n", + "To help you acclimatize, there's a [table](https://github.com/torch/torch7/wiki/Torch-for-Numpy-users) covering most new things. There's also a neat [documentation page](http://pytorch.org/docs/master/).\n", "\n", - "Finally, if you're stuck with a technical problem, we recommend searching [PyTorch forumns](https://discuss.pytorch.org/). Or just googling, which usually works just as efficiently. \n", + "Finally, if you're stuck with a technical problem, we recommend searching [PyTorch forums](https://discuss.pytorch.org/). Or just googling, which usually works just as efficiently.\n", "\n", - "If you feel like you almost give up, remember two things: __GPU__ an __free gradients__. Besides you can always jump back to numpy with x.numpy()" + "If you feel like you almost give up, remember two things: __GPU__ and __free gradients__. Besides you can always jump back to NumPy with `x.numpy()`."
] }, { @@ -190,8 +189,8 @@ "\n", "There are some simple mathematical functions with cool plots. For one, consider this:\n", "\n", - "$$ x(t) = t - 1.5 * cos( 15 t) $$\n", - "$$ y(t) = t - 1.5 * sin( 16 t) $$\n" + "$$ x(t) = t - 1.5 * cos(15 t) $$\n", + "$$ y(t) = t - 1.5 * sin(16 t) $$" ] }, { @@ -216,48 +215,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "if you're done early, try adjusting the formula and seing how it affects the function" + "If you're done early, try adjusting the formula and seeing how it affects the function." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n" + "---" ] }, { @@ -270,13 +235,13 @@ "\n", "The general pipeline looks like this:\n", "* When creating a tensor, you mark it as `requires_grad`:\n", - " * __```torch.zeros(5, requires_grad=True)```__\n", - " * torch.tensor(np.arange(5), dtype=torch.float32, requires_grad=True)\n", + " * `torch.zeros(5, requires_grad=True)`\n", + " * `torch.tensor(np.arange(5), dtype=torch.float32, requires_grad=True)`\n", "* Define some differentiable `loss = arbitrary_function(a)`\n", "* Call `loss.backward()`\n", - "* Gradients are now available as ```a.grads```\n", + "* Gradients are now available as ```a.grad```\n", "\n", - "__Here's an example:__ let's fit a linear regression on Boston house prices" + "__Here's an example:__ let's fit a linear regression on Boston house prices." 
] }, { @@ -319,7 +284,6 @@ "metadata": {}, "outputs": [], "source": [ - "from torch.autograd import Variable\n", "w = torch.zeros(1, requires_grad=True)\n", "b = torch.zeros(1, requires_grad=True)\n", "\n", @@ -336,7 +300,7 @@ "y_pred = w * x + b\n", "loss = torch.mean((y_pred - y)**2)\n", "\n", - "# propagete gradients\n", + "# propagate gradients\n", "loss.backward()" ] }, @@ -404,7 +368,6 @@ "from IPython.display import clear_output\n", "\n", "for i in range(100):\n", - "\n", " y_pred = w * x + b\n", " loss = torch.mean((y_pred - y)**2)\n", " loss.backward()\n", @@ -417,15 +380,14 @@ " b.grad.data.zero_()\n", "\n", " # the rest of code is just bells and whistles\n", - " if (i+1) % 5 == 0:\n", + " if (i + 1) % 5 == 0:\n", " clear_output(True)\n", - " plt.scatter(x.data.numpy(), y.data.numpy())\n", - " plt.scatter(x.data.numpy(), y_pred.data.numpy(),\n", - " color='orange', linewidth=5)\n", + " plt.scatter(x.numpy(), y.numpy())\n", + " plt.scatter(x.numpy(), y_pred.detach().numpy(), color='orange', linewidth=5)\n", " plt.show()\n", "\n", - " print(\"loss = \", loss.data.numpy())\n", - " if loss.data.numpy() < 0.5:\n", + " print(\"loss = \", loss.item())\n", + " if loss.item() < 0.5:\n", " print(\"Done!\")\n", " break" ] @@ -434,7 +396,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "__Bonus quest__: try implementing and writing some nonlinear regression. You can try quadratic features or some trigonometry, or a simple neural network. The only difference is that now you have more variables and a more complicated `y_pred`. " + "__Bonus quest__: try implementing and writing some nonlinear regression. You can try quadratic features or some trigonometry, or a simple neural network. The only difference is that now you have more variables and a more complicated `y_pred`." ] }, { @@ -443,9 +405,9 @@ "source": [ "# High-level PyTorch\n", "\n", - "So far we've been dealing with low-level torch API.
While it's absolutely vital for any custom losses or layers, building large neura nets in it is a bit clumsy.\n", + "So far we've been dealing with low-level PyTorch API. While it's absolutely vital for any custom losses or layers, building large neural nets in it is a bit clumsy.\n", "\n", - "Luckily, there's also a high-level torch interface with a pre-defined layers, activations and training algorithms. \n", + "Luckily, there's also a high-level PyTorch interface with pre-defined layers, activations and training algorithms.\n", "\n", "We'll cover them as we go through a simple image recognition problem: classifying letters into __\"A\"__ vs __\"B\"__.\n" ] @@ -454,34 +416,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Parsing...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/jheuristic/anaconda3/lib/python3.6/site-packages/scipy/misc/pilutil.py:482: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", - " if issubdtype(ts, int):\n", - "/home/jheuristic/anaconda3/lib/python3.6/site-packages/scipy/misc/pilutil.py:485: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", - " elif issubdtype(type(size), float):\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "found broken img: ./notMNIST_small/A/RGVtb2NyYXRpY2FCb2xkT2xkc3R5bGUgQm9sZC50dGY=.png [it's ok if <10 images are broken]\n", - "Done\n", - "Train size = 2808, test_size = 937\n" - ] - } - ], + "outputs": [], "source": [ "from notmnist import load_notmnist\n", "X_train, y_train, X_test, y_test = load_notmnist(letters='AB')\n", @@ -517,7 +452,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's start with layers. The main abstraction here is __`torch.nn.Module`__" + "Let's start with layers. The main abstraction here is __`torch.nn.Module`__:" ] }, { @@ -568,7 +503,7 @@ "source": [ "There's a vast library of popular layers and architectures already built for ya'.\n", "\n", - "This is a binary classification problem, so we'll train a __Logistic Regression with sigmoid__.\n", + "This is a binary classification problem, so we'll train __Logistic Regression__.\n", "$$P(y_i | X_i) = \\sigma(W \\cdot X_i + b) ={ 1 \\over {1+e^{- [W \\cdot X_i + b]}} }$$\n" ] }, @@ -676,7 +611,7 @@ "source": [ "__Torch optimizers__\n", "\n", - "When we trained Linear Regression above, we had to manually .zero_() gradients on both our variables. Imagine that code for a 50-layer network.\n", + "When we trained Linear Regression above, we had to manually `.zero_()` gradients on both our variables. 
Imagine that code for a 50-layer network.\n", "\n", "Again, to keep it from getting dirty, there's `torch.optim` module with pre-implemented algorithms:" ] @@ -690,9 +625,9 @@ "opt = torch.optim.RMSprop(model.parameters(), lr=0.01)\n", "\n", "# here's how it's used:\n", + "opt.zero_grad() # clear gradients\n", "loss.backward() # add new gradients\n", - "opt.step() # change weights\n", - "opt.zero_grad() # clear gradients" + "opt.step() # change weights" ] }, { @@ -787,12 +722,12 @@ "metadata": {}, "source": [ "__Debugging tips:__\n", - "* make sure your model predicts probabilities correctly. Just print them and see what's inside.\n", - "* don't forget _minus_ sign in the loss function! It's a mistake 99% ppl do at some point.\n", - "* make sure you zero-out gradients after each step. Srsly:)\n", + "* Make sure your model predicts probabilities correctly. Just print them and see what's inside.\n", + "* Don't forget the _minus_ sign in the loss function! It's a mistake 99% of people make at some point.\n", + "* Make sure you zero-out gradients after each step. Seriously:)\n", "* In general, PyTorch's error messages are quite helpful, read 'em before you google 'em.\n", "* if you see nan/inf, print what happens at each iteration to find our where exactly it occurs.\n", - " * If loss goes down and then turns nan midway through, try smaller learning rate. (Our current loss formula is unstable).\n" + " * If loss goes down and then turns nan midway through, try smaller learning rate. (Our current loss formula is unstable)." ] }, { @@ -843,25 +778,7 @@ "* Practical PyTorch - a repo that implements some... other cool DL models...
yes, in PyTorch - [link](https://github.com/spro/practical-pytorch)\n", "* And some more - [link](https://www.reddit.com/r/pytorch/comments/6z0yeo/pytorch_and_pytorch_tricks_for_kaggle/)\n", "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```" + "---" ] }, { @@ -870,7 +787,7 @@ "source": [ "# Homework tasks\n", "\n", - "There will be three tasks worth 2, 3 and 5 points respectively. \n", + "There will be three tasks worth 2, 3 and 5 points respectively.\n", "If you get stuck with no progress, try switching to the next task and returning later." ] }, @@ -882,11 +799,11 @@ "\n", "![img](https://media.giphy.com/media/3o751UMCYtSrRAFRFC/giphy.gif)\n", "\n", - "When dealing with more complex stuff like neural network, it's best if you use tensors the way samurai uses his sword. \n", + "When dealing with more complex stuff like neural network, it's best if you use tensors the way samurai uses his sword.\n", "\n", "\n", - "__1.1 the cannabola__ \n", - "[_disclaimer_](https://gist.githubusercontent.com/justheuristic/e2c1fa28ca02670cabc42cacf3902796/raw/fd3d935cef63a01b85ed2790b5c11c370245cbd7/stddisclaimer.h)\n", + "__1.1 The Cannabola__\n", + "[(_disclaimer_)](https://gist.githubusercontent.com/justheuristic/e2c1fa28ca02670cabc42cacf3902796/raw/fd3d935cef63a01b85ed2790b5c11c370245cbd7/stddisclaimer.h)\n", "\n", "Let's write another function, this time in polar coordinates:\n", "$$\\rho(\\theta) = (1 + 0.9 \\cdot cos (8 \\cdot \\theta) ) \\cdot (1 + 0.1 \\cdot cos(24 \\cdot \\theta)) \\cdot (0.9 + 0.05 \\cdot cos(200 \\cdot \\theta)) \\cdot (1 + sin(\\theta))$$\n", @@ -922,20 +839,20 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Task II: the game of life (3 points)\n", + "### Task II: The Game of Life (3 points)\n", "\n", - "Now it's time for you to make something more challenging. 
We'll implement Conway's [Game of Life](http://web.stanford.edu/~cdebs/GameOfLife/) in _pure PyTorch_. \n", + "Now it's time for you to make something more challenging. We'll implement Conway's [Game of Life](http://web.stanford.edu/~cdebs/GameOfLife/) in _pure PyTorch_.\n", "\n", - "While this is still a toy task, implementing game of life this way has one cool benefit: __you'll be able to run it on GPU! __ Indeed, what could be a better use of your gpu than simulating game of life on 1M/1M grids?\n", + "While this is still a toy task, implementing game of life this way has one cool benefit: __you'll be able to run it on GPU!__ Indeed, what could be a better use of your GPU than simulating Game of Life on 1M/1M grids?\n", "\n", "![img](https://cdn.tutsplus.com/gamedev/authors/legacy/Stephane%20Beniak/2012/09/11/Preview_Image.png)\n", - "If you've skipped the url above out of sloth, here's the game of life:\n", + "If you've skipped the URL above out of sloth, here's the Game of Life:\n", "* You have a 2D grid of cells, where each cell is \"alive\"(1) or \"dead\"(0)\n", "* Any living cell that has 2 or 3 neighbors survives, else it dies [0,1 or 4+ neighbors]\n", "* Any cell with exactly 3 neighbors becomes alive (if it was dead)\n", "\n", - "For this task, you are given a reference numpy implementation that you must convert to PyTorch.\n", - "_[numpy code inspired by: https://github.com/rougier/numpy-100]_\n", + "For this task, you are given a reference NumPy implementation that you must convert to PyTorch.\n", + "_[NumPy code inspired by: https://github.com/rougier/numpy-100]_\n", "\n", "\n", "__Note:__ You can find convolution in `torch.nn.functional.conv2d(Z,filters)`. 
Note that it has a different input format.\n", @@ -979,10 +896,10 @@ " :param Z: torch.FloatTensor of shape [height,width] containing 0s(dead) an 1s(alive)\n", " :returns: torch.FloatTensor Z after updates.\n", "\n", - " You can opt to create new tensor or change Z inplace.\n", + " You can opt to create a new tensor or change Z inplace.\n", " \"\"\"\n", "\n", - " # \n", + " \n", "\n", " return Z" ] @@ -1002,8 +919,8 @@ "\n", "# tests\n", "Z_reference = np_update(Z_numpy.copy())\n", - "assert np.all(Z_new.numpy(\n", - ") == Z_reference), \"your PyTorch implementation doesn't match np_update. Look into Z and np_update(ZZ) to investigate.\"\n", + "assert np.all(Z_new.numpy() == Z_reference), \\\n", + " \"your PyTorch implementation doesn't match np_update. Look into Z and np_update(ZZ) to investigate.\"\n", "print(\"Well done!\")" ] }, @@ -1025,7 +942,6 @@ "fig.show()\n", "\n", "for _ in range(100):\n", - "\n", " # update\n", " Z = torch_update(Z)\n", "\n", @@ -1072,40 +988,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "\n", - "\n", "### Task III: Going deeper (5 points)\n", "\n", - "Your ultimate task for this week is to build your first neural network [almost] from scratch and pure torch.\n", + "Your ultimate task for this week is to build your first neural network [almost] from scratch and pure PyTorch.\n", + "\n", + "This time you will solve the same digit recognition problem, but at a larger scale\n", "\n", - "This time you will solve the same digit recognition problem, but at a greater scale\n", "* 10 different letters\n", "* 20k samples\n", "\n", - "We want you to build a network that reaches at least 80% accuracy and has at least 2 linear layers in it. Naturally, it should be nonlinear to beat logistic regression. 
You can implement it with either \n", + "We want you to build a network that reaches at least 80% accuracy and has at least 2 linear layers in it. Naturally, it should be nonlinear to beat logistic regression.\n", "\n", - "\n", - "With 10 classes you will need to use __Softmax__ at the top instead of sigmoid and train for __categorical crossentropy__ (see [here](http://wiki.fast.ai/index.php/Log_Loss)). Write your own loss or use `torch.nn.functional.nll_loss`. Just make sure you understand what it accepts as an input.\n", + "With 10 classes you will need to use __Softmax__ at the top instead of sigmoid and train using __categorical crossentropy__ (see [here](http://wiki.fast.ai/index.php/Log_Loss)). Write your own loss or use `torch.nn.functional.nll_loss`. Just make sure you understand what it accepts as input.\n", "\n", "Note that you are not required to build 152-layer monsters here. A 2-layer (one hidden, one output) neural network should already give you an edge over logistic regression.\n", "\n", @@ -1114,26 +1008,15 @@ "If you've already beaten logistic regression with a two-layer net, but enthusiasm still ain't gone, you can try improving the test accuracy even further! It should be possible to reach 90% without convnets.\n", "\n", "__SPOILERS!__\n", - "At the end of the notebook you will find a few tips and frequent errors. \n", - "If you feel confident enogh, just start coding right away and get there ~~if~~ once you need to untangle yourself. \n", - "\n" + "At the end of the notebook you will find a few tips and frequent errors.\n", + "If you feel confident enough, just start coding right away and get there ~~if~~ once you need to untangle yourself." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Parsing...\n", - "found broken img: ./notMNIST_small/F/Q3Jvc3NvdmVyIEJvbGRPYmxpcXVlLnR0Zg==.png [it's ok if <10 images are broken]\n", - "found broken img: ./notMNIST_small/A/RGVtb2NyYXRpY2FCb2xkT2xkc3R5bGUgQm9sZC50dGY=.png [it's ok if <10 images are broken]\n" - ] - } - ], + "outputs": [], "source": [ "from notmnist import load_notmnist\n", "X_train, y_train, X_test, y_test = load_notmnist(letters='ABCDEFGHIJ')\n", @@ -1175,101 +1058,39 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "











" + ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "\n", "# SPOILERS!\n", "\n", - "Recommended pipeline\n", + "Recommended pipeline:\n", "\n", "* Adapt logistic regression from previous assignment to classify one letter against others (e.g. A vs the rest)\n", "* Generalize it to multiclass logistic regression.\n", " - Either try to remember lecture 0 or google it.\n", " - Instead of weight vector you'll have to use matrix (feature_id x class_id)\n", - " - softmax (exp over sum of exps) can implemented manually or as nn.Softmax (layer) F.softmax (function)\n", - " - probably better to use STOCHASTIC gradient descent (minibatch) for greater speed\n", - " - you can also try momentum/rmsprop/adawhatever\n", - " - in which case sample should probably be shuffled (or use random subsamples on each iteration)\n", + " - Softmax (exp over sum of exps) can be implemented manually or as `nn.Softmax` (layer) or `F.softmax` (function)\n", + " - Probably better to use STOCHASTIC gradient descent (minibatch) for greater speed\n", + " - You can also try momentum/rmsprop/adawhatever\n", + " - in which case the dataset should probably be shuffled (or use random subsamples on each iteration)\n", "* Add a hidden layer. Now your logistic regression uses hidden neurons instead of inputs.\n", " - Hidden layer uses the same math as output layer (ex-logistic regression), but uses some nonlinearity (e.g. 
sigmoid) instead of softmax\n", - " - You need to train both layers, not just output layer :)\n", - " - 50 hidden neurons and a sigmoid nonlinearity will do for a start. Many ways to improve. \n", - " - In ideal case this totals to 2 torch.matmul's, 1 softmax and 1 relu/sigmoid\n", - " - __make sure this neural network works better than logistic regression__\n", - " \n", - "* Now's the time to try improving the network. Consider layers (size, neuron count), nonlinearities, optimization methods, initialization - whatever you want, but please avoid convolutions for now.\n", - " \n", + " - You need to train both layers, not just the output layer :)\n", + " - 50 hidden neurons and a sigmoid nonlinearity will do for a start. Many ways to improve.\n", + " - In ideal case this totals to 2 `torch.matmul`'s, 1 softmax and 1 ReLU/sigmoid\n", + " - __Make sure this neural network works better than logistic regression!__\n", + "\n", + "* Now's the time to try improving the network. Consider layers (size, neuron count), nonlinearities, optimization methods, initialization — whatever you want, but please avoid convolutions for now.\n", + "\n", "* If anything seems wrong, try going through one step of training and printing everything you compute.\n", - "* If you see NaNs midway through optimization, you can estimate log P(y|x) as via F.log_softmax(layer_before_softmax)\n", - "\n" + "* If you see NaNs midway through optimization, you can estimate $\\log P(y \\mid x)$ as `F.log_softmax(layer_before_softmax)`." 
] } ], diff --git a/week04_[recap]_deep_learning/seminar_tensorflow.ipynb b/week04_[recap]_deep_learning/seminar_tensorflow.ipynb index a26eb57db..02cefe6ac 100644 --- a/week04_[recap]_deep_learning/seminar_tensorflow.ipynb +++ b/week04_[recap]_deep_learning/seminar_tensorflow.ipynb @@ -24,9 +24,9 @@ " %tensorflow_version 1.x\n", " \n", " if not os.path.exists('.setup_complete'):\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", "\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/week04_[recap]_deep_learning/mnist.py\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_[recap]_deep_learning/mnist.py\n", "\n", " !touch .setup_complete\n", "\n", diff --git a/week04_approx_rl/README.md b/week04_approx_rl/README.md index 4d37da0e7..45a92a4df 100644 --- a/week04_approx_rl/README.md +++ b/week04_approx_rl/README.md @@ -1,5 +1,5 @@ ## Materials -* [__slides__](https://docs.google.com/presentation/d/1HEfIyKT0rIuUQCGAsR1PIVGirccDXu5LQvxhVUjuIqM) +* [__slides__](https://github.com/yandexdataschool/Practical_RL/files/15286337/ysda_practical_rl_lecture_04.pdf) * Our [lecture](https://yadi.sk/i/Gd9yWV1dpuB7BQ), [seminar](https://yadi.sk/i/mvtKAIRN2yKU2g) (russian) * [__seminar slides__](https://yadi.sk/i/B89gXClSpmYZKw) @@ -35,9 +35,9 @@ ## Practice -* Seminar: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring20/week04_approx_rl/seminar_pytorch.ipynb) -* Homework (main): [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring20/week04_approx_rl/homework_pytorch_main.ipynb#scrollTo=KVvvo7k_ap8w) -* Homework 
(debug): [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring20/week04_approx_rl/homework_pytorch_debug.ipynb#scrollTo=KVvvo7k_ap8w) +* Seminar: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/master/week04_approx_rl/seminar_pytorch.ipynb) +* Homework (main): [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/master/week04_approx_rl/homework_pytorch_main.ipynb#scrollTo=KVvvo7k_ap8w) +* Homework (debug): [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/master/week04_approx_rl/homework_pytorch_debug.ipynb#scrollTo=KVvvo7k_ap8w) @@ -46,6 +46,4 @@ From now on, we have two tracks, for pytorch and tensorflow. However, pytorch tr Begin with `seminar_.ipynb` and then proceed with `homework_.ipynb`. -__Note: you're not required to submit assignments in all three frameworks. Pick one and go with it. Maybe switch it occasionally if you want more challenge. __ - - +__Note: you're not required to submit assignments in all three frameworks. Pick one and go with it. Maybe switch it occasionally if you want more challenge.__ diff --git a/week04_approx_rl/atari_wrappers.py b/week04_approx_rl/atari_wrappers.py deleted file mode 100644 index ea19dd880..000000000 --- a/week04_approx_rl/atari_wrappers.py +++ /dev/null @@ -1,119 +0,0 @@ -# taken from OpenAI baselines. 
- -import numpy as np -import gym - - -class MaxAndSkipEnv(gym.Wrapper): - def __init__(self, env, skip=4): - """Return only every `skip`-th frame""" - gym.Wrapper.__init__(self, env) - # most recent raw observations (for max pooling across time steps) - self._obs_buffer = np.zeros( - (2,) + env.observation_space.shape, dtype=np.uint8) - self._skip = skip - - def step(self, action): - """Repeat action, sum reward, and max over last observations.""" - total_reward = 0.0 - done = None - for i in range(self._skip): - obs, reward, done, info = self.env.step(action) - if i == self._skip - 2: - self._obs_buffer[0] = obs - if i == self._skip - 1: - self._obs_buffer[1] = obs - total_reward += reward - if done: - break - # Note that the observation on the done=True frame - # doesn't matter - max_frame = self._obs_buffer.max(axis=0) - - return max_frame, total_reward, done, info - - def reset(self, **kwargs): - return self.env.reset(**kwargs) - - -class ClipRewardEnv(gym.RewardWrapper): - def __init__(self, env): - gym.RewardWrapper.__init__(self, env) - - def reward(self, reward): - """Bin reward to {+1, 0, -1} by its sign.""" - return np.sign(reward) - - -class FireResetEnv(gym.Wrapper): - def __init__(self, env): - """Take action on reset for environments that are fixed until firing.""" - gym.Wrapper.__init__(self, env) - assert env.unwrapped.get_action_meanings()[1] == 'FIRE' - assert len(env.unwrapped.get_action_meanings()) >= 3 - - def reset(self, **kwargs): - self.env.reset(**kwargs) - obs, _, done, _ = self.env.step(1) - if done: - self.env.reset(**kwargs) - obs, _, done, _ = self.env.step(2) - if done: - self.env.reset(**kwargs) - return obs - - def step(self, ac): - return self.env.step(ac) - - -class EpisodicLifeEnv(gym.Wrapper): - def __init__(self, env): - """Make end-of-life == end-of-episode, but only reset on true game over. - Done by DeepMind for the DQN and co. since it helps value estimation. 
- """ - gym.Wrapper.__init__(self, env) - self.lives = 0 - self.was_real_done = True - - def step(self, action): - obs, reward, done, info = self.env.step(action) - self.was_real_done = done - # check current lives, make loss of life terminal, - # then update lives to handle bonus lives - lives = self.env.unwrapped.ale.lives() - if lives < self.lives and lives > 0: - # for Qbert sometimes we stay in lives == 0 condition for a few frames - # so it's important to keep lives > 0, so that we only reset once - # the environment advertises done. - done = True - self.lives = lives - return obs, reward, done, info - - def reset(self, **kwargs): - """Reset only when lives are exhausted. - This way all states are still reachable even though lives are episodic, - and the learner need not know about any of this behind-the-scenes. - """ - if self.was_real_done: - obs = self.env.reset(**kwargs) - else: - # no-op step to advance from terminal/lost life state - obs, _, _, _ = self.env.step(0) - self.lives = self.env.unwrapped.ale.lives() - return obs - - -# in torch imgs have shape [c, h, w] instead of common [h, w, c] -class AntiTorchWrapper(gym.ObservationWrapper): - def __init__(self, env): - gym.ObservationWrapper.__init__(self, env) - - self.img_size = [env.observation_space.shape[i] - for i in [1, 2, 0] - ] - self.observation_space = gym.spaces.Box(0.0, 1.0, self.img_size) - - def observation(self, img): - """what happens to each observation""" - img = img.transpose(1, 2, 0) - return img diff --git a/week04_approx_rl/dqn/__init__.py b/week04_approx_rl/dqn/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/week04_approx_rl/dqn/analysis.py b/week04_approx_rl/dqn/analysis.py new file mode 100644 index 000000000..2c007732d --- /dev/null +++ b/week04_approx_rl/dqn/analysis.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +from collections.abc import Reversible +import numpy as np + + +def play_and_log_episode(env, agent, t_max=10000): + """ + Plays 
an episode using the greedy policy and logs for each timestep: + - state + - qvalues (estimated by the agent) + - actions + - rewards + + Also logs: + - the final (usually terminal) state. + - whether the episode was terminated + + Uses the greedy policy. + """ + assert t_max > 0, t_max + + states = [] + qvalues_all = [] + actions = [] + rewards = [] + + s, _ = env.reset() + for step in range(t_max): + s = np.array(s) + states.append(s) + qvalues = agent.get_qvalues(s[None])[0] + qvalues_all.append(qvalues) + action = np.argmax(qvalues) + actions.append(action) + s, r, terminated, truncated, _ = env.step(action) + rewards.append(r) + if terminated or truncated: + break + states.append(s) # the last state + + return_pack = { + "states": np.array(states), + "qvalues": np.array(qvalues_all), + "actions": np.array(actions), + "rewards": np.array(rewards), + "episode_finished": terminated, + } + + return return_pack diff --git a/week04_approx_rl/dqn/atari_wrappers.py b/week04_approx_rl/dqn/atari_wrappers.py new file mode 100644 index 000000000..563dcd99f --- /dev/null +++ b/week04_approx_rl/dqn/atari_wrappers.py @@ -0,0 +1,64 @@ +# taken from stable_baselines3. + +from gymnasium import Wrapper + + +class FireResetEnv(Wrapper): + def __init__(self, env): + """Take action on reset for environments that are fixed until firing.""" + super().__init__(env) + assert env.unwrapped.get_action_meanings()[1] == "FIRE" + assert len(env.unwrapped.get_action_meanings()) >= 3 + + def reset(self, **kwargs): + self.env.reset(**kwargs) + obs, _, terminated, truncated, _ = self.env.step(1) + if terminated or truncated: + self.env.reset(**kwargs) + obs, _, terminated, truncated, _ = self.env.step(2) + if terminated or truncated: + self.env.reset(**kwargs) + return obs, {} + + +class EpisodicLifeEnv(Wrapper): + def __init__(self, env): + """Make end-of-life == end-of-episode, but only reset on true game over. + Done by DeepMind for the DQN and co. since it helps value estimation. 
+ """ + super().__init__(env) + self.lives = 0 + self.was_real_done = True + + def step(self, action): + obs, reward, terminated, truncated, info = self.env.step(action) + self.was_real_done = terminated or truncated + # check current lives, make loss of life terminal, + # then update lives to handle bonus lives + lives = self.env.unwrapped.ale.lives() + if lives < self.lives and lives > 0: + # for Qbert sometimes we stay in lives == 0 condition for a few frames + # so it's important to keep lives > 0, so that we only reset once + # the environment advertises done. + terminated = True + self.lives = lives + return obs, reward, terminated, truncated, info + + def reset(self, **kwargs): + """Reset only when lives are exhausted. + This way all states are still reachable even though lives are episodic, + and the learner need not know about any of this behind-the-scenes. + """ + if self.was_real_done: + obs, info = self.env.reset(**kwargs) + else: + # no-op step to advance from terminal/lost life state + obs, _, terminated, truncated, info = self.env.step(0) + + # The no-op step can lead to a game over, so we need to check it again + # to see if we should reset the environment and avoid the + # monitor.py `RuntimeError: Tried to step environment that needs reset` + if terminated or truncated: + obs, info = self.env.reset(**kwargs) + self.lives = self.env.unwrapped.ale.lives() + return obs, info diff --git a/week04_approx_rl/dqn/logger.py b/week04_approx_rl/dqn/logger.py new file mode 100644 index 000000000..52837d873 --- /dev/null +++ b/week04_approx_rl/dqn/logger.py @@ -0,0 +1,80 @@ +import matplotlib.pyplot as plt +from torch.utils.tensorboard import SummaryWriter +import numpy as np + +class Logger: + def __init__(self, use_tensorboard=True, log_dir='runs'): + """ + Initializes the Logger. + + :param use_tensorboard: If True, logs will be sent to TensorBoard. + :param log_dir: Directory where TensorBoard logs are saved. 
+ """ + self.use_tensorboard = use_tensorboard + if self.use_tensorboard: + self.writer = SummaryWriter(log_dir=log_dir) + else: + # Initialize lists to store history for matplotlib + self.mean_rw_history = [] + self.td_loss_history = [] + self.grad_norm_history = [] + self.initial_state_v_history = [] + + def log_loss(self, loss, step): + if self.use_tensorboard: + self.writer.add_scalar("Loss", loss, step) + else: + self.td_loss_history.append(loss) + + def log_grad_norm(self, grad_norm, step): + if self.use_tensorboard: + self.writer.add_scalar("Grad Norm", grad_norm, step) + else: + self.grad_norm_history.append(grad_norm) + + def log_mean_reward(self, mean_reward, step): + if self.use_tensorboard: + self.writer.add_scalar("Mean Reward per Life", mean_reward, step) + else: + self.mean_rw_history.append(mean_reward) + + def log_initial_state_v(self, initial_v, step): + if self.use_tensorboard: + self.writer.add_scalar("Initial State V", initial_v, step) + else: + self.initial_state_v_history.append(initial_v) + + def plot(self): + if not self.use_tensorboard: + plt.figure(figsize=[16, 9]) + + plt.subplot(2, 2, 1) + plt.title("Mean Reward per Episode") + plt.plot(self.mean_rw_history, label='Mean Reward') + plt.legend() + plt.grid() + + plt.subplot(2, 2, 2) + plt.title("TD Loss History") + plt.plot(self.td_loss_history, label='TD Loss') + plt.legend() + plt.grid() + + plt.subplot(2, 2, 3) + plt.title("Initial State V") + plt.plot(self.initial_state_v_history, label='Initial State V') + plt.legend() + plt.grid() + + plt.subplot(2, 2, 4) + plt.title("Grad Norm History") + plt.plot(self.grad_norm_history, label='Grad Norm') + plt.legend() + plt.grid() + + plt.tight_layout() + plt.show() + + def close(self): + if self.use_tensorboard: + self.writer.close() \ No newline at end of file diff --git a/week04_approx_rl/dqn/replay_buffer.py b/week04_approx_rl/dqn/replay_buffer.py new file mode 100644 index 000000000..e695d78d9 --- /dev/null +++ 
b/week04_approx_rl/dqn/replay_buffer.py @@ -0,0 +1,120 @@ +# This code is shamelessly stolen from +# https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py +import numpy as np +import random + + +class ReplayBuffer(object): + def __init__(self, size): + """Create Replay buffer. + Parameters + ---------- + size: int + Max number of transitions to store in the buffer. When the buffer + overflows the old memories are dropped. + """ + self._storage = [] + self._maxsize = size + self._next_idx = 0 + + def __len__(self): + return len(self._storage) + + def add(self, obs_t, action, reward, obs_tp1, done): + data = (obs_t, action, reward, obs_tp1, done) + + if self._next_idx >= len(self._storage): + self._storage.append(data) + else: + self._storage[self._next_idx] = data + self._next_idx = (self._next_idx + 1) % self._maxsize + + def _encode_sample(self, idxes): + obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] + for i in idxes: + data = self._storage[i] + obs_t, action, reward, obs_tp1, done = data + obses_t.append(np.array(obs_t, copy=False)) + actions.append(np.array(action, copy=False)) + rewards.append(reward) + obses_tp1.append(np.array(obs_tp1, copy=False)) + dones.append(done) + return ( + np.array(obses_t), + np.array(actions), + np.array(rewards), + np.array(obses_tp1), + np.array(dones), + ) + + def sample(self, batch_size): + """Sample a batch of experiences. + Parameters + ---------- + batch_size: int + How many transitions to sample. + Returns + ------- + obs_batch: np.array + batch of observations + act_batch: np.array + batch of actions executed given obs_batch + rew_batch: np.array + rewards received as results of executing act_batch + next_obs_batch: np.array + next set of observations seen after executing act_batch + done_mask: np.array + done_mask[i] = 1 if executing act_batch[i] resulted in + the end of an episode and 0 otherwise. 
+ """ + idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] + return self._encode_sample(idxes) + + +class LazyFramesVectorReplayBuffer(ReplayBuffer): + """ + ReplayBuffer for vectorized environments, which are wrapped into FrameBuffers. + + If an environment is first wrapped into a FrameBuffer and then vectorized, + then the resulting VecEnv will not use LazyFrames, but it will directly + use np.ndarrays, thus greatly increasing RAM consumption by the buffer. + + Instead, we first vectorize an environment and only then wrap it into FrameBuffers. + It's not as convenient, but it keeps the advantage in memory from LazyFrames. + + So, + observations and next_observations are stored as LazyFrames + of shape (n_frames, n_envs, ...) + actions, rewards and dones are stored as np.ndarrays of shape (n_envs,). + + """ + + # (n_frames, n_envs, *) + + def _encode_sample(self, idxes): + """ + For each index in idxes samples a (s, a, r, s', done) transition + from a randomly chosen environment of the corresponding VecEnv. 
+ """ + obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] + for i in idxes: + data = self._storage[i] + obs_t, action, reward, obs_tp1, done = data + n_envs = action.shape[0] + env_idx_chosen_for_sample = random.randint(0, n_envs - 1) + obses_t.append( + np.array(obs_t, copy=False)[:, env_idx_chosen_for_sample], + ) + actions.append(np.array(action, copy=False)[env_idx_chosen_for_sample]) + rewards.append(reward[env_idx_chosen_for_sample]) + obses_tp1.append( + np.array(obs_tp1, copy=False)[:, env_idx_chosen_for_sample], + ) + dones.append(done[env_idx_chosen_for_sample]) + return ( + np.array(obses_t), + np.array(actions), + np.array(rewards), + np.array(obses_tp1), + np.array(dones), + ) diff --git a/week04_approx_rl/dqn/utils.py b/week04_approx_rl/dqn/utils.py new file mode 100644 index 000000000..a79672901 --- /dev/null +++ b/week04_approx_rl/dqn/utils.py @@ -0,0 +1,14 @@ +import psutil # type: ignore + + +def is_enough_ram(min_available_gb=0.1): + mem = psutil.virtual_memory() + return mem.available >= min_available_gb * (1024**3) + + +def linear_decay( + init_val: float, final_val: float, cur_step: int, total_steps: int +) -> float: + if cur_step >= total_steps: + return final_val + return (init_val * (total_steps - cur_step) + final_val * cur_step) / total_steps diff --git a/week04_approx_rl/framebuffer.py b/week04_approx_rl/framebuffer.py deleted file mode 100644 index fa8805d24..000000000 --- a/week04_approx_rl/framebuffer.py +++ /dev/null @@ -1,45 +0,0 @@ -import numpy as np -from gym.spaces.box import Box -from gym.core import Wrapper - - -class FrameBuffer(Wrapper): - def __init__(self, env, n_frames=4, dim_order='tensorflow'): - """A gym wrapper that reshapes, crops and scales image into the desired shapes""" - super(FrameBuffer, self).__init__(env) - self.dim_order = dim_order - if dim_order == 'tensorflow': - height, width, n_channels = env.observation_space.shape - obs_shape = [height, width, n_channels * n_frames] - elif dim_order == 
'pytorch': - n_channels, height, width = env.observation_space.shape - obs_shape = [n_channels * n_frames, height, width] - else: - raise ValueError( - 'dim_order should be "tensorflow" or "pytorch", got {}'.format(dim_order)) - self.observation_space = Box(0.0, 1.0, obs_shape) - self.framebuffer = np.zeros(obs_shape, 'float32') - - def reset(self): - """resets breakout, returns initial frames""" - self.framebuffer = np.zeros_like(self.framebuffer) - self.update_buffer(self.env.reset()) - return self.framebuffer - - def step(self, action): - """plays breakout for 1 step, returns frame buffer""" - new_img, reward, done, info = self.env.step(action) - self.update_buffer(new_img) - return self.framebuffer, reward, done, info - - def update_buffer(self, img): - if self.dim_order == 'tensorflow': - offset = self.env.observation_space.shape[-1] - axis = -1 - cropped_framebuffer = self.framebuffer[:, :, :-offset] - elif self.dim_order == 'pytorch': - offset = self.env.observation_space.shape[0] - axis = 0 - cropped_framebuffer = self.framebuffer[:-offset] - self.framebuffer = np.concatenate( - [img, cropped_framebuffer], axis=axis) diff --git a/week04_approx_rl/homework_lasagne.ipynb b/week04_approx_rl/homework_lasagne.ipynb deleted file mode 100644 index d09541ac4..000000000 --- a/week04_approx_rl/homework_lasagne.ipynb +++ /dev/null @@ -1,767 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Homework 4\n", - "\n", - "Today we'll start by reproducing the DQN and then try improving it with the tricks we learned on the lecture:\n", - "* Target networks\n", - "* Double q-learning\n", - "* Prioritized experience replay\n", - "* Dueling DQN\n", - "* Bootstrap DQN" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "%matplotlib inline\n", - "\n", - "\n", - "# If you are running on a server, launch xvfb to 
record game videos\n", - "# Please make sure you have xvfb installed\n", - "import os\n", - "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", - " !bash ../xvfb start\n", - " os.environ['DISPLAY'] = ':1'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Processing game image (2 pts)\n", - "\n", - "Raw Atari images are large, 210x160x3 by default. However, we don't need that level of detail in order to learn them.\n", - "\n", - "We can thus save a lot of time by preprocessing game image, including\n", - "* Resizing to a smaller shape\n", - "* Converting to grayscale\n", - "* Cropping irrelevant image parts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from gym.core import ObservationWrapper\n", - "from gym.spaces import Box\n", - "\n", - "from scipy.misc import imresize\n", - "\n", - "\n", - "class PreprocessAtari(ObservationWrapper):\n", - " def __init__(self, env):\n", - " \"\"\"A gym wrapper that crops, scales image into the desired shapes and optionally grayscales it.\"\"\"\n", - " ObservationWrapper.__init__(self, env)\n", - "\n", - " self.img_size = (64, 64)\n", - " self.observation_space = Box(0.0, 1.0, self.img_size)\n", - "\n", - " def observation(self, img):\n", - " \"\"\"what happens to each observation\"\"\"\n", - "\n", - " # Here's what you need to do:\n", - " # * crop image, remove irrelevant parts\n", - " # * resize image to self.img_size\n", - " # (use imresize imported above or any library you want,\n", - " # e.g. 
opencv, skimage, PIL, keras)\n", - " # * cast image to grayscale\n", - " # * convert image pixels to (0,1) range, float32 type\n", - "\n", - " \n", - " return " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import gym\n", - "\n", - "\n", - "def make_env():\n", - " env = gym.make(\"KungFuMasterDeterministic-v0\") # create raw env\n", - " return PreprocessAtari(env) # apply your wrapper\n", - "\n", - "\n", - "# spawn game instance for tests\n", - "env = make_env()\n", - "\n", - "observation_shape = env.observation_space.shape\n", - "n_actions = env.action_space.n\n", - "\n", - "obs = env.reset()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# test observation\n", - "assert obs.shape == observation_shape\n", - "assert obs.dtype == 'float32'\n", - "assert len(np.unique(obs)) > 2, \"your image must not be binary\"\n", - "assert 0 <= np.min(obs) and np.max(\n", - " obs) <= 1, \"convert image pixels to (0,1) range\"\n", - "\n", - "print \"Formal tests seem fine. 
Here's an example of what you'll get.\"\n", - "\n", - "plt.title(\"what your network gonna see\")\n", - "plt.imshow(obs, interpolation='none', cmap='gray')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.figure(figsize=[12, 12])\n", - "env.reset()\n", - "for i in range(16):\n", - " for _ in range(10):\n", - " new_obs = env.step(env.action_space.sample())[0]\n", - " plt.subplot(4, 4, i+1)\n", - " plt.imshow(new_obs, interpolation='none', cmap='gray')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# dispose of the game instance\n", - "del env" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Building a DQN (2 pts)\n", - "Here we define a simple agent that maps game images into Qvalues using simple convolutional neural network.\n", - "\n", - "![scheme](https://s18.postimg.cc/gbmsq6gmx/dqn_scheme.png)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# setup theano/lasagne. Prefer GPU. 
Fallback to CPU (will print warning)\n", - "%env THEANO_FLAGS = floatX = float32\n", - "\n", - "import theano\n", - "import lasagne\n", - "from lasagne.layers import *\n", - "from theano import tensor as T" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# observation\n", - "observation_layer = InputLayer(\n", - " (None,)+observation_shape) # game image, [batch,64,64]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 4-tick window over images\n", - "from agentnet.memory import WindowAugmentation\n", - "\n", - "# window size [batch,4,64,64]\n", - "prev_wnd = InputLayer((None, 4)+observation_shape)\n", - "\n", - "new_wnd = WindowAugmentation( < current observation layer> , prev_wnd)\n", - "\n", - "# if you changed img size, remove assert\n", - "assert new_wnd.output_shape == (None, 4, 64, 64)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from lasagne.nonlinearities import elu, tanh, softmax, rectify\n", - "\n", - "\n", - "\n", - "dense = " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# qvalues layer\n", - "qvalues_layer = \n", - "\n", - "assert qvalues_layer.nonlinearity is not rectify" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# sample actions proportionally to policy_layer\n", - "from agentnet.resolver import EpsilonGreedyResolver\n", - "action_layer = EpsilonGreedyResolver(qvalues_layer)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define agent\n", - "Here you will need to declare how your agent works\n", - "\n", - "* `observation_layers` and `action_layers` are the input and output of agent in MDP.\n", - "* `policy_estimators` must contain whatever you need for training\n", - " * In our 
case, that's `qvalues_layer`, but you'll need to add more when implementing target network.\n", - "* agent_states contains our frame buffer. \n", - " * The code `{new_wnd:prev_wnd}` reads as \"`new_wnd becomes prev_wnd next turn`\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from agentnet.agent import Agent\n", - "# agent\n", - "agent = Agent(observation_layers=,\n", - " policy_estimators=,\n", - " action_layers=,\n", - " agent_states={new_wnd: prev_wnd},)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Create and manage a pool of Atari sessions to play with\n", - "\n", - "* To make training more stable, we shall have an entire batch of game sessions each happening independent of others\n", - "* Why several parallel agents help training: http://arxiv.org/pdf/1602.01783v1.pdf\n", - "* Alternative approach: store more sessions: https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from agentnet.experiments.openai_gym.pool import EnvPool\n", - "\n", - "pool = EnvPool(agent, make_env, n_games=16) # 16 parallel game sessions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "# interact for 7 ticks\n", - "_, action_log, reward_log, _, _, _ = pool.interact(5)\n", - "\n", - "print('actions:')\n", - "print(action_log[0])\n", - "print(\"rewards\")\n", - "print(reward_log[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# load first sessions (this function calls interact and remembers sessions)\n", - "SEQ_LENGTH = 10 # sub-session length\n", - "pool.update(SEQ_LENGTH)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Q-learning\n", - "\n", - "We train our agent based on sessions it has played 
in `pool.update(SEQ_LENGTH)`\n", - "\n", - "To do so, we first obtain sequences of observations, rewards, actions, q-values, etc.\n", - "\n", - "Actions and rewards have shape `[n_games,seq_length]`, q-values are `[n_games,seq_length,n_actions]`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get agent's Qvalues obtained via experience replay\n", - "replay = pool.experience_replay\n", - "\n", - "actions, rewards, is_alive = replay.actions[0], replay.rewards, replay.is_alive\n", - "\n", - "_, _, _, _, qvalues = agent.get_sessions(\n", - " replay,\n", - " session_length=SEQ_LENGTH,\n", - " experience_replay=True,\n", - ")\n", - "\n", - "assert actions.ndim == rewards.ndim == is_alive.ndim == 2, \"actions, rewards and is_alive must have shape [batch,time]\"\n", - "assert qvalues.ndim == 3, \"q-values must have shape [batch,time,n_actions]\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# compute V(s) as Qvalues of best actions.\n", - "# For homework assignment, you will need to use target net\n", - "# or special double q-learning objective here\n", - "\n", - "state_values_target = \n", - "\n", - "assert state_values_target.eval().shape = qvalues.eval().shape[:2]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from agentnet.learning.generic import get_n_step_value_reference\n", - "\n", - "# get reference Q-values via Q-learning algorithm\n", - "reference_qvalues = get_n_step_value_reference(\n", - " state_values=state_values_target,\n", - " rewards=rewards/100.,\n", - " is_alive=is_alive,\n", - " n_steps=10,\n", - " gamma_or_gammas=0.99,\n", - ")\n", - "\n", - "# consider it constant\n", - "from theano.gradient import disconnected_grad\n", - 
"reference_qvalues = disconnected_grad(reference_qvalues)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get predicted Q-values for committed actions by both current and target networks\n", - "from agentnet.learning.generic import get_values_for_actions\n", - "action_qvalues = get_values_for_actions(qvalues, actions)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# loss for Qlearning =\n", - "# (Q(s,a) - (r+ gamma*r' + gamma^2*r'' + ... +gamma^10*Q(s_{t+10},a_max)))^2\n", - "\n", - "elwise_mse_loss = \n", - "\n", - "# mean over all batches and time ticks\n", - "loss = (elwise_mse_loss*is_alive).mean()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Since it's a single lasagne network, one can get it's weights, output, etc\n", - "weights = \n", - "weights" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Compute weight updates\n", - "updates = \n", - "\n", - "# compile train function\n", - "train_step = theano.function([], loss, updates=updates)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Demo run\n", - "as usual..." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "action_layer.epsilon.set_value(0.05)\n", - "untrained_reward = np.mean(pool.evaluate(save_path=\"./records\",\n", - " record_video=True))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# show video\n", - "from IPython.display import HTML\n", - "import os\n", - "\n", - "video_names = list(\n", - " filter(lambda s: s.endswith(\".mp4\"), os.listdir(\"./records/\")))\n", - "\n", - "HTML(\"\"\"\n", - "\n", - "\"\"\".format(\"./records/\" + video_names[-1])) # this may or may not be _last_ video. Try other indices" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Training loop" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# starting epoch\n", - "epoch_counter = 1\n", - "\n", - "# full game rewards\n", - "rewards = {}\n", - "loss, reward_per_tick, reward = 0, 0, 0" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from tqdm import trange\n", - "from IPython.display import clear_output\n", - "\n", - "\n", - "for i in trange(150000):\n", - "\n", - " # update agent's epsilon (in e-greedy policy)\n", - " current_epsilon = 0.05 + 0.45*np.exp(-epoch_counter/20000.)\n", - " action_layer.epsilon.set_value(np.float32(current_epsilon))\n", - "\n", - " # play\n", - " pool.update(SEQ_LENGTH)\n", - "\n", - " # train\n", - " loss = 0.95*loss + 0.05*train_step()\n", - "\n", - " if epoch_counter % 10 == 0:\n", - " # average reward per game tick in current experience replay pool\n", - " reward_per_tick = 0.95*reward_per_tick + 0.05 * \\\n", - " pool.experience_replay.rewards.get_value().mean()\n", - " print(\"iter=%i\\tepsilon=%.3f\\tloss=%.3f\\treward/tick=%.3f\" % (epoch_counter,\n", - " current_epsilon,\n", - " loss,\n", - " 
reward_per_tick))\n", - "\n", - " # record current learning progress and show learning curves\n", - " if epoch_counter % 100 == 0:\n", - " action_layer.epsilon.set_value(0.05)\n", - " reward = 0.95*reward + 0.05*np.mean(pool.evaluate(record_video=False))\n", - " action_layer.epsilon.set_value(np.float32(current_epsilon))\n", - "\n", - " rewards[epoch_counter] = reward\n", - "\n", - " clear_output(True)\n", - " plt.plot(*zip(*sorted(rewards.items(), key=lambda (t, r): t)))\n", - " plt.show()\n", - "\n", - " epoch_counter += 1\n", - "\n", - "\n", - "# Time to drink some coffee!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Evaluating results\n", - " * Here we plot learning curves and sample testimonials" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "plt.plot(*zip(*sorted(rewards.items(), key=lambda k: k[0])))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from agentnet.utils.persistence import save, load\n", - "save(action_layer, \"pacman.pcl\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "action_layer.epsilon.set_value(0.05)\n", - "rw = pool.evaluate(n_games=20, save_path=\"./records\", record_video=False)\n", - "print(\"mean session score=%f.5\" % np.mean(rw))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# show video\n", - "from IPython.display import HTML\n", - "import os\n", - "\n", - "video_names = list(\n", - " filter(lambda s: s.endswith(\".mp4\"), os.listdir(\"./records/\")))\n", - "\n", - "HTML(\"\"\"\n", - "\n", - "\"\"\".format(\"./videos/\" + video_names[-1])) # this may or may not be _last_ video. 
Try other indices" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Assignment part I (5 pts)\n", - "\n", - "We'll start by implementing target network to stabilize training.\n", - "\n", - "There are two ways to do so: \n", - "\n", - "\n", - "__1)__ Manually write lasagne network, or clone it via [one of those methods](https://github.com/Lasagne/Lasagne/issues/720).\n", - "\n", - "You will need to implement loading weights from original network to target network.\n", - "\n", - "We recommend thoroughly debugging your code on simple tests before applying it in Atari dqn.\n", - "\n", - "__2)__ Use pre-build functionality from [here](http://agentnet.readthedocs.io/en/master/modules/target_network.html)\n", - "\n", - "```\n", - "from agentnet.target_network import TargetNetwork\n", - "target_net = TargetNetwork(qvalues_layer)\n", - "old_qvalues = target_net.output_layers\n", - "\n", - "#agent's policy_estimators must now become (qvalues,old_qvalues)\n", - "\n", - "_,_,_,_,(qvalues,old_qvalues) = agent.get_sessions(...) #replaying experience\n", - "\n", - "\n", - "target_net.load_weights()#loads weights, so target network is now exactly same as main network\n", - "\n", - "target_net.load_weights(0.01)# w_target = 0.99*w_target + 0.01*w_new\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Bonus I (2+ pts)\n", - "\n", - "Implement and train double q-learning.\n", - "\n", - "This task contains of\n", - "* Implementing __double q-learning__ or __dueling q-learning__ or both (see tips below)\n", - "* Training a network till convergence\n", - " * Full points will be awwarded if your network gets average score of >=10 (see \"evaluating results\")\n", - " * Higher score = more points as usual\n", - " * If you're running out of time, it's okay to submit a solution that hasn't converged yet and updating it when it converges. 
_Lateness penalty will not increase for second submission_, so submitting first one in time gets you no penalty.\n", - "\n", - "\n", - "#### Tips:\n", - "* Implementing __double q-learning__ shouldn't be a problem if you've already have target networks in place.\n", - " * As one option, use `get_values_for_actions(,)`.\n", - " * You will probably need `T.argmax` to select best actions\n", - " * Here's an original [article](https://arxiv.org/abs/1509.06461)\n", - "\n", - "* __Dueling__ architecture is also quite straightforward if you have standard DQN.\n", - " * You will need to change network architecture, namely the q-values layer\n", - " * It must now contain two heads: V(s) and A(s,a), both dense layers\n", - " * You should then add them up via elemwise sum layer or a [custom](http://lasagne.readthedocs.io/en/latest/user/custom_layers.html) layer.\n", - " * Here's an [article](https://arxiv.org/pdf/1511.06581.pdf)\n", - " \n", - "Here's a template for your convenience:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from lasagne.layers import *\n", - "\n", - "\n", - "class DuelingQvaluesLayer(MergeLayer):\n", - " def get_output_for(self, inputs, **tags):\n", - " V, A = inputs\n", - " return \n", - "\n", - " def get_output_shape_for(self, input_shapes, **tags):\n", - " V_shape, A_shape=input_shapes\n", - " assert len(\n", - " V_shape) == 2 and V_shape[-1] == 1, \"V layer (first param) shape must be [batch,tick,1]\"\n", - " return A_shape # shape of q-values is same as predicted advantages" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# mock-up tests\n", - "import theano.tensor as T\n", - "v_tensor = -T.arange(10).reshape((10, 1))\n", - "V = InputLayer((None, 1), v_tensor)\n", - "\n", - "a_tensor = T.arange(30).reshape((10, 3))\n", - "A = InputLayer((None, 1), a_tensor)\n", - "\n", - "Q = DuelingQvaluesLayer([V, A])" - ] - }, - 
{ - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "assert np.allclose(get_output(Q).eval(), (v_tensor+a_tensor).eval())\n", - "print(\"looks good\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Bonus II (5+ pts): Prioritized experience replay\n", - "\n", - "In this section, you're invited to implement prioritized experience replay\n", - "\n", - "* You will probably need to provide a custom data structure\n", - "* Once pool.update is called, collect the pool.experience_replay.observations, actions, rewards and is_alive and store them in your data structure\n", - "* You can now sample such transitions in proportion to the error (see [article](https://arxiv.org/abs/1511.05952)) for training.\n", - "\n", - "It's probably more convenient to explicitly declare inputs for \"sample observations\", \"sample actions\" and so on to plug them into q-learning.\n", - "\n", - "Prioritized (and even normal) experience replay should greatly reduce amount of game sessions you need to play in order to achieve good performance. \n", - "\n", - "While it's effect on runtime is limited for atari, more complicated envs (further in the course) will certainly benefit for it.\n", - "\n", - "Prioritized experience replay only supports off-policy algorithms, so pls enforce `n_steps=1` in your q-learning reference computation (default is 10)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/week04_approx_rl/homework_pytorch_debug.ipynb b/week04_approx_rl/homework_pytorch_debug.ipynb index 197dfd52f..41b6a06cf 100644 --- a/week04_approx_rl/homework_pytorch_debug.ipynb +++ b/week04_approx_rl/homework_pytorch_debug.ipynb @@ -2,11 +2,13 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "SqZ2EwnTZdC8" + }, "source": [ "# Deep Q-Network implementation.\n", "\n", - "This homework shamelessly demands you to implement a DQN - an approximate q-learning algorithm with experience replay and target networks - and see if it works any better this way.\n", + "This homework shamelessly demands you to implement DQN — an approximate Q-learning algorithm with experience replay and target networks — and see if it works any better this way.\n", "\n", "Original paper:\n", "https://arxiv.org/pdf/1312.5602.pdf" @@ -14,7 +16,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "Zv7XJfXaZdC9" + }, "source": [ "**This notebook is given for debug.** The main task is in the other notebook (**homework_pytorch_main**). The tasks are similar and share most of the code. The main difference is in environments. In main notebook it can take some 2 hours for the agent to start improving so it seems reasonable to launch the algorithm on a simpler env first. 
Here it is CartPole and it will train in several minutes.\n", "\n", @@ -26,28 +30,23 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "setup complete\n", - "Starting virtual X frame buffer: Xvfb.\n" - ] - } - ], + "metadata": { + "id": "ioIEVODJZdC9" + }, + "outputs": [], "source": [ "import sys, os\n", "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n", - " \n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/atari_wrappers.py\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/utils.py\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/replay_buffer.py\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/framebuffer.py\n", + " os.makedirs('dqn', exist_ok=True)\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", + "\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/dqn/atari_wrappers.py -P dqn/\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/dqn/utils.py -P dqn/\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/dqn/replay_buffer.py -P dqn/\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/dqn/framebuffer.py -P dqn/\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/dqn/analysis.py -P dqn/\n", "\n", - " !pip install gym[box2d]\n", + " !pip install gymnasium\n", "\n", " !touch 
.setup_complete\n", "\n", @@ -60,30 +59,36 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "FDZqlI3kZdC9" + }, "source": [ - "__Frameworks__ - we'll accept this homework in any deep learning framework. This particular notebook was designed for pytoch, but you find it easy to adapt it to almost any python-based deep learning framework." + "__Frameworks__ - we'll accept this homework in any deep learning framework. This particular notebook was designed for PyTorch, but you will find it easy to adapt it to almost any Python-based deep learning framework." ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "dsYq558wZdC-" + }, "outputs": [], "source": [ "import random\n", "import numpy as np\n", "import torch\n", - "import utils" + "import dqn.utils\n" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "6ypPZ8e6ZdC-" + }, "outputs": [], "source": [ - "import gym\n", + "import gymnasium as gym\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline" @@ -91,7 +96,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "9j8EGNlSZdC-" + }, "source": [ "### CartPole again\n", "\n", @@ -105,62 +112,48 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "v-5u-CcQZdC-" + }, "outputs": [], "source": [ - "ENV_NAME = 'CartPole-v1'\n", + "ENV_NAME = \"CartPole-v1\"\n", + "\n", "\n", - "def make_env(seed=None):\n", + "def make_env():\n", " # some envs are wrapped with a time limit wrapper by default\n", - " env = gym.make(ENV_NAME).unwrapped\n", - " if seed is not None:\n", - " env.seed(seed)\n", - " return env" + " env = gym.make(ENV_NAME, render_mode=\"rgb_array\").unwrapped\n", + " return env\n" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ -
"/usr/local/lib/python3.6/dist-packages/gym/logger.py:30: UserWarning: \u001b[33mWARN: Box bound precision lowered by casting to float32\u001b[0m\n", - " warnings.warn(colorize('%s: %s'%('WARN', msg % args), 'yellow'))\n" - ] - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW4AAAD8CAYAAABXe05zAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0\ndHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAARSUlEQVR4nO3df6zddX3H8edLQHRqBsi16fpjRe1i\ncJnF3SFG/0CMCsSsmjgDW6QxJJclmGBitoFLpiYj0WTKZuaINTDr4kTmj9AQNsVKYvxDsMVaWxC5\nagltKi0KqDFjK773x/0Uz+ot99wfh9vPPc9HcnK+3/f38z3n/YmHl99++j09qSokSf14znI3IEma\nH4NbkjpjcEtSZwxuSeqMwS1JnTG4JakzIwvuJBcneSDJdJJrR/U+kjRuMor7uJOcAvwAeBNwAPg2\ncHlV3bfkbyZJY2ZUV9znA9NV9aOq+h/gFmDziN5LksbKqSN63TXAwwP7B4DXnGjw2WefXRs2bBhR\nK5LUn/379/Poo49mtmOjCu45JZkCpgDWr1/Pzp07l6sVSTrpTE5OnvDYqJZKDgLrBvbXttrTqmpr\nVU1W1eTExMSI2pCklWdUwf1tYGOSc5I8F7gM2D6i95KksTKSpZKqOprkPcBXgFOAm6tq3yjeS5LG\nzcjWuKvqDuCOUb2+JI0rvzkpSZ0xuCWpMwa3JHXG4JakzhjcktQZg1uSOmNwS1JnDG5J6ozBLUmd\nMbglqTMGtyR1xuCWpM4Y3JLUGYNbkjpjcEtSZwxuSeqMwS1JnTG4Jakzi/rpsiT7gV8ATwFHq2oy\nyVnA54ENwH7gnVX12OLalCQdsxRX3G+oqk1VNdn2rwV2VNVGYEfblyQtkVEslWwGtrXtbcDbRvAe\nkjS2FhvcBXw1ya4kU622qqoOte2fAKsW+R6SpAGLWuMGXl9VB5O8BLgzyfcHD1ZVJanZTmxBPwWw\nfv36RbYhSeNjUVfcVXWwPR8GvgycDzySZDVAez58gnO3VtVkVU1OTEwspg1JGisLDu4kL0jyomPb\nwJuBvcB2YEsbtgW4bbFNSpJ+YzFLJauALyc59jr/XlX/leTbwK1JrgQeAt65+DYlSccsOLir6kfA\nq2ap/xR442KakiSdmN+clKTOGNyS1BmDW5I6Y3BLUmcMbknqjMEtSZ0xuCWpMwa3JHXG4Jakzhjc\nktQZg1uSOmNwS1JnDG5J6ozBLUmdMbglqTMGtyR1xuCWpM4Y3JLUGYNbkjozZ3AnuTnJ4SR7B2pn\nJbkzyYPt+cxWT5KPJ5lOsifJq0fZvCSNo2GuuD8NXHxc7VpgR1VtBHa0fYBLgI3tMQXcuDRtSpKO\nmTO4q+obwM+OK28GtrXtbcDbBuqfqRnfAs5IsnqpmpUkLXyNe1VVHWrbPwFWte01wMMD4w602m9J\nMpVkZ5KdR44cWWAbkjR+Fv2Xk1VVQC3gvK1VNVlVkxMTE4ttQ5LGxkKD+5FjSyDt+XCrHwTWDYxb\n22qSpCWy0ODeDmxp21uA2wbqV7S7Sy4AnhhYUpEkLYFT5xqQ5HPAhcDZSQ4AHwA+DNya5ErgIeCd\nbfgdwKXANPAr4N0j6FmSxtqcwV1Vl5/g0BtnGVvA1YttSpJ0Yn5zUpI6Y3BLUmcMbknqjMEtSZ0x\nuCWpMwa3JHXG4Jakzhjck
tQZg1uSOmNwS1JnDG5J6ozBLUmdMbglqTMGtyR1xuCWpM4Y3JLUGYNb\nkjpjcEtSZ+YM7iQ3JzmcZO9A7YNJDibZ3R6XDhy7Lsl0kgeSvGVUjUvSuBrmivvTwMWz1G+oqk3t\ncQdAknOBy4BXtnP+JckpS9WsJGmI4K6qbwA/G/L1NgO3VNWTVfVjZn7t/fxF9CdJOs5i1rjfk2RP\nW0o5s9XWAA8PjDnQar8lyVSSnUl2HjlyZBFtSNJ4WWhw3wi8DNgEHAI+Ot8XqKqtVTVZVZMTExML\nbEOSxs+CgruqHqmqp6rq18Cn+M1yyEFg3cDQta0mSVoiCwruJKsHdt8OHLvjZDtwWZLTk5wDbATu\nWVyLkqRBp841IMnngAuBs5McAD4AXJhkE1DAfuAqgKral+RW4D7gKHB1VT01mtYlaTzNGdxVdfks\n5ZueYfz1wPWLaUqSdGJ+c1KSOmNwS1JnDG5J6ozBLUmdMbglqTMGtyR1Zs7bAaWVbNfWq2at//HU\nJ5/lTqThecUtSZ0xuCWpMwa3JHXG4JakzhjcktQZg1uSOmNwS1JnDG5J6ozBLUmdMbglqTMGtyR1\nZs7gTrIuyV1J7kuyL8k1rX5WkjuTPNiez2z1JPl4kukke5K8etSTkKRxMswV91HgfVV1LnABcHWS\nc4FrgR1VtRHY0fYBLmHm1903AlPAjUvetSSNsTmDu6oOVdW9bfsXwP3AGmAzsK0N2wa8rW1vBj5T\nM74FnJFk9ZJ3Lkljal5r3Ek2AOcBdwOrqupQO/QTYFXbXgM8PHDagVY7/rWmkuxMsvPIkSPzbFuS\nxtfQwZ3khcAXgfdW1c8Hj1VVATWfN66qrVU1WVWTExMT8zlVksbaUMGd5DRmQvuzVfWlVn7k2BJI\nez7c6geBdQOnr201SdISGOaukgA3AfdX1ccGDm0HtrTtLcBtA/Ur2t0lFwBPDCypSJIWaZifLnsd\n8C7ge0l2t9r7gQ8Dtya5EngIeGc7dgdwKTAN/Ap495J2LEljbs7grqpvAjnB4TfOMr6AqxfZlyTp\nBPzmpHQcfyhYJzuDW5I6Y3BLUmcMbknqjMEtSZ0xuCWpMwa3JHXG4JakzhjcktQZg1uSOmNwS1Jn\nDG5J6ozBLUmdMbglqTMGtyR1xuCWpM4Y3JLUGYNbkjozzI8Fr0tyV5L7kuxLck2rfzDJwSS72+PS\ngXOuSzKd5IEkbxnlBCRp3AzzY8FHgfdV1b1JXgTsSnJnO3ZDVf3D4OAk5wKXAa8Efg/4WpI/qKqn\nlrJxSRpXc15xV9Whqrq3bf8CuB9Y8wynbAZuqaonq+rHzPza+/lL0awkaZ5r3Ek2AOcBd7fSe5Ls\nSXJzkjNbbQ3w8MBpB3jmoJeWxa6tV/1WzR8KVg+GDu4kLwS+CLy3qn4O3Ai8DNgEHAI+Op83TjKV\nZGeSnUeOHJnPqZI01oYK7iSnMRPan62qLwFU1SNV9VRV/Rr4FL9ZDjkIrBs4fW2r/T9VtbWqJqtq\ncmJiYjFzkKSxMsxdJQFuAu6vqo8N1FcPDHs7sLdtbwcuS3J6knOAjcA9S9eyJI23Ye4qeR3wLuB7\nSXa32vuBy5NsAgrYD1wFUFX7ktwK3MfMHSlXe0eJJC2dOYO7qr4JZJZDdzzDOdcD1y+iL0nSCfjN\nSUnqjMEtSZ0xuCWpMwa3JHXG4JakzhjcktQZg1uSOmNwS1JnDG5J6ozBLUmdMbglqTMGtyR1xuCW\npM4Y3FpRkgz9GMX50rPB4JakzgzzQwrSinX7oamnt9+6eusydiINzytuja3B0J5tXzpZGdyS1Jlh\nfiz4eUnuSfLdJPuSfKjVz0lyd5LpJJ9P8txWP73tT7fjG0Y7BUkaL8NccT8JXFRVrwI2ARcnuQD4\nCHBDVb0ceAy4so2/Enis1W9o46STzvFr2q5xqxfD/FhwAb9su6e1RwEXAX/e6tuADwI3Apv
bNsAX\ngH9OkvY60klj8qqtwG/C+oPL1ok0P0PdVZLkFGAX8HLgE8APgcer6mgbcgBY07bXAA8DVNXRJE8A\nLwYePdHr79q1y/ti1R0/s1ouQwV3VT0FbEpyBvBl4BWLfeMkU8AUwPr163nooYcW+5LSsxqm/iFS\nozQ5OXnCY/O6q6SqHgfuAl4LnJHkWPCvBQ627YPAOoB2/HeBn87yWlurarKqJicmJubThiSNtWHu\nKploV9okeT7wJuB+ZgL8HW3YFuC2tr297dOOf931bUlaOsMslawGtrV17ucAt1bV7UnuA25J8vfA\nd4Cb2vibgH9LMg38DLhsBH1L0tga5q6SPcB5s9R/BJw/S/2/gT9bku4kSb/Fb05KUmcMbknqjMEt\nSZ3xn3XViuINTBoHXnFLUmcMbknqjMEtSZ0xuCWpMwa3JHXG4JakzhjcktQZg1uSOmNwS1JnDG5J\n6ozBLUmdMbglqTMGtyR1xuCWpM4M82PBz0tyT5LvJtmX5EOt/ukkP06yuz02tXqSfDzJdJI9SV49\n6klI0jgZ5t/jfhK4qKp+meQ04JtJ/rMd+6uq+sJx4y8BNrbHa4Ab27MkaQnMecVdM37Zdk9rj2f6\n1+o3A59p530LOCPJ6sW3KkmCIde4k5ySZDdwGLizqu5uh65vyyE3JDm91dYADw+cfqDVJElLYKjg\nrqqnqmoTsBY4P8kfAtcBrwD+BDgL+Jv5vHGSqSQ7k+w8cuTIPNuWpPE1r7tKqupx4C7g4qo61JZD\nngT+FTi/DTsIrBs4bW2rHf9aW6tqsqomJyYmFta9JI2hYe4qmUhyRtt+PvAm4PvH1q2TBHgbsLed\nsh24ot1dcgHwRFUdGkn3kjSGhrmrZDWwLckpzAT9rVV1e5KvJ5kAAuwG/rKNvwO4FJgGfgW8e+nb\nlqTxNWdwV9Ue4LxZ6hedYHwBVy++NUnSbPzmpCR1xuCWpM4Y3JLUGYNbkjpjcEtSZwxuSeqMwS1J\nnTG4JakzBrckdcbglqTOGNyS1BmDW5I6Y3BLUmcMbknqjMEtSZ0xuCWpMwa3JHXG4JakzhjcktQZ\ng1uSOmNwS1JnDG5J6kyqarl7IMkvgAeWu48RORt4dLmbGIGVOi9YuXNzXn35/aqamO3Aqc92Jyfw\nQFVNLncTo5Bk50qc20qdF6zcuTmvlcOlEknqjMEtSZ05WYJ763I3MEIrdW4rdV6wcufmvFaIk+Iv\nJyVJwztZrrglSUNa9uBOcnGSB5JMJ7l2ufuZryQ3JzmcZO9A7awkdyZ5sD2f2epJ8vE21z1JXr18\nnT+zJOuS3JXkviT7klzT6l3PLcnzktyT5LttXh9q9XOS3N36/3yS57b66W1/uh3fsJz9zyXJKUm+\nk+T2tr9S5rU/yfeS7E6ys9W6/iwuxrIGd5JTgE8AlwDnApcnOXc5e1qATwMXH1e7FthRVRuBHW0f\nZua5sT2mgBufpR4X4ijwvqo6F7gAuLr9b9P73J4ELqqqVwGbgIuTXAB8BLihql4OPAZc2cZfCTzW\n6je0cSeza4D7B/ZXyrwA3lBVmwZu/ev9s7hwVbVsD+C1wFcG9q8DrlvOnhY4jw3A3oH9B4DVbXs1\nM/epA3wSuHy2cSf7A7gNeNNKmhvwO8C9wGuY+QLHqa3+9OcS+Arw2rZ9ahuX5e79BPNZy0yAXQTc\nDmQlzKv1uB84+7jaivkszvex3Esla4CHB/YPtFrvVlXVobb9E2BV2+5yvu2P0ecBd7MC5taWE3YD\nh4E7gR8Cj1fV0TZksPen59WOPwG8+NnteGj/CPw18Ou2/2JWxrwACvhqkl1Jplqt+8/iQp0s35xc\nsaqqknR7606SFwJfBN5bVT9P8vSxXudWVU8Bm5KcAXwZeMUyt7RoSd4KHK6qXUkuXO5+RuD1VXUw\nyUuAO5N8f/Bgr5/FhVruK+6DwLqB/bWt1rtHkqwGaM+
HW72r+SY5jZnQ/mxVfamVV8TcAKrqceAu\nZpYQzkhy7EJmsPen59WO/y7w02e51WG8DvjTJPuBW5hZLvkn+p8XAFV1sD0fZub/bM9nBX0W52u5\ng/vbwMb2N9/PBS4Dti9zT0thO7ClbW9hZn34WP2K9rfeFwBPDPxR76SSmUvrm4D7q+pjA4e6nluS\niXalTZLnM7Nufz8zAf6ONuz4eR2b7zuAr1dbOD2ZVNV1VbW2qjYw89/R16vqL+h8XgBJXpDkRce2\ngTcDe+n8s7goy73IDlwK/ICZdca/Xe5+FtD/54BDwP8ys5Z2JTNrhTuAB4GvAWe1sWHmLpofAt8D\nJpe7/2eY1+uZWVfcA+xuj0t7nxvwR8B32rz2An/X6i8F7gGmgf8ATm/157X96Xb8pcs9hyHmeCFw\n+0qZV5vDd9tj37Gc6P2zuJiH35yUpM4s91KJJGmeDG5J6ozBLUmdMbglqTMGtyR1xuCWpM4Y3JLU\nGYNbkjrzf0Ew7Is+EjUWAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - }, - "output_type": "display_data" - } - ], + "metadata": { + "id": "AmFXRrkqZdC-" + }, + "outputs": [], "source": [ "env = make_env()\n", "env.reset()\n", - "plt.imshow(env.render(\"rgb_array\"))\n", + "plt.imshow(env.render())\n", "state_shape, n_actions = env.observation_space.shape, env.action_space.n" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "qOyWgOmvZdC-" + }, "source": [ "### Building a network" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "XqpThLZXZdC-" + }, "source": [ "We now need to build a neural network that can map observations to state q-values.\n", "The model does not have to be huge yet. 1-2 hidden layers with < 200 neurons and ReLU activation will probably be enough. Batch normalization and dropout can spoil everything here." @@ -169,7 +162,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "UVlpkvZOZdC-" + }, "outputs": [], "source": [ "import torch\n", @@ -183,7 +178,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "RFva1cpyZdC-" + }, "outputs": [], "source": [ "class DQNAgent(nn.Module):\n", @@ -198,7 +195,7 @@ " state_dim = state_shape[0]\n", " \n", "\n", - " \n", + "\n", " def forward(self, state_t):\n", " \"\"\"\n", " takes agent's observation (tensor), returns qvalues (tensor)\n", @@ -208,8 +205,11 @@ " qvalues = \n", "\n", " assert qvalues.requires_grad, \"qvalues must be a torch tensor with grad\"\n", - " assert len(\n", - " qvalues.shape) == 2 and qvalues.shape[0] == state_t.shape[0] and qvalues.shape[1] == n_actions\n", + " assert (\n", + " len(qvalues.shape) == 2 and\n", + " qvalues.shape[0] == state_t.shape[0] and\n", + " qvalues.shape[1] == n_actions\n", + " )\n", "\n", " return qvalues\n", "\n", @@ -238,7 +238,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "Bv1s5JKzZdC-" + }, "outputs": [], 
"source": [ "agent = DQNAgent(state_shape, n_actions, epsilon=0.5).to(device)" @@ -246,7 +248,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "vazC0DPQZdC_" + }, "source": [ "Now let's try out our agent to see if it raises any errors." ] @@ -254,39 +258,34 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "e-Sg1cqPZdC_" + }, "outputs": [], "source": [ - "def evaluate(env, agent, n_games=1, greedy=False, t_max=10000):\n", + "def evaluate(env, agent, n_games=1, greedy=False, t_max=10000, seed=None):\n", " \"\"\" Plays n_games full games. If greedy, picks actions as argmax(qvalues). Returns mean reward. \"\"\"\n", " rewards = []\n", " for _ in range(n_games):\n", - " s = env.reset()\n", + " s, _ = env.reset(seed=seed)\n", " reward = 0\n", " for _ in range(t_max):\n", " qvalues = agent.get_qvalues([s])\n", " action = qvalues.argmax(axis=-1)[0] if greedy else agent.sample_actions(qvalues)[0]\n", - " s, r, done, _ = env.step(action)\n", + " s, r, terminated, truncated, _ = env.step(action)\n", " reward += r\n", - " if done:\n", + " if terminated or truncated:\n", " break\n", "\n", " rewards.append(reward)\n", " return np.mean(rewards)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "evaluate(env, agent, n_games=1)" - ] - }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "Y_0NzjUEZdC_" + }, "source": [ "### Experience replay\n", "For this assignment, we provide you with experience replay buffer. 
If you implemented experience replay buffer in last week's assignment, you can copy-paste it here in main notebook **to get 2 bonus points**.\n", @@ -296,7 +295,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "jHyCO4TuZdC_" + }, "source": [ "#### The interface is fairly simple:\n", "* `exp_replay.add(obs, act, rw, next_obs, done)` - saves (s,a,r,s',done) tuple into the buffer\n", @@ -307,18 +308,18 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "wQEHwR1AZdC_" + }, "outputs": [], "source": [ - "from replay_buffer import ReplayBuffer\n", + "from dqn.replay_buffer import ReplayBuffer\n", "exp_replay = ReplayBuffer(10)\n", "\n", "for _ in range(30):\n", - " exp_replay.add(env.reset(), env.action_space.sample(),\n", - " 1.0, env.reset(), done=False)\n", + " exp_replay.add(env.reset()[0], env.action_space.sample(), 1.0, env.reset()[0], done=False)\n", "\n", - "obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(\n", - " 5)\n", + "obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(5)\n", "\n", "assert len(exp_replay) == 10, \"experience replay size should be 10 because that's what maximum capacity is\"" ] @@ -326,14 +327,16 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "0RnFX5sfZdC_" + }, "outputs": [], "source": [ "def play_and_record(initial_state, agent, env, exp_replay, n_steps=1):\n", " \"\"\"\n", - " Play the game for exactly n steps, record every (s,a,r,s', done) to replay buffer. 
\n", - " Whenever game ends, add record with done=True and reset the game.\n", - " It is guaranteed that env has done=False when passed to this function.\n", + " Play the game for exactly n_steps, record every (s,a,r,s', done) to replay buffer.\n", + " Whenever game ends due to termination or truncation, add record with done=terminated and reset the game.\n", + " It is guaranteed that env has terminated=False when passed to this function.\n", "\n", " PLEASE DO NOT RESET ENV UNLESS IT IS \"DONE\"\n", "\n", @@ -351,46 +354,55 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ZXXmFEKGZdC_", + "outputId": "d1b66847-a141-4406-9697-7ebd194fdb6a" + }, "outputs": [], "source": [ "# testing your code.\n", "exp_replay = ReplayBuffer(2000)\n", "\n", - "state = env.reset()\n", + "state, _ = env.reset()\n", "play_and_record(state, agent, env, exp_replay, n_steps=1000)\n", "\n", "# if you're using your own experience replay buffer, some of those tests may need correction.\n", "# just make sure you know what your code does\n", - "assert len(exp_replay) == 1000, \"play_and_record should have added exactly 1000 steps, \"\\\n", - " \"but instead added %i\" % len(exp_replay)\n", + "assert len(exp_replay) == 1000, \\\n", + " \"play_and_record should have added exactly 1000 steps, \" \\\n", + " \"but instead added %i\" % len(exp_replay)\n", "is_dones = list(zip(*exp_replay._storage))[-1]\n", "\n", - "assert 0 < np.mean(is_dones) < 0.1, \"Please make sure you restart the game whenever it is 'done' and record the is_done correctly into the buffer.\"\\\n", - " \"Got %f is_done rate over %i steps. [If you think it's your tough luck, just re-run the test]\" % (\n", - " np.mean(is_dones), len(exp_replay))\n", + "assert 0 < np.mean(is_dones) < 0.1, \\\n", + " \"Please make sure you restart the game whenever it is 'done' and \" \\\n", + " \"record the is_done correctly into the buffer. 
Got %f is_done rate over \" \\\n", + " \"%i steps. [If you think it's your tough luck, just re-run the test]\" % (\n", + " np.mean(is_dones), len(exp_replay))\n", "\n", "for _ in range(100):\n", - " obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(\n", - " 10)\n", + " obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(10)\n", " assert obs_batch.shape == next_obs_batch.shape == (10,) + state_shape\n", - " assert act_batch.shape == (\n", - " 10,), \"actions batch should have shape (10,) but is instead %s\" % str(act_batch.shape)\n", - " assert reward_batch.shape == (\n", - " 10,), \"rewards batch should have shape (10,) but is instead %s\" % str(reward_batch.shape)\n", - " assert is_done_batch.shape == (\n", - " 10,), \"is_done batch should have shape (10,) but is instead %s\" % str(is_done_batch.shape)\n", - " assert [int(i) in (0, 1)\n", - " for i in is_dones], \"is_done should be strictly True or False\"\n", - " assert [\n", - " 0 <= a < n_actions for a in act_batch], \"actions should be within [0, n_actions]\"\n", + " assert act_batch.shape == (10,), \\\n", + " \"actions batch should have shape (10,) but is instead %s\" % str(act_batch.shape)\n", + " assert reward_batch.shape == (10,), \\\n", + " \"rewards batch should have shape (10,) but is instead %s\" % str(reward_batch.shape)\n", + " assert is_done_batch.shape == (10,), \\\n", + " \"is_done batch should have shape (10,) but is instead %s\" % str(is_done_batch.shape)\n", + " assert [int(i) in (0, 1) for i in is_dones], \\\n", + " \"is_done should be strictly True or False\"\n", + " assert [0 <= a < n_actions for a in act_batch], \"actions should be within [0, n_actions)\"\n", "\n", "print(\"Well done!\")" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "uoVGsnHRZdC_" + }, "source": [ "### Target networks\n", "\n", @@ -406,7 +418,13 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": 
{ + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8BLJCNiuZdC_", + "outputId": "6181261a-60cf-4626-fbe6-930a6ccd9896" + }, "outputs": [], "source": [ "target_network = DQNAgent(agent.state_shape, agent.n_actions, epsilon=0.5).to(device)\n", @@ -416,7 +434,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "I_GGShX3ZdC_" + }, "source": [ "### Learning with... Q-learning\n", "Here we write a function similar to `agent.update` from tabular q-learning." @@ -424,7 +444,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "4hbg-xANZdC_" + }, "source": [ "Compute Q-learning TD error:\n", "\n", @@ -435,7 +457,7 @@ "$$ Q_{reference}(s,a) = r(s,a) + \\gamma \\cdot max_{a'} Q_{target}(s', a') $$\n", "\n", "Where\n", - "* $Q_{target}(s',a')$ denotes q-value of next state and next action predicted by __target_network__\n", + "* $Q_{target}(s',a')$ denotes Q-value of next state and next action predicted by __target_network__\n", "* $s, a, r, s'$ are current state, action, reward and next state respectively\n", "* $\\gamma$ is a discount factor defined two cells above.\n", "\n", @@ -448,7 +470,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "VxrEOC7mZdC_" + }, "outputs": [], "source": [ "def compute_td_loss(states, actions, rewards, next_states, is_done,\n", @@ -457,35 +481,32 @@ " check_shapes=False,\n", " device=device):\n", " \"\"\" Compute td loss using torch operations only. Use the formulae above. 
\"\"\"\n", - " states = torch.tensor(states, device=device, dtype=torch.float) # shape: [batch_size, *state_shape]\n", - "\n", - " # for some torch reason should not make actions a tensor\n", - " actions = torch.tensor(actions, device=device, dtype=torch.long) # shape: [batch_size]\n", - " rewards = torch.tensor(rewards, device=device, dtype=torch.float) # shape: [batch_size]\n", + " states = torch.tensor(states, device=device, dtype=torch.float32) # shape: [batch_size, *state_shape]\n", + " actions = torch.tensor(actions, device=device, dtype=torch.int64) # shape: [batch_size]\n", + " rewards = torch.tensor(rewards, device=device, dtype=torch.float32) # shape: [batch_size]\n", " # shape: [batch_size, *state_shape]\n", " next_states = torch.tensor(next_states, device=device, dtype=torch.float)\n", " is_done = torch.tensor(\n", " is_done.astype('float32'),\n", " device=device,\n", - " dtype=torch.float\n", + " dtype=torch.float32,\n", " ) # shape: [batch_size]\n", " is_not_done = 1 - is_done\n", "\n", " # get q-values for all actions in current states\n", - " predicted_qvalues = agent(states)\n", + " predicted_qvalues = agent(states) # shape: [batch_size, n_actions]\n", "\n", " # compute q-values for all actions in next states\n", - " predicted_next_qvalues = target_network(next_states)\n", - " \n", + " predicted_next_qvalues = target_network(next_states) # shape: [batch_size, n_actions]\n", + "\n", " # select q-values for chosen actions\n", - " predicted_qvalues_for_actions = predicted_qvalues[range(\n", - " len(actions)), actions]\n", + " predicted_qvalues_for_actions = predicted_qvalues[range(len(actions)), actions] # shape: [batch_size]\n", "\n", " # compute V*(next_states) using predicted next q-values\n", " next_state_values = \n", "\n", - " assert next_state_values.dim(\n", - " ) == 1 and next_state_values.shape[0] == states.shape[0], \"must predict one value per state\"\n", + " assert next_state_values.dim() == 1 and next_state_values.shape[0] == 
states.shape[0], \\\n", + " \"must predict one value per state\"\n", "\n", " # compute \"target q-values\" for loss - it's what's inside square parentheses in the above formula.\n", " # at the last state use the simplified formula: Q(s,a) = r(s,a) since s' doesn't exist\n", @@ -493,23 +514,24 @@ " target_qvalues_for_actions = \n", "\n", " # mean squared error loss to minimize\n", - " loss = torch.mean((predicted_qvalues_for_actions -\n", - " target_qvalues_for_actions.detach()) ** 2)\n", + " loss = torch.mean((predicted_qvalues_for_actions - target_qvalues_for_actions.detach()) ** 2)\n", "\n", " if check_shapes:\n", - " assert predicted_next_qvalues.data.dim(\n", - " ) == 2, \"make sure you predicted q-values for all actions in next state\"\n", - " assert next_state_values.data.dim(\n", - " ) == 1, \"make sure you computed V(s') as maximum over just the actions axis and not all axes\"\n", - " assert target_qvalues_for_actions.data.dim(\n", - " ) == 1, \"there's something wrong with target q-values, they must be a vector\"\n", + " assert predicted_next_qvalues.data.dim() == 2, \\\n", + " \"make sure you predicted q-values for all actions in next state\"\n", + " assert next_state_values.data.dim() == 1, \\\n", + " \"make sure you computed V(s') as maximum over just the actions axis and not all axes\"\n", + " assert target_qvalues_for_actions.data.dim() == 1, \\\n", + " \"there's something wrong with target q-values, they must be a vector\"\n", "\n", " return loss" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "pgZKcPPnZdC_" + }, "source": [ "Sanity checks" ] @@ -517,27 +539,31 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "Yp8eREoDZdC_" + }, "outputs": [], "source": [ - "obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(\n", - " 10)\n", + "obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(10)\n", "\n", "loss = 
compute_td_loss(obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch,\n", " agent, target_network,\n", " gamma=0.99, check_shapes=True)\n", "loss.backward()\n", "\n", - "assert loss.requires_grad and tuple(loss.data.size()) == (\n", - " ), \"you must return scalar loss - mean over batch\"\n", - "assert np.any(next(agent.parameters()).grad.data.cpu().numpy() !=\n", - " 0), \"loss must be differentiable w.r.t. network weights\"\n", - "assert np.all(next(target_network.parameters()).grad is None), \"target network should not have grads\"" + "assert loss.requires_grad and tuple(loss.data.size()) == (), \\\n", + " \"you must return scalar loss - mean over batch\"\n", + "assert np.any(next(agent.parameters()).grad.data.cpu().numpy() != 0), \\\n", + " \"loss must be differentiable w.r.t. network weights\"\n", + "assert np.all(next(target_network.parameters()).grad is None), \\\n", + " \"target network should not have grads\"" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "8A1QtGVqZdC_" + }, "source": [ "### Main loop\n", "\n", @@ -547,7 +573,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "8lAUT94JZdC_" + }, "outputs": [], "source": [ "from tqdm import trange\n", @@ -558,10 +586,16 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YOk81bdZZdC_", + "outputId": "2fd2404e-19e5-4ebe-b0e1-c6f7593db790" + }, "outputs": [], "source": [ - "seed = \n", + "seed = \n", "random.seed(seed)\n", "np.random.seed(seed)\n", "torch.manual_seed(seed)" @@ -570,13 +604,18 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "13K5t2CTZdDA", + "outputId": "031d4a0a-99c3-4cc3-f7c3-77e4d8a5a331" + }, "outputs": [], "source": [ - "env = make_env(seed)\n", "state_dim = env.observation_space.shape\n", "n_actions = 
env.action_space.n\n", - "state = env.reset()\n", + "state, _ = env.reset(seed=seed)\n", "\n", "agent = DQNAgent(state_dim, n_actions, epsilon=1).to(device)\n", "target_network = DQNAgent(state_dim, n_actions, epsilon=1).to(device)\n", @@ -586,21 +625,31 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "iD7PAlwQZdDA", + "outputId": "aeb4bb67-4776-4b02-e558-d9d1a3306d47" + }, "outputs": [], "source": [ - "exp_replay = ReplayBuffer(10**4)\n", + "from dqn.utils import is_enough_ram, linear_decay\n", + "\n", + "REPLAY_BUFFER_SIZE = 10**4\n", + "\n", + "exp_replay = ReplayBuffer(REPLAY_BUFFER_SIZE)\n", "for i in range(100):\n", - " if not utils.is_enough_ram(min_available_gb=0.1):\n", + " if not is_enough_ram(min_available_gb=0.1):\n", " print(\"\"\"\n", - " Less than 100 Mb RAM available. \n", + " Less than 100 Mb RAM available.\n", " Make sure the buffer size in not too huge.\n", " Also check, maybe other processes consume RAM heavily.\n", " \"\"\"\n", " )\n", " break\n", " play_and_record(state, agent, env, exp_replay, n_steps=10**2)\n", - " if len(exp_replay) == 10**4:\n", + " if len(exp_replay) == REPLAY_BUFFER_SIZE:\n", " break\n", "print(len(exp_replay))" ] @@ -608,7 +657,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "Zl2VCEYQZdDA" + }, "outputs": [], "source": [ "# # for something more complicated than CartPole\n", @@ -633,7 +684,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "x-sD-QyUZdDA" + }, "outputs": [], "source": [ "timesteps_per_epoch = 1\n", @@ -656,98 +709,120 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "piqDfKQAZdDA" + }, "outputs": [], "source": [ "mean_rw_history = []\n", "td_loss_history = []\n", "grad_norm_history = []\n", - "initial_state_v_history = []" + "initial_state_v_history = []\n", + "step = 0" 
] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "Ks8NAV8AZdDA" + }, "outputs": [], "source": [ - "state = env.reset()\n", - "for step in trange(total_steps + 1):\n", - " if not utils.is_enough_ram():\n", - " print('less that 100 Mb RAM available, freezing')\n", - " print('make sure everything is ok and make KeyboardInterrupt to continue')\n", - " try:\n", - " while True:\n", - " pass\n", - " except KeyboardInterrupt:\n", - " pass\n", + "import time\n", "\n", - " agent.epsilon = utils.linear_decay(init_epsilon, final_epsilon, step, decay_steps)\n", + "def wait_for_keyboard_interrupt():\n", + " try:\n", + " while True:\n", + " time.sleep(1)\n", + " except KeyboardInterrupt:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sU3GSGZqZdDA" + }, + "outputs": [], + "source": [ + "state, _ = env.reset()\n", + "with trange(step, total_steps + 1) as progress_bar:\n", + " for step in progress_bar:\n", + " if not is_enough_ram():\n", + " print('less that 100 Mb RAM available, freezing')\n", + " print('make sure everything is ok and use KeyboardInterrupt to continue')\n", + " wait_for_keyboard_interrupt()\n", "\n", - " # play\n", - " _, state = play_and_record(state, agent, env, exp_replay, timesteps_per_epoch)\n", + " agent.epsilon = linear_decay(init_epsilon, final_epsilon, step, decay_steps)\n", "\n", - " # train\n", - " \n", + " # play\n", + " _, state = play_and_record(state, agent, env, exp_replay, timesteps_per_epoch)\n", "\n", - " loss = \n", + " # train\n", + " \n", "\n", - " loss.backward()\n", - " grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)\n", - " opt.step()\n", - " opt.zero_grad()\n", + " loss = \n", "\n", - " if step % loss_freq == 0:\n", - " td_loss_history.append(loss.data.cpu().item())\n", - " grad_norm_history.append(grad_norm)\n", + " loss.backward()\n", + " grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)\n", + 
" opt.step()\n", + " opt.zero_grad()\n", "\n", - " if step % refresh_target_network_freq == 0:\n", - " # Load agent weights into target_network\n", - " \n", + " if step % loss_freq == 0:\n", + " td_loss_history.append(loss.data.cpu().item())\n", + " grad_norm_history.append(grad_norm)\n", "\n", - " if step % eval_freq == 0:\n", - " # eval the agent\n", - " mean_rw_history.append(evaluate(\n", - " make_env(seed=step), agent, n_games=3, greedy=True, t_max=1000)\n", - " )\n", - " initial_state_q_values = agent.get_qvalues(\n", - " [make_env(seed=step).reset()]\n", - " )\n", - " initial_state_v_history.append(np.max(initial_state_q_values))\n", + " if step % refresh_target_network_freq == 0:\n", + " # Load agent weights into target_network\n", + " \n", + "\n", + " if step % eval_freq == 0:\n", + " mean_rw_history.append(evaluate(\n", + " make_env(), agent, n_games=3, greedy=True, t_max=1000, seed=step)\n", + " )\n", + " initial_state_q_values = agent.get_qvalues(\n", + " [make_env().reset(seed=step)[0]]\n", + " )\n", + " initial_state_v_history.append(np.max(initial_state_q_values))\n", "\n", - " clear_output(True)\n", - " print(\"buffer size = %i, epsilon = %.5f\" %\n", - " (len(exp_replay), agent.epsilon))\n", + " clear_output(True)\n", + " print(\"buffer size = %i, epsilon = %.5f\" %\n", + " (len(exp_replay), agent.epsilon))\n", "\n", - " plt.figure(figsize=[16, 9])\n", - " plt.subplot(2, 2, 1)\n", - " plt.title(\"Mean reward per episode\")\n", - " plt.plot(mean_rw_history)\n", - " plt.grid()\n", + " plt.figure(figsize=[16, 9])\n", "\n", - " assert not np.isnan(td_loss_history[-1])\n", - " plt.subplot(2, 2, 2)\n", - " plt.title(\"TD loss history (smoothened)\")\n", - " plt.plot(utils.smoothen(td_loss_history))\n", - " plt.grid()\n", + " plt.subplot(2, 2, 1)\n", + " plt.title(\"Mean reward per episode\")\n", + " plt.plot(mean_rw_history)\n", + " plt.grid()\n", "\n", - " plt.subplot(2, 2, 3)\n", - " plt.title(\"Initial state V\")\n", - " 
plt.plot(initial_state_v_history)\n", - " plt.grid()\n", + " assert not np.isnan(td_loss_history[-1])\n", + " plt.subplot(2, 2, 2)\n", + " plt.title(\"TD loss history\")\n", + " plt.plot(td_loss_history)\n", + " plt.grid()\n", "\n", - " plt.subplot(2, 2, 4)\n", - " plt.title(\"Grad norm history (smoothened)\")\n", - " plt.plot(utils.smoothen(grad_norm_history))\n", - " plt.grid()\n", + " plt.subplot(2, 2, 3)\n", + " plt.title(\"Initial state V\")\n", + " plt.plot(initial_state_v_history)\n", + " plt.grid()\n", "\n", - " plt.show()" + " plt.subplot(2, 2, 4)\n", + " plt.title(\"Grad norm history\")\n", + " plt.plot(grad_norm_history)\n", + " plt.grid()\n", + "\n", + " plt.show()" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "qwWFT2SBZdDA" + }, "outputs": [], "source": [ "final_score = evaluate(\n", @@ -761,7 +836,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "G-feeX9YZdDA" + }, "source": [ "**Agent's predicted V-values vs their Monte-Carlo estimates**" ] @@ -769,11 +846,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "rjVuSIrPZdDA" + }, "outputs": [], "source": [ + "from analysis import play_and_log_episode\n", + "\n", "eval_env = make_env()\n", - "record = utils.play_and_log_episode(eval_env, agent)\n", + "record = play_and_log_episode(eval_env, agent)\n", "print('total reward for life:', np.sum(record['rewards']))\n", "for key in record:\n", " print(key)" @@ -782,7 +863,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "FCacwLw6ZdDA" + }, "outputs": [], "source": [ "fig = plt.figure(figsize=(5, 5))\n", @@ -803,11 +886,17 @@ } ], "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, "language_info": { - "name": "python", - "pygments_lexer": "ipython3" + "name": "python" } }, "nbformat": 4, - "nbformat_minor": 1 + 
"nbformat_minor": 0 } diff --git a/week04_approx_rl/homework_pytorch_main.ipynb b/week04_approx_rl/homework_pytorch_main.ipynb index b1d91e0ec..d2628ec9a 100644 --- a/week04_approx_rl/homework_pytorch_main.ipynb +++ b/week04_approx_rl/homework_pytorch_main.ipynb @@ -2,49 +2,66 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "1NDjJiqysoT-" + }, "source": [ "# Deep Q-Network implementation.\n", "\n", - "This homework shamelessly demands you to implement a DQN - an approximate q-learning algorithm with experience replay and target networks - and see if it works any better this way.\n", + "This homework shamelessly demands you to implement DQN — an approximate Q-learning algorithm with experience replay and target networks — and see if it works any better this way.\n", + "\n", + "**Papers:**\n", + "\n", + "[1] Original paper, 2013: https://arxiv.org/pdf/1312.5602.pdf\n", "\n", - "Original paper:\n", - "https://arxiv.org/pdf/1312.5602.pdf" + "[2] Extended paper, Nature, 2015: https://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf\n", + "\n", + "[3] Rainbow, 2017: https://arxiv.org/pdf/1710.02298.pdf" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "BcLhaXMKsoT_" + }, "source": [ - "**This notebook is the main notebook.** Another notebook is given for debug. (**homework_pytorch_main**). The tasks are similar and share most of the code. The main difference is in environments. In main notebook it can take some 2 hours for the agent to start improving so it seems reasonable to launch the algorithm on a simpler env first. In debug one it is CartPole and it will train in several minutes.\n", + "**This notebook is the main homework notebook.**\n", + "Another notebook is given for debug: (**homework_pytorch_debug**). The debug notebook is not supported anymore, the codes have diverged. However, it can be useful in some cases. 
The tasks are similar and they used to share most of the code. The main difference is in the environments. In the main notebook it can take about 2 hours for the agent to start improving, so it seems reasonable to launch the algorithm on a simpler env first. The debug notebook uses CartPole, which trains in several minutes.\n", "\n", - "**We suggest the following pipeline:** First implement debug notebook then implement the main one.\n", + "**About evaluation:** Points are given only for the main notebook.\n", "\n", - "**About evaluation:** All points are given for the main notebook with one exception: if agent fails to beat the threshold in main notebook you can get 1 pt (instead of 3 pts) for beating the threshold in debug notebook." + "**Plan and evaluation points:**\n", + "1. Getting to know the environment: most of the code is implemented for you\n", + "2. DQN as it is (10 points): the main part of DQN implementation\n", + "3. Main Loop (3 points): the training loop itself. Please note that it can be really time-consuming, and implementation bugs can arise.\n", + "4. 
Interpretation (2 points): calculation of episode statistics and their interpretation" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Starting virtual X frame buffer: Xvfb.\n" - ] - } - ], + "metadata": { + "id": "IVo0UxTWsoT_" + }, + "outputs": [], "source": [ "import sys, os\n", "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n", - " \n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/atari_wrappers.py\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/utils.py\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/replay_buffer.py\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/framebuffer.py\n", + " os.makedirs('dqn', exist_ok=True)\n", + " os.makedirs('test_td_loss', exist_ok=True)\n", + "\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", + "\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/dqn/atari_wrappers.py -P dqn/\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/dqn/utils.py -P dqn/\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/dqn/replay_buffer.py -P dqn/\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/dqn/framebuffer.py -P dqn/\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/dqn/analysis.py -P dqn/\n", + " !wget -q 
https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/dqn/logger.py -P dqn/\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/test_td_loss/compute_td_loss.py -P test_td_loss/\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/requirements.txt\n", + "\n", + " !pip install -r requirements.txt\n", "\n", " !touch .setup_complete\n", "\n", @@ -57,21 +74,47 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "KkrBeP7YsoUA" + }, "source": [ - "__Frameworks__ - we'll accept this homework in any deep learning framework. This particular notebook was designed for pytoch, but you find it easy to adapt it to almost any python-based deep learning framework." + "__Frameworks__ - we'll accept this homework in any deep learning framework. This particular notebook was designed for PyTorch, but you will find it easy to adapt it to almost any Python-based deep learning framework." ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "P8WoWe9DsoUA" + }, "outputs": [], "source": [ - "import random\n", + "%matplotlib inline\n", + "import gymnasium as gym\n", + "import ale_py\n", "import numpy as np\n", - "import torch\n", - "import utils" + "import matplotlib.pyplot as plt\n", + "\n", + "gym.register_envs(ale_py)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6GQBgViKsoUA" + }, + "source": [ + "### Let's play some old videogames\n", + "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/yet_another_week/_resource/nerd.png)\n", + "\n", + "This time we're gonna apply approximate Q-learning to an Atari game called Breakout. It's not the hardest thing out there, but it's definitely way more complex than anything we have tried before." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**These are various versions of Breakout provided by Gymnasium:**" ] }, { @@ -80,65 +123,84 @@ "metadata": {}, "outputs": [], "source": [ - "import gym\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline" + "all_names = list(gym.envs.registry.keys())\n", + "names_breakout = [name for name in all_names if \"Break\" in name]\n", + "names_breakout" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "S_zvw_31soUA" + }, + "outputs": [], + "source": [ + "# ENV_NAME = \"BreakoutDeterministic-v4\" # this one is simpler\n", + "ENV_NAME = \"ALE/Breakout-v5\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Let's play some old videogames\n", - "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/yet_another_week/_resource/nerd.png)\n", + "If you are curious about Atari environments in Gymnasium, please refer to:\n", + "- [4] Gymnasium docs: https://gymnasium.farama.org/environments/atari/\n", + "- [5] Longer paper: https://arxiv.org/abs/1709.06009\n", + "- [6] Shorter paper: https://www.ijcai.org/Proceedings/2018/0787.pdf\n", "\n", - "This time we're gonna apply approximate q-learning to an Atari game called Breakout. 
It's not the hardest thing out there, but it's definitely way more complex than anything we tried before.\n" + "For now it's enough to know about the v5 environments:\n", + "- v5 environments are recommended for use\n", + "- frameskip=4: every 4th frame is shown to the agent, and the chosen action is executed for the next 4 frames\n", + "- randomness comes from repeat_action_probability=0.25: with this probability the previous action is executed instead of the chosen action" ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "cell_type": "markdown", + "metadata": { + "id": "xT9BvasNsoUA" + }, "source": [ - "ENV_NAME = \"BreakoutNoFrameskip-v4\"" + "## Getting to know the Environment" ] }, { "cell_type": "markdown", + "metadata": { + "id": "iwN8jA0OsoUA" + }, + "source": [ + "**Let's see what observations look like.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "## Preprocessing (3 pts)" + "env = gym.make(ENV_NAME, render_mode=\"rgb_array\")\n", + "env.reset()\n", + "plt.imshow(env.render())\n", + "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's see what observations look like." 
+ "**Some more observations, coming from taking random actions**" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAA6UAAAH3CAYAAABD+PmTAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0\ndHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nO3dbazkd3nf/8/1twMPNlQ2N7Us29SG\nOqmgah1YuVYL/GlpyGJFMfQBtVUFJ0VdkEBK5FSVCVJBlSK1aTASaupoEdaaKjHQOgSrclwcNwqq\nUhPWxDHmxtgmRni12AVXQJYIYvvbB+e3MFl295w9c/Od+Z7XSxqdOb+ZOXNZ+/bRXmfm/LZaawEA\nAIAe/r/eAwAAALB3WUoBAADoxlIKAABAN5ZSAAAAurGUAgAA0I2lFAAAgG6WtpRW1YGqeqiqHqmq\nG5f1PNCDvhmdxhmdxhmdxtkktYx/p7Sqzkny5SQ/neTxJJ9Jcl1r7QsLfzJYMX0zOo0zOo0zOo2z\naZb1SumVSR5prX2ltfb9JB9Jcs2SngtWTd+MTuOMTuOMTuNslHOX9HUvSvK1mc8fT/IPZu9QVQeT\nHJw+feWS5oBZ32itvWgBX2fbvhON04XGGd3KGtc3HSyq70TjrKfTNr6spXRbrbVDSQ4lSVUt/j3E\n8KO+uson0zgdaJzRraxxfdOB7+GM7rSNL+vtu0eTXDLz+cXTMRiBvhmdxhmdxhmdxtkoy1pKP5Pk\n8qq6rKqek+TaJHcs6blg1fTN6DTO6DTO6DTORlnK23dba09X1TuT/I8k5yS5pbX2+WU816LddNNN\nO77vDTfcsOvHnvz4eR47r57PfbKTZ1nmc+3WJvedaHzVz30yjS+fxlf73CfT+HLpe7XPfbJN6DvR\n+G4er/EtvRpf2u+UttbuTHLnsr4+9KRvRqdxRqdxRqdxNkm3Ex1tgkX+9OVsHz/vc89jXX/qx+Jp\nnNFpnJHpm9FpfO9Y1u+UAgAAwLa8UsqP2O4nQXvxpzeMReOMTuOMTN+Mbi827pVSAAAAuvFKKdv+\ntGWV76GHZdA4o9M4I9M3o9O4V0oBAADoyCulZzDvTyXmefwqfyKyF376wqlpnNFpnJHpm9FpfO+o\n1lrvGVJV/YdgL7ivtba/xxNrnBXROKPr0ri+WRHfwxndaRv39l0AAAC6WYu371588cVDntqY9dKz\nMY2zChpndL0a0zer4Hs4oztTY14pBQAAoBtLKQAAAN1YSgEAAOjGUgoAAEA3llIAAAC62fVSWlWX\nVNUfVtUXqurzVfVL0/H3VtXRqrp/uly9uHFhdTTO6DTOyPTN6DTOSOb5J2GeTvIrrbXPVtXzktxX\nVXdPt72/tfYb848HXWmc0Wmckemb0WmcYex6KW2tHUtybLr+nar6YpKLFjUY9KZxRqdxRqZvRqdx\nRrKQ3ymtqkuT/FSST0+H3llVD1TVLVV1/mkec7CqjlTVkePHjy9iDFgajTM6jTMyfTM6jbPp5l5K\nq+rHk9ye5Jdba99OcnOSlya5Ils/vXnfqR7XWjvUWtvfWtu/b9++eceApdE4o9M4I9M3o9M4I5hr\nKa2qH8vW/wS/3Vr73SRprT3RWnumtfZskg8muXL+MaEPjTM6jTMyfTM6jTOKec6+W0k+lOSLrbWb\nZo5fOHO3NyV5cPfjQT8aZ3QaZ2T6ZnQaZyTznH33HyX5+SSfq6r7p2O/muS6qroiSUvyWJK3zTUh\n9KNxRqdxRqZvRqdxhjHP2X
f/V5I6xU137n4cWB8aZ3QaZ2T6ZnQaZyTzvFK6MjfccEPvEdgAN910\n0/Z3WlMaZyc0zug2tXF9sxOb2neicXZmnsYX8k/CAAAAwG5YSgEAAOjGUgoAAEA3llIAAAC6sZQC\nAADQjaUUAACAbiylAAAAdGMpBQAAoBtLKQAAAN1YSgEAAOjGUgoAAEA3llIAAAC6sZQCAADQjaUU\nAACAbs6d9wtU1WNJvpPkmSRPt9b2V9Xzk3w0yaVJHkvy5tba/533uWDV9M3oNM7oNM7I9M0oFvVK\n6T9urV3RWts/fX5jkntaa5cnuWf6HDaVvhmdxhmdxhmZvtl4y3r77jVJbp2u35rkjUt6HuhB34xO\n44xO44xM32ycRSylLcknq+q+qjo4HbugtXZsuv71JBec/KCqOlhVR6rqyPHjxxcwBizFrvpONM7G\n0Dij8/cURuZ7OEOY+3dKk7yqtXa0qv5mkrur6kuzN7bWWlW1kx/UWjuU5FCSXHLJJT9yO6yJXfU9\n3aZxNoHGGZ2/pzAy38MZwtyvlLbWjk4fn0zy8SRXJnmiqi5Mkunjk/M+D/Sgb0ancUancUamb0Yx\n11JaVfuq6nknrid5fZIHk9yR5Prpbtcn+cQ8zwM96JvRaZzRaZyR6ZuRzPv23QuSfLyqTnyt32mt\n3VVVn0nysap6a5KvJnnznM8DPeib0Wmc0WmckembYcy1lLbWvpLk75/i+DeTvG6erw296ZvRaZzR\naZyR6ZuRLOJER0t374EDvUdgA/xx7wHmoHF2QuOMblMb1zc7sal9JxpnZ+ZpfFn/TikAAABsy1IK\nAABAN5ZSAAAAurGUAgAA0I2lFAAAgG424uy7z/7tb/ceAZZK44xO44xM34xO4yybV0oBAADoxlIK\nAABAN5ZSAAAAurGUAgAA0I2lFAAAgG424uy7T/2N7/YeAZZK44xO44xM34xO4yybV0oBAADoxlIK\nAABAN7t++25V/WSSj84cekmSf5vkvCT/Ksn/mY7/amvtzl1PCJ1onNFpnNFpnJHpm5HseiltrT2U\n5IokqapzkhxN8vEkv5jk/a2131jIhNCJxhmdxhmdxhmZvhnJok509Lokj7bWvlpVC/qSP/TU3/n+\nwr8mA/rGUr+6xulP44xuQxvXNzuyoX0nGmeH5mh8Ub9Tem2S22Y+f2dVPVBVt1TV+Qt6DuhJ44xO\n44xO44xM32y0uZfSqnpOkp9L8l+nQzcneWm23k5wLMn7TvO4g1V1pKqOHD9+fN4xYGk0zug0zuh2\n07i+2RS+hzOCRbxS+oYkn22tPZEkrbUnWmvPtNaeTfLBJFee6kGttUOttf2ttf379u1bwBiwNBpn\ndBpndGfduL7ZIL6Hs/EWsZRel5m3C1TVhTO3vSnJgwt4DuhJ44xO44xO44xM32y8uU50VFX7kvx0\nkrfNHP71qroiSUvy2Em3wUbROKPTOKPTOCPTN6OYayltrR1P8oKTjv38XBOdwu88++JFf0kG9Pol\nfE2Ns040zug2tXF9sxOb2neicXZmnsYXdfZdAAAAOGuWUgAAALqxlAIAANCNpRQAAIBuLKUAAAB0\nM9fZd1fl+x95b+8R2ASv/+PeE+yaxtkRjTO6DW1c3+zIhvadaJwdmqNxr5QCAADQjaUUAACAbiyl\nAAAAdGMpBQAAoBtLKQAAAN1sxNl3/+ddV/UegQ3ws6+/qfcIu6ZxdkLjjG5TG9c3O7GpfScaZ2fm\nadwrpQAAAHRjKQUAAKAbSykAAADd7GgprapbqurJqnpw5tjzq+ruqnp4+nj+dLyq6gNV9UhVPVBV\nr1jW8LAI+mZ0Gmd0Gmdk+mYv2OkrpYeTHDjp2I1J7mmtXZ7knunzJHlDksuny8EkN88/JizV4eib\nsR2Oxhnb4WiccR2OvhncjpbS1tqnkjx10uFrktw6Xb81yRtnjn+4bbk3yXlVdeEihoVl0Dej
0zij\n0zgj0zd7wTy/U3pBa+3YdP3rSS6Yrl+U5Gsz93t8OgabRN+MTuOMTuOMTN8MZSEnOmqttSTtbB5T\nVQer6khVHTl+/PgixoCl2E3ficbZHBpndP6ewsh8D2cE8yylT5x4O8D08cnp+NEkl8zc7+Lp2F/T\nWjvUWtvfWtu/b9++OcaApZir70TjrD2NMzp/T2FkvoczlHmW0juSXD9dvz7JJ2aOv2U6+9dVSb41\n8/YC2BT6ZnQaZ3QaZ2T6Zijn7uROVXVbktcmeWFVPZ7kPUn+fZKPVdVbk3w1yZunu9+Z5OokjyT5\nbpJfXPDMsFD6ZnQaZ3QaZ2T6Zi/Y0VLaWrvuNDe97hT3bUneMc9QsEr6ZnQaZ3QaZ2T6Zi9YyImO\nAAAAYDcspQAAAHRjKQUAAKAbSykAAADdWEoBAADoxlIKAABAN5ZSAAAAurGUAgAA0I2lFAAAgG4s\npQAAAHRjKQUAAKAbSykAAADdWEoBAADoxlIKAABAN5ZSAAAAutl2Ka2qW6rqyap6cObYf6yqL1XV\nA1X18ao6bzp+aVX9ZVXdP11+a5nDwyJofL3ce+BA7j1woPcYQ9E4I9M3o9M4e8FOXik9nOTkvyHe\nneTvttb+XpIvJ3nXzG2PttaumC5vX8yYsFSHo3HGdjgaXwt+6LIUh6NvxnY4Gmdw2y6lrbVPJXnq\npGOfbK09PX16b5KLlzAbrITGGZ3GGZm+14MfuCyPxtkLFvE7pf8yye/PfH5ZVf1pVf1RVb16AV8f\netM4o9M4I9M3o9P4kvmhy/KdO8+Dq+rdSZ5O8tvToWNJXtxa+2ZVvTLJ71XVy1tr3z7FYw8mOZgk\n559//jxjwNJofPWuuuuu3iPsKRpnZPpmdBpnFLt+pbSqfiHJzyb5F621liStte+11r45Xb8vyaNJ\nfuJUj2+tHWqt7W+t7d+3b99ux4Cl0Tij0/jqXXXXXX7wsiL6ZnQaZyS7eqW0qg4k+TdJ/v/W2ndn\njr8oyVOttWeq6iVJLk/ylYVMCiukcUancUam79Xzw5bV0jij2XYprarbkrw2yQur6vEk78nWGb6e\nm+TuqkqSe6eze70myb+rqr9K8mySt7fWnjrlF4Y1oXFGp3FGpm9Gp/H+/NBl+bZdSltr153i8IdO\nc9/bk9w+71CwShpndBpnZPpmdBpnL1jE2XcBAABgVyylAAAAdGMpBQAAoBtLKQAAAN1YSgEAAOjG\nUgoAAEA3llIAAAC6sZQCAADQjaUUAACAbiylAAAAdGMpBQAAoBtLKQAAAN1YSgEAAOjGUgoAAEA3\nllIAAAC6sZQCAADQzbZLaVXdUlVPVtWDM8feW1VHq+r+6XL1zG3vqqpHquqhqvqZZQ0Oi6JxRqdx\nRqdxRqZv9oKdvFJ6OMmBUxx/f2vtiulyZ5JU1cuSXJvk5dNj/nNVnbOoYWFJDkfjjO1wNM7YDkfj\njOtw9M3gtl1KW2ufSvLUDr/eNUk+0lr7Xmvtz5M8kuTKOeaDpdM4o9M4o9M4I9M3e8E8v1P6zqp6\nYHpLwfnTsYuSfG3mPo9Px35EVR2sqiNVdeT48eNzjAFLo3FGp3FGt+vG9c0G8D2cYex2Kb05yUuT\nXJHkWJL3ne0XaK0daq3tb63t37dv3y7HgKXROKPTOKObq3F9s+Z8D2cou1pKW2tPtNaeaa09m+SD\n+eHbAo4muWTmrhdPx2CjaJzRaZzRaZyR6ZvR7GopraoLZz59U5ITZwO7I8m1VfXcqrosyeVJ/mS+\nEWH1NM7oNM7oNM7I9M1ozt3uDlV1W5LXJnlhVT2e5D1JXltVVyRpSR5L8rYkaa19vqo+luQLSZ5O\n8o7W2jPLGR0WQ+OMTuOMTuOMTN/sBdsupa21605x+ENnuP+vJfm1eYaCVdI4o9M4o9M4I9M3e8E8\nZ98FAACAuVhKAQAA6MZSCgAAQDeWUgAAALqxlAIAANCN
pRQAAIBuLKUAAAB0YykFAACgG0spAAAA\n3VhKAQAA6MZSCgAAQDeWUgAAALqxlAIAANCNpRQAAIBuLKUAAAB0s+1SWlW3VNWTVfXgzLGPVtX9\n0+Wxqrp/On5pVf3lzG2/tczhYRE0zug0zsj0zeg0zl5w7g7uczjJf0ry4RMHWmv//MT1qnpfkm/N\n3P/R1toVixoQVuBwNM7YDkfjjOtw9M3YDkfjDG7bpbS19qmquvRUt1VVJXlzkn+y2LFgdTTO6DTO\nyPTN6DTOXjDv75S+OskTrbWHZ45dVlV/WlV/VFWvPt0Dq+pgVR2pqiPHjx+fcwxYGo0zOo0zMn0z\nOo0zhJ28ffdMrkty28znx5K8uLX2zap6ZZLfq6qXt9a+ffIDW2uHkhxKkksuuaTNOQcsi8YZncYZ\nmb4ZncYZwq5fKa2qc5P8syQfPXGstfa91to3p+v3JXk0yU/MOyT0oHFGp3FGpm9Gp3FGMs/bd/9p\nki+11h4/caCqXlRV50zXX5Lk8iRfmW9E6EbjjE7jjEzfjE7jDGMn/yTMbUn+d5KfrKrHq+qt003X\n5q+/XSBJXpPkgem01P8tydtba08tcmBYNI0zOo0zMn0zOo2zF+zk7LvXneb4L5zi2O1Jbp9/LFgd\njTM6jTMyfTM6jbMXzHv2XQAAANg1SykAAADdWEoBAADoxlIKAABAN5ZSAAAAurGUAgAA0I2lFAAA\ngG62/XdKV+Fb5zyb/37eX/QeY0+698CBuR5/1V13LWiS+f3DT36y9winpfF+NL4aGu9H48un7370\nvRoa70fjW7xSCgAAQDeWUgAAALqxlAIAANDNWvxOKf2s0/vQYRk0zug0zsj0zeg0vsVSyjD8T83o\nNM7oNM7I9M3o5mm8WmsLHGWXQ1T1H4K94L7W2v4eT6xxVkTjjK5L4/pmRXwPZ3SnbdzvlAIAANDN\ntktpVV1SVX9YVV+oqs9X1S9Nx59fVXdX1cPTx/On41VVH6iqR6rqgap6xbL/I2AeGmd0Gmdk+mZ0\nGmcv2MkrpU8n+ZXW2suSXJXkHVX1siQ3JrmntXZ5knumz5PkDUkuny4Hk9y88KlhsTTO6DTOyPTN\n6DTO8LZdSltrx1prn52ufyfJF5NclOSaJLdOd7s1yRun69ck+XDbcm+S86rqwoVPDguicUancUam\nb0ancfaCs/qd0qq6NMlPJfl0kgtaa8emm76e5ILp+kVJvjbzsMenYyd/rYNVdaSqjpzlzLA0Gmd0\nGmdk+mZ0GmdUO15Kq+rHk9ye5Jdba9+eva1tncL3rM7a1Vo71Frb3+ssY3AyjTM6jTMyfTM6jTOy\nHS2lVfVj2fqf4Ldba787HX7ixFsBpo9PTsePJrlk5uEXT8dgbWmc0Wmckemb0Wmc0e3k7LuV5ENJ\nvthau2nmpjuSXD9dvz7JJ2aOv2U689dVSb4189YCWDsaZ3QaZ2T6ZnQaZ09orZ3xkuRV2Xo7wANJ\n7p8uVyd5QbbO9PVwkj9I8vzp/pXkN5M8muRzSfbv4Dmai8sKLkc07jL4ReMuo19+pPHo22Wci+/h\nLqNfTtl4ay01hdhVVfUfgr3gvl6/N6FxVkTjjK5L4/pmRXwPZ3Snbfyszr4LAAAAi2QpBQAAoBtL\nKQAAAN2c23uAyTeSHJ8+bqoXxvw97WT+v7WKQU5D4/3thfl7Nv4XSR7q+Pzz2gt9rLt1btz38P72\nwvz+njKfvdDIOpur8bU40VGSVNWRTf7He83f1ybMvwkznon5+1r3+dd9vu2Yv791/29Y9/m2Y/6+\nNmH+TZjxTMzf17zze/suAAAA3VhKAQAA6GadltJDvQeYk/n72oT5N2HGMzF/X+s+/7rPtx3z97fu\n/w3rPt92zN/XJsy/CTOeifn7mmv+tfmdUgAAAPaedXqlFAAAgD3GUgoAAEA33ZfSqjpQVQ9V1SNV\ndWPveXaiqh6rqs9V
1f1VdWQ69vyquruqHp4+nt97zllVdUtVPVlVD84cO+XMteUD05/JA1X1in6T\n/2DWU83/3qo6Ov053F9VV8/c9q5p/oeq6mf6TP2DWTS+ZPruS+PLp/F+NrHvROOrpvHV2rS+E41v\n+wSttW6XJOckeTTJS5I8J8mfJXlZz5l2OPdjSV540rFfT3LjdP3GJP+h95wnzfeaJK9I8uB2Mye5\nOsnvJ6kkVyX59JrO/94k//oU933Z1NJzk1w2NXZOp7k13q8Pfa9mdo33a0Tjy597I/ueZtd4//k1\nvry5N6rvMzSi8enS+5XSK5M80lr7Smvt+0k+kuSazjPt1jVJbp2u35rkjR1n+RGttU8leeqkw6eb\n+ZokH25b7k1yXlVduJpJT+0085/ONUk+0lr7Xmvtz5M8kq3WetD4Cui7W9+JxldC476HL4jGl0Tj\na2Ft+040nm0a772UXpTkazOfPz4dW3ctySer6r6qOjgdu6C1dmy6/vUkF/QZ7aycbuZN+nN55/S2\nhltm3qaxTvOv0yxnY4TG9b0a6zbPTml8Pax74+s0y9nS+HrQ+HKM0Hei8R/ovZRuqle11l6R5A1J\n3lFVr5m9sW29br1R/9bOJs6c5OYkL01yRZJjSd7Xd5yhDNX4ps070fdyabw/jS+XxvvT+PIM1Xey\nmTNngY33XkqPJrlk5vOLp2NrrbV2dPr4ZJKPZ+vl6CdOvKw+fXyy34Q7drqZN+LPpbX2RGvtmdba\ns0k+mB++LWCd5l+nWXZskMb1vRrrNs+OaLy/DWl8nWY5KxrvT+PLM0jficZ/oPdS+pkkl1fVZVX1\nnCTXJrmj80xnVFX7qup5J64neX2SB7M19/XT3a5P8ok+E56V0818R5K3TGf+uirJt2beWrA2Tnpv\n/Zuy9eeQbM1/bVU9t6ouS3J5kj9Z9XwTjfej79XQeD8aX76N6zvR+LrQ+HIM1Hei8R8601mQVnHJ\n1tmlvpytszK9u/c8O5j3Jdk6m9SfJfn8iZmTvCDJPUkeTvIHSZ7fe9aT5r4tWy+r/1W23tf91tPN\nnK0zff3m9GfyuST713T+/zLN98AU/4Uz93/3NP9DSd7QeXaN9+lD36ubX+N9GtH4ambfqL6nmTW+\nHvNrfDnzblzfZ2hE49OlpgcBAADAyvV++y4AAAB7mKUUAACAbiylAAAAdGMpBQAAoBtLKQAAAN1Y\nSgEAAOjGUgoAAEA3llIAAAC6sZQCAADQjaUUAACAbiylAAAAdGMpBQAAoBtLKQAAAN1YSgEAAOjG\nUgoAAEA3llIAAAC6sZQCAADQjaUUAACAbiylAAAAdGMpBQAAoBtLKQAAAN1YSgEAAOjGUgoAAEA3\nllIAAAC6sZQCAADQjaUUAACAbiylAAAAdGMpBQAAoBtLKQAAAN1YSgEAAOjGUgoAAEA3llIAAAC6\nsZQCAADQjaUUAACAbiylAAAAdGMpBQAAoBtLKQAAAN1YSgEAAOjGUgoAAEA3llIAAAC6sZQCAADQ\njaUUAACAbiylAAAAdGMpBQAAoBtLKQAAAN0sbSmtqgNV9VBVPVJVNy7reaAHfTM6jTM6jTM6jbNJ\nqrW2+C9adU6SLyf56SSPJ/lMkutaa19Y+JPBiumb0Wmc0Wmc0WmcTbOsV0qvTPJIa+0rrbXvJ/lI\nkmuW9FywavpmdBpndBpndBpno5y7pK97UZKvzXz+eJJ/MHuHqjqY5OD06SuXNAfM+kZr7UUL+Drb\n9p1onC40zuhW1ri+6WBRfScaZz2dtvFlLaXbaq0dSnIoSapq8e8hhh/11VU+mcbpQOOMbmWN65sO\nfA9ndKdtfFlv3z2a5JKZzy+ejsEI9M3oNM7oNM7oNM5GWdZS+pkkl1fVZVX1nCTXJrljSc8Fq6Zv\nRqdxRqdxRqdxNspS3r7bWnu6qt6Z5H8kOSfJLa21zy/juRbtpptu2vF9b7jhhl0/9u
THz/PYefV8\n7pOdPMsyn2u3NrnvROOrfu6TaXz5NL7a5z6ZxpdL36t97pNtQt+JxnfzeI1v6dX40n6ntLV2Z5I7\nl/X1oSd9MzqNMzqNMzqNs0m6nehoEyzypy9n+/h5n3se6/pTPxZP44xO44xM34xO43vHsn6nFAAA\nALbllVJ+xHY/CdqLP71hLBpndBpnZPpmdHuxca+UAgAA0I1XStn2py2rfA89LIPGGZ3GGZm+GZ3G\nvVIKAABAR14pPYN5fyoxz+NX+RORvfDTF05N44xO44xM34xO43tHtdZ6z5Cq6j8Ee8F9rbX9PZ5Y\n46yIxhldl8b1zYr4Hs7oTtu4t+8CAADQzVq8fffiiy8e8tTGrJeejWmcVdA4o+vVmL5ZBd/DGd2Z\nGvNKKQAAAN1YSgEAAOjGUgoAAEA3llIAAAC6sZQCAADQza6X0qq6pKr+sKq+UFWfr6pfmo6/t6qO\nVtX90+XqxY0Lq6NxRqdxRqZvRqdxRjLPPwnzdJJfaa19tqqel+S+qrp7uu39rbXfmH886ErjjE7j\njEzfjE7jDGPXS2lr7ViSY9P171TVF5NctKjBoDeNMzqNMzJ9MzqNM5KF/E5pVV2a5KeSfHo69M6q\neqCqbqmq80/zmINVdaSqjhw/fnwRY8DSaJzRaZyR6ZvRaZxNN/dSWlU/nuT2JL/cWvt2kpuTvDTJ\nFdn66c37TvW41tqh1tr+1tr+ffv2zTsGLI3GGZ3GGZm+GZ3GGcFcS2lV/Vi2/if47dba7yZJa+2J\n1tozrbVnk3wwyZXzjwl9aJzRaZyR6ZvRaZxRzHP23UryoSRfbK3dNHP8wpm7vSnJg7sfD/rROKPT\nOCPTN6PTOCOZ5+y7/yjJzyf5XFXdPx371STXVdUVSVqSx5K8ba4JoR+NMzqNMzJ9MzqNM4x5zr77\nv5LUKW66c/fjwPrQOKPTOCPTN6PTOCOZ55XSlbnhhht6j8AGuOmmm7a/05rSODuhcUa3qY3rm53Y\n1L4TjbMz8zS+kH8SBgAAAHbDUgoAAEA3llIAAAC6sZQCAADQjaUUAACAbiylAAAAdGMpBQAAoBtL\nKQAAAN1YSgEAAOjGUgoAAEA3llIAAAC6sZQCAADQjaUUAACAbiylAAAAdHPuvF+gqh5L8p0kzyR5\nurW2v6qen+SjSS5N8liSN7fW/u+8zwWrpm9Gp3FGp3FGpm9GsahXSv9xa+2K1tr+6fMbk9zTWrs8\nyT3T57Cp9M3oNM7oNM7I9M3GW9bbd69Jcut0/dYkb1zS80AP+mZ0Gmd0Gmdk+mbjLGIpbUk+WVX3\nVdXB6dgFrbVj0/WvJ7ng5AdV1cGqOlJVR44fP76AMWApdtV3onE2hsYZnb+nMDLfwxnC3L9TmuRV\nrbWjVfU3k9xdVV+avbG11qqqnfyg1tqhJIeS5JJLLvmR22FN7Krv6TaNswk0zuj8PYWR+R7OEOZ+\npbS1dnT6+GSSjye5MskTVXVhkkwfn5z3eaAHfTM6jTM6jTMyfTOKuZbSqtpXVc87cT3J65M8mOSO\nJNdPd7s+ySfmeR7oQd+MTmOrsbMAABLfSURBVOOMTuOMTN+MZN63716Q5ONVdeJr/U5r7a6q+kyS\nj1XVW5N8Ncmb53we6EHfjE7jjE7jjEzfDGOupbS19pUkf/8Ux7+Z5HXzfG3oTd+MTuOMTuOMTN+M\nZBEnOlq6ew8c6D0CG+CPew8wB42zExpndJvauL7ZiU3tO9E4OzNP48v6d0oBAABgW5ZSAAAAurGU\nAgAA0I2lFAAAgG4spQAAAHSzEWffffZvf7v3CLBUGmd0Gmdk+mZ0GmfZvFIKAABAN5ZSAAAAurGU\nAgAA0I2lFAAAgG4spQAAAHSzEWfffepvfLf3CLBUGmd0Gmdk+mZ0GmfZvFIKAABAN5ZSAAAAutn1\n23er6ieTfHTm0EuS/Nsk5yX5V0n+z3T8V1trd+
56QuhE44xO44xO44xM34xk10tpa+2hJFckSVWd\nk+Roko8n+cUk72+t/cZCJoRONM7oNM7oNM7I9M1IFnWio9clebS19tWqWtCX/KGn/s73F/41GdA3\nlvrVNU5/Gmd0G9q4vtmRDe070Tg7NEfji/qd0muT3Dbz+Tur6oGquqWqzj/VA6rqYFUdqaojx48f\nX9AYsDQaZ3QaZ3Rn1bi+2TC+h7PR5l5Kq+o5SX4uyX+dDt2c5KXZejvBsSTvO9XjWmuHWmv7W2v7\n9+3bN+8YsDQaZ3QaZ3S7aVzfbArfwxnBIl4pfUOSz7bWnkiS1toTrbVnWmvPJvlgkisX8BzQk8YZ\nncYZncYZmb7ZeItYSq/LzNsFqurCmdvelOTBBTwH9KRxRqdxRqdxRqZvNt5cJzqqqn1JfjrJ22YO\n/3pVXZGkJXnspNtgo2ic0Wmc0WmckembUcy1lLbWjid5wUnHfn6uiU7hd5598aK/JAN6/RK+psZZ\nJxpndJvauL7ZiU3tO9E4OzNP44s6+y4AAACcNUspAAAA3VhKAQAA6MZSCgAAQDeWUgAAALqZ6+y7\nq/L9j7y39whsgtf/ce8Jdk3j7IjGGd2GNq5vdmRD+040zg7N0bhXSgEAAOjGUgoAAEA3llIAAAC6\nsZQCAADQjaUUAACAbjbi7Lv/866reo/ABvjZ19/Ue4Rd0zg7oXFGt6mN65ud2NS+E42zM/M07pVS\nAAAAurGUAgAA0I2lFAAAgG52tJRW1S1V9WRVPThz7PlVdXdVPTx9PH86XlX1gap6pKoeqKpXLGt4\nWAR9MzqNMzqNMzJ9sxfs9JXSw0kOnHTsxiT3tNYuT3LP9HmSvCHJ5dPlYJKb5x8Tlupw9M3YDkfj\njO1wNM64DkffDG5HS2lr7VNJnjrp8DVJbp2u35rkjTPHP9y23JvkvKq6cBHDwjLom9FpnNFpnJHp\nm71gnt8pvaC1dmy6/vUkF0zXL0rytZn7PT4d+2uq6mBVHamqI8ePH59jDFiKufpONM7a0zij8/cU\nRuZ7OENZyImOWmstSTvLxxxqre1vre3ft2/fIsaApdhN39PjNM5G0Dij8/cURuZ7OCOYZyl94sTb\nAaaPT07Hjya5ZOZ+F0/HYJPom9FpnNFpnJHpm6HMs5TekeT66fr1ST4xc/wt09m/rkryrZm3F8Cm\n0Dej0zij0zgj0zdDOXcnd6qq25K8NskLq+rxJO9J8u+TfKyq3prkq0nePN39ziRXJ3kkyXeT/OKC\nZ4aF0jej0zij0zgj0zd7wY6W0tbadae56XWnuG9L8o55hoJV0jej0zij0zgj0zd7wUJOdAQAAAC7\nYSkFAACgG0spAAAA3VhKAQAA6MZSCgAAQDeWUgAAALqxlAIAANCNpRQAAIBuLKUAAAB0YykFAACg\nG0spAAAA3VhKAQAA6MZSCgAAQDeWUgAAALqxlAIAANDNtktpVd1SVU9W1YMzx/5jVX2pqh6oqo9X\n1XnT8Uur6i+r6v7p8lvLHB4WQeOMTuOMTN+MTuPsBTt5pfRwkgMnHbs7yd9trf29JF9O8q6Z2x5t\nrV0xXd6+mDFhqQ5H44ztcDTOuA5H34ztcDTO4LZdSltrn0ry1EnHPtlae3r69N4kFy9hNlgJjTM6\njTMyfTM6jbMXLOJ3Sv9lkt+f+fyyqvrTqvqjqnr1Ar4+9KZxRqdxRqZvRqdxNt658zy4qt6d5Okk\nvz0dOpbkxa21b1bVK5P8XlW9vLX27VM89mCSg0ly/vnnzzPG0O49sPVujavuuqvzJHuTxhmdxhmZ\nvhmdxhnFrl8prapfSPKzSf5Fa60lSWvte621b07X70vyaJKfONXjW2uHWmv7W2v79+3bt9sxYGk0\nvnz3Hjjwgx+8sHoaZ2T6ZnQaZyS7Wkqr6kCSf5Pk51pr3505/qKqOme6/pIklyf5yiIGhVXSOKPT\n+PL5oUs/+m
Z0Gmc02759t6puS/LaJC+sqseTvCdbZ/h6bpK7qypJ7p3O7vWaJP+uqv4qybNJ3t5a\ne+qUXxjWhMYZncYZmb5Xz68WrZbG2Qu2XUpba9ed4vCHTnPf25PcPu9Q/JBv+MuncUancUamb0an\n8dXyQ5c+5jrREcA8fMMHAMBSCgBL4IcuALAzllIAAHbMD1yARbOUAgAAxA9detn1v1MKAAAA87KU\nAgAA0I2lFAAAgG4spQAAAHRjKQUAAKAbSykAAADdWEoBAADoxlIKAABAN5ZSAAAAurGUAgAA0I2l\nFAAAgG62XUqr6paqerKqHpw59t6qOlpV90+Xq2due1dVPVJVD1XVzyxrcFgUjTM6jTM6jTMyfbMX\n7OSV0sNJDpzi+Ptba1dMlzuTpKpeluTaJC+fHvOfq+qcRQ0LS3I4Gmdsh6NxxnY4Gmdch6NvBrft\nUtpa+1SSp3b49a5J8pHW2vdaa3+e5JEkV84xHyydxhmdxhmdxhmZvtkL5vmd0ndW1QPTWwrOn45d\nlORrM/d5fDoGm0jjjE7jjE7jjEzfDGO3S+nNSV6a5Iokx5K872y/QFUdrKojVXXk+PHjuxwDlkbj\njE7jjG6uxvXNmvM9nKHsailtrT3RWnumtfZskg/mh28LOJrkkpm7XjwdO9XXONRa299a279v377d\njAFLo3FGp3FGN2/j+mad+R7OaHa1lFbVhTOfvinJibOB3ZHk2qp6blVdluTyJH8y34iwehpndBpn\ndBpnZPpmNOdud4equi3Ja5O8sKoeT/KeJK+tqiuStCSPJXlbkrTWPl9VH0vyhSRPJ3lHa+2Z5YwO\ni6FxRqdxRqdxRqZv9oJtl9LW2nWnOPyhM9z/15L82jxDwSppnNFpnNFpnJHpm71gnrPvAgAAwFws\npQAAAHRjKQUAAKAbSykAAADdWEoBAADoxlIKAABAN5ZSAAAAurGUAgAA0I2lFAAAgG4spQAAAHRj\nKQUAAKAbSykAAADdWEoBAADoxlIKAABAN5ZSAAAAutl2Ka2qW6rqyap6cObYR6vq/unyWFXdPx2/\ntKr+cua231rm8LAIGmd0Gmdk+mZ0GmcvOHcH9zmc5D8l+fCJA621f37ielW9L8m3Zu7/aGvtikUN\nCCtwOBpnbIejccZ1OPpmbIejcQa37VLaWvtUVV16qtuqqpK8Ock/WexYsDoaZ3QaZ2T6ZnQaZy+Y\n93dKX53kidbawzPHLquqP62qP6qqV8/59aE3jTM6jTMyfTM6jTOEnbx990yuS3LbzOfHkry4tfbN\nqnplkt+rqpe31r598gOr6mCSg0ly/vnnzzkGLI3GGZ3GGZm+GZ3GGcKuXymtqnOT/LMkHz1xrLX2\nvdbaN6fr9yV5NMlPnOrxrbVDrbX9rbX9+/bt2+0YsDQaZ3QaZ2T6ZnQaZyTzvH33nyb5Umvt8RMH\nqupFVXXOdP0lSS5P8pX5RoRuNM7oNM7I9M3oNM4wdvJPwtyW5H8n+cmqeryq3jrddG3++tsFkuQ1\nSR6YTkv935K8vbX21CIHhkXTOKPTOCPTN6PTOHvBTs6+e91pjv/CKY7dnuT2+ceC1dE4o9M4I9M3\no9M4e8G8Z98FAACAXbOUAgAA0I2lFAAAgG4spQAAAHRjKQUAAKAbSykAAADdWEoBAADoxlIKAABA\nN+f2HiBJvnXOs/nv5/1F7zH2jHsPHJjr8VfdddeCJlmsf/jJT/Ye4bQ0vjqj9p1onLH7Tta3cX2v\nhr770fhqaPz0vFIKAABAN5ZSAAAAurGUAgAA0M1a/E4pq7Xu70eHeeibkembkemb0Wn89CylDMP/\n6IxO44xO44xM34xunsartbbAUXY5RFX/IdgL7mut7e/xxBpnRTTO6Lo0rm9WxPdwRnfaxrf9ndKq\nuqSq/rCqvlBVn6+qX5qOP7+q7q6qh6eP50/Hq6o+UFWPVNUDVfWKxf63wGJp
nNFpnJHpm9FpnL1g\nJyc6ejrJr7TWXpbkqiTvqKqXJbkxyT2ttcuT3DN9niRvSHL5dDmY5OaFTw2LpXFGp3FGpm9Gp3GG\nt+1S2lo71lr77HT9O0m+mOSiJNckuXW6261J3jhdvybJh9uWe5OcV1UXLnxyWBCNMzqNMzJ9MzqN\nsxec1T8JU1WXJvmpJJ9OckFr7dh009eTXDBdvyjJ12Ye9vh0DNaexhmdxhmZvhmdxhnVjs++W1U/\nnuT2JL/cWvt2Vf3gttZaO9tfkK6qg9l6SwGsBY0zOo0zMn0zOo0zsh29UlpVP5at/wl+u7X2u9Ph\nJ068FWD6+OR0/GiSS2YefvF07K9prR1qre3vdZYxmKVxRqdxRqZvRqdxRreTs+9Wkg8l+WJr7aaZ\nm+5Icv10/fokn5g5/pbpzF9XJfnWzFsLYO1onNFpnJHpm9FpnD2htXbGS5JXJWlJHkhy/3S5OskL\nsnWmr4eT/EGS50/3ryS/meTRJJ9Lsn8Hz9FcXFZwOaJxl8EvGncZ/fIjjUffLuNcfA93Gf1yysZb\na6kpxK7O9j3wsEv+UWpGp3FG16VxfbMivoczutM2flZn3wUAAIBFspQCAADQjaUUAACAbiylAAAA\ndHNu7wEm30hyfPq4qV4Y8/e0k/n/1ioGOQ2N97cX5u/Z+F8keajj889rL/Sx7ta5cd/D+9sL8/t7\nynz2QiPrbK7G1+Lsu0lSVUc2+R/vNX9fmzD/Jsx4Jubva93nX/f5tmP+/tb9v2Hd59uO+fvahPk3\nYcYzMX9f887v7bsAAAB0YykFAACgm3VaSg/1HmBO5u9rE+bfhBnPxPx9rfv86z7fdszf37r/N6z7\nfNsxf1+bMP8mzHgm5u9rrvnX5ndKAQAA2HvW6ZVSAAAA9hhLKQAAAN10X0qr6kBVPVRVj1TVjb3n\n2YmqeqyqPldV91fVkenY86vq7qp6ePp4fu85Z1XVLVX1ZFU9OHPslDPXlg9MfyYPVNUr+k3+g1lP\nNf97q+ro9Odwf1VdPXPbu6b5H6qqn+kz9Q9m0fiS6bsvjS+fxvvZxL4Tja+axldr0/pONL7tE7TW\nul2SnJPk0SQvSfKcJH+W5GU9Z9rh3I8leeFJx349yY3T9RuT/Ifec54032uSvCLJg9vNnOTqJL+f\npJJcleTTazr/e5P861Pc92VTS89NctnU2Dmd5tZ4vz70vZrZNd6vEY0vf+6N7HuaXeP959f48ube\nqL7P0IjGp0vvV0qvTPJIa+0rrbXvJ/lIkms6z7Rb1yS5dbp+a5I3dpzlR7TWPpXkqZMOn27ma5J8\nuG25N8l5VXXhaiY9tdPMfzrXJPlIa+17rbU/T/JItlrrQeMroO9ufScaXwmN+x6+IBpfEo2vhbXt\nO9F4tmm891J6UZKvzXz++HRs3bUkn6yq+6rq4HTsgtbasen615Nc0Ge0s3K6mTfpz+Wd09sabpl5\nm8Y6zb9Os5yNERrX92qs2zw7pfH1sO6Nr9MsZ0vj60HjyzFC34nGf6D3UrqpXtVae0WSNyR5R1W9\nZvbGtvW69Ub9WzubOHOSm5O8NMkVSY4leV/fcYYyVOObNu9E38ul8f40vlwa70/jyzNU38lmzpwF\nNt57KT2a5JKZzy+ejq211trR6eOTST6erZejnzjxsvr08cl+E+7Y6WbeiD+X1toTrbVnWmvPJvlg\nfvi2gHWaf51m2bFBGtf3aqzbPDui8f42pPF1muWsaLw/jS/PIH0nGv+B3kvpZ5JcXlWXVdVzklyb\n5I7OM51RVe2rqueduJ7k9UkezNbc1093uz7JJ/pMeFZON/MdSd4ynfnrqiTfmnlrwdo46b31b8rW\nn0OyNf+1VfXcqrosyeVJ/mTV80003o++V0Pj/Wh8+Tau70Tj60LjyzFQ34nGf+hMZ0FaxSVbZ5f6\ncrbOyvTu3vPsYN6XZOtsUn+W5PMnZk7y
giT3JHk4yR8keX7vWU+a+7Zsvaz+V9l6X/dbTzdzts70\n9ZvTn8nnkuxf0/n/yzTfA1P8F87c/93T/A8leUPn2TXepw99r25+jfdpROOrmX2j+p5m1vh6zK/x\n5cy7cX2foRGNT5eaHgQAAAAr1/vtuwAAAOxhllIAAAC6sZQCAADQjaUUAACAbiylAAAAdGMpBQAA\noBtLKQAAAN38P4MFx8hKCA3KAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - }, - "output_type": "display_data" - } - ], - "source": [ - "env = gym.make(ENV_NAME)\n", + "metadata": { + "id": "rUZHU2HdsoUB" + }, + "outputs": [], + "source": [ + "env = gym.make(ENV_NAME, render_mode=\"rgb_array\")\n", "env.reset()\n", "\n", "n_cols = 5\n", @@ -148,7 +210,7 @@ "for row in range(n_rows):\n", " for col in range(n_cols):\n", " ax = fig.add_subplot(n_rows, n_cols, row * n_cols + col + 1)\n", - " ax.imshow(env.render('rgb_array'))\n", + " ax.imshow(env.render())\n", " env.step(env.action_space.sample())\n", "plt.show()" ] @@ -156,6 +218,44 @@ { "cell_type": "markdown", "metadata": {}, + "source": [ + "**About the game:** You have 5 lives and get points for breaking the wall. Higher bricks cost more than the lower ones. There are 4 actions: start game (should be called at the beginning and after each life is lost), move left, move right and do nothing. There are some common wrappers used for Atari environments." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Let's take a look at action meanings:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "env.unwrapped.get_action_meanings()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. NOOP stands for the action of doing nothing\n", + "2. Right and Left move the platform to the corresponding direction\n", + "3. Fire releases the ball in the beginning of a life\n", + "\n", + "In this assignment we will wrap the environment to execute the \"Fire\" action in the beginning of a life automatically. It will turn the \"FIRE\" action into another \"NOOP\". \n", + "Also, we will wrap the environment to make an episode last for 1 life instead of 5 ones. \n", + "These transforms are claimed as non-recommended in the paper [5] but it was done in the original paper [1] and it will help the training to converge faster. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hLNt1fbnsoUB" + }, "source": [ "**Let's play a little.**\n", "\n", @@ -165,33 +265,73 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "WOIL47azsoUB" + }, "outputs": [], "source": [ - "# # does not work in Colab.\n", - "# # make keyboard interrupt to continue\n", + "# # Does not work in Colab.\n", + "# Even on a local laptop breaks matplotlib rendering. So it's recommended to restart the notebook after playing.\n", + "# # Use the Escape button to continue.\n", "\n", - "# from gym.utils.play import play\n", + "# from gymnasium.utils.play import play\n", "\n", - "# play(env=gym.make(ENV_NAME), zoom=5, fps=30)" + "# play(env=gym.make(ENV_NAME, render_mode=\"rgb_array\"), zoom=4, fps=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Processing game image \n", + "## Wrapping the Environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def make_basic_env():\n", + " return gym.make(ENV_NAME, render_mode=\"rgb_array\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5DPrxQuXsoUB" + }, + "source": [ + "### Processing game image\n", "\n", - "Raw Atari images are large, 210x160x3 by default. However, we don't need that level of detail in order to learn them.\n", + "Let's check the shape and the dtype of the observation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "env = make_basic_env()\n", + "obs, *_ = env.reset()\n", + "obs.shape, obs.dtype" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Raw Atari images are large, 210x160x3 by default. 
However, we don't need that level of detail in order to learn from them.\n", "\n", "We can thus save a lot of time by preprocessing game image, including\n", - "* Resizing to a smaller shape, 64 x 64\n", + "* Resizing to a smaller shape, 64x64 (or 84 x 84, which is used in literature)\n", "* Converting to grayscale\n", - "* Cropping irrelevant image parts (top, bottom and edges)\n", - "\n", - "Also please keep one dimension for channel so that final shape would be 1 x 64 x 64.\n", + "* Cropping irrelevant image parts (top, bottom and edges) \\[we won't do this\\]\n", "\n", - "Tip: You can implement your own grayscale converter and assign a huge weight to the red channel. This dirty trick is not necessary but it will speed up learning." + "The images are of the uint8 dtype. \n", + "uint8 stands for the 8-bit unsigned integer type. \n", + "We are going to store 10^5 or 10^6 observations in memory (RAM), so let's pay attention to preserving the 8-bit type after our transforms." ] }, { @@ -200,35 +340,73 @@ "metadata": {}, "outputs": [], "source": [ - "from gym.core import ObservationWrapper\n", - "from gym.spaces import Box\n", + "def apply_gray_scale_wrap(env):\n", + " # With the argument values chosen as below, the gym.wrappers.AtariPreprocessing wrapper\n", + " # only converts images to grayscale and downsamples them to the screen_size\n", + " env = gym.wrappers.AtariPreprocessing(\n", + " env,\n", + " noop_max=0, # the default value 30 can be harmful with FireResetEnv and frame_skip=5\n", + " frame_skip=1, # frame_skip has already been set to 5 inside the env\n", + " terminal_on_life_loss=False, # we do this explicitly in the EpisodicLifeEnv wrapper\n", + " screen_size=84 # please use 84 (which is the standard value) or 64 (which will save some computations and memory)\n", + " )\n", + " return env\n", "\n", "\n", - "class PreprocessAtariObs(ObservationWrapper):\n", - " def __init__(self, env):\n", - " \"\"\"A gym wrapper that crops, scales image into the desired shapes 
and grayscales it.\"\"\"\n", - " ObservationWrapper.__init__(self, env)\n", + "env = make_basic_env()\n", + "env = apply_gray_scale_wrap(env)\n", "\n", - " self.img_size = (1, 64, 64)\n", - " self.observation_space = Box(0.0, 1.0, self.img_size)\n", + "obs, *_ = env.reset()\n", "\n", + "assert obs.dtype == np.dtype('uint8'), obs_dtype\n", "\n", - " def _to_gray_scale(self, rgb, channel_weights=[0.8, 0.1, 0.1]):\n", - " \n", + "print(obs.shape, obs.dtype)\n", + "plt.imshow(obs)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Atari specific wrappers\n", + "\n", + "We try to make our lives a little easier with the following wrappers:\n", + "1. EpisodicLifeEnv: it makes the signal that dropping a ball is not good more explicit\n", + "2. FireResetEnv: with it the agent doesn't have to perform a special action to fire the ball in the beginning of a life" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dqn.atari_wrappers import FireResetEnv\n", + "from dqn.atari_wrappers import EpisodicLifeEnv\n", + "\n", + "def apply_atary_specific_wrap(env):\n", + " env = EpisodicLifeEnv(env)\n", + " env = FireResetEnv(env)\n", + " return env\n", "\n", + "env = make_basic_env()\n", + "env = apply_gray_scale_wrap(env)\n", + "env = apply_atary_specific_wrap(env)\n", "\n", - " def observation(self, img):\n", - " \"\"\"what happens to each observation\"\"\"\n", + "obs, *_ = env.reset()\n", "\n", - " # Here's what you need to do:\n", - " # * crop image, remove irrelevant parts\n", - " # * resize image to self.img_size\n", - " # (use imresize from any library you want,\n", - " # e.g. 
opencv, skimage, PIL, keras)\n", - " # * cast image to grayscale\n", - " # * convert image pixels to (0,1) range, float32 type\n", - " \n", - " return " + "print(obs.shape, obs.dtype)\n", + "plt.imshow(obs)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### FrameStack\n", + "To make the game playable from a single observation (note the direction of the ball), we stack 4 consecutive frames:" ] }, { @@ -237,52 +415,58 @@ "metadata": {}, "outputs": [], "source": [ - "import gym\n", - "# spawn game instance for tests\n", - "env = gym.make(ENV_NAME) # create raw env\n", - "env = PreprocessAtariObs(env)\n", - "observation_shape = env.observation_space.shape\n", - "n_actions = env.action_space.n\n", - "env.reset()\n", - "obs, _, _, _ = env.step(env.action_space.sample())\n", + "N_FRAMES_STACKED = 4\n", "\n", - "# test observation\n", - "assert obs.ndim == 3, \"observation must be [channel, h, w] even if there's just one channel\"\n", - "assert obs.shape == observation_shape\n", - "assert obs.dtype == 'float32'\n", - "assert len(np.unique(obs)) > 2, \"your image must not be binary\"\n", - "assert 0 <= np.min(obs) and np.max(\n", - " obs) <= 1, \"convert image pixels to [0,1] range\"\n", + "def make_final_env(apply_frame_stack=True):\n", + " \"\"\"\n", + " Builds the environment with all the wrappers applied.\n", + " The environment is meant to be used directly as an RL algorithm input.\n", "\n", - "assert np.max(obs) >= 0.5, \"It would be easier to see a brighter observation\"\n", - "assert np.mean(obs) >= 0.1, \"It would be easier to see a brighter observation\"\n", + " apply_frame_stack=False can be useful for vectorized environments, which are not required for this assignment.\n", + " \"\"\"\n", + " env = make_basic_env()\n", + " env = apply_gray_scale_wrap(env)\n", + " env = apply_atary_specific_wrap(env)\n", + " if apply_frame_stack:\n", + " env = # your code. 
Please, use gym.wrappers.FrameStackObservation\n", + " return env\n", "\n", - "print(\"Formal tests seem fine. Here's an example of what you'll get.\")\n", "\n", - "n_cols = 5\n", - "n_rows = 2\n", - "fig = plt.figure(figsize=(16, 9))\n", - "obs = env.reset()\n", - "for row in range(n_rows):\n", - " for col in range(n_cols):\n", - " ax = fig.add_subplot(n_rows, n_cols, row * n_cols + col + 1)\n", - " ax.imshow(obs[0, :, :], interpolation='none', cmap='gray')\n", - " obs, _, _, _ = env.step(env.action_space.sample())\n", - "plt.show()\n" + "env = make_final_env()\n", + "\n", + "obs, *_ = env.reset()\n", + "print(f\"Shape: {obs.shape}, dtype: {obs.dtype}, Python object type: {type(obs)}\")\n", + "for _ in range(N_FRAMES_STACKED - 1):\n", + " obs, *_ = env.step(env.action_space.sample())\n", + "print()\n", + "\n", + "\n", + "print(\"Frames, left to right: from older to more recent. The ball is dropping.\")\n", + "_, axes = plt.subplots(figsize=(len(obs) * 3, 4), ncols=len(obs))\n", + "for ax, frame in zip(axes, obs):\n", + " ax.imshow(frame)\n", + "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Wrapping." + "**That is the final version of the environment we are going to do RL on.**\n", + "\n", + "**Let's discuss the representation of an observation.**\\\n", + "An observation is a 4-frame-stack of grayscale images with reduced resolution.\\\n", + "Memory (RAM) is a high-demand resource in this task. That's why:\n", + "1. We use the uint8 dtype instead of the float32 dtype the neural network will operate on\n", + "2. We don't represent them as numpy.ndarrays. **LazyFrames** are used by gym.wrappers.FrameStack instead. 2 consecutive observations share 3 of 4 frames. LazyFrames make use of this fact to save memory.\n", + "When we feed the observations to neural networks, we should remember to scale them to the \\[0, 1\\] range. We'll implement scaling as the first layer of a neural network, but that'll be later." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**About the game:** You have 5 lives and get points for breaking the wall. Higher bricks cost more than the lower ones. There are 4 actions: start game (should be called at the beginning and after each life is lost), move left, move right and do nothing. There are some common wrappers used for Atari environments." + "**The ball is dropping, but it's hard to notice. Let's define a function to render more human-readable images:**" ] }, { @@ -291,43 +475,33 @@ "metadata": {}, "outputs": [], "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "import atari_wrappers\n", - "\n", - "def PrimaryAtariWrap(env, clip_rewards=True):\n", - " assert 'NoFrameskip' in env.spec.id\n", - "\n", - " # This wrapper holds the same action for frames and outputs\n", - " # the maximal pixel value of 2 last frames (to handle blinking\n", - " # in some envs)\n", - " env = atari_wrappers.MaxAndSkipEnv(env, skip=4)\n", + "def merge_frame_stack_to_plot(frame_stack_obs: np.ndarray | gym.wrappers.frame_stack.LazyFrames):\n", + " \"\"\"\n", + " A helper function to plot a frame stack as a single human-interpretable image.\n", "\n", - " # This wrapper sends done=True when each life is lost\n", - " # (not all the 5 lives that are givern by the game rules).\n", - " # It should make easier for the agent to understand that losing is bad.\n", - " env = atari_wrappers.EpisodicLifeEnv(env)\n", + " Brighter pixels are more recent, pale pixels are older.\n", + " Motion goes from pale to bright.\n", "\n", - " # This wrapper laucnhes the ball when an episode starts.\n", - " # Without it the agent has to learn this action, too.\n", - " # Actually it can but learning would take longer.\n", - " env = atari_wrappers.FireResetEnv(env)\n", + " Note! 
This function is designed for human vision convenience and it is NOT supposed to be used as part of\n", + " data preprocessing for the Reinforcement Learning agent.\n", + " \"\"\"\n", + " weights = np.ones(frame_stack_obs.shape[0], dtype=float)\n", + " weights[-1] += weights.sum()\n", + " weights /= weights.sum()\n", + " result = (weights[:, None, None] * frame_stack_obs).sum(0)\n", + " return result\n", "\n", - " # This wrapper transforms rewards to {-1, 0, 1} according to their sign\n", - " if clip_rewards:\n", - " env = atari_wrappers.ClipRewardEnv(env)\n", "\n", - " # This wrapper is yours :)\n", - " env = PreprocessAtariObs(env)\n", - " return env" + "obs_joint = merge_frame_stack_to_plot(obs)\n", + "plt.imshow(obs_joint)\n", + "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Let's see if the game is still playable after applying the wrappers.**\n", - "At playing the EpisodicLifeEnv wrapper seems not to work but actually it does (because after when life finishes a new ball is dropped automatically - it means that FireResetEnv wrapper understands that a new episode began)." 
+ "Hope that's better" ] }, { @@ -336,30 +510,92 @@ "metadata": {}, "outputs": [], "source": [ - "# # does not work in Colab.\n", - "# # make keyboard interrupt to continue\n", + "N_ACTIONS = env.action_space.n\n", + "STATE_SHAPE = env.observation_space.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4iJM3IAwsoUB" + }, + "source": [ + "**Let's see if the game is still playable after applying the wrappers.**\n", + "At playing the EpisodicLifeEnv wrapper seems not to work but actually it does (because after when life finishes a new ball is dropped automatically - it means that FireResetEnv wrapper understands that a new episode began).\n", "\n", - "# from gym.utils.play import play\n", + "**Not supported for now.**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jhiOKsQvsoUC" + }, + "source": [ + "## DQN as it is (10 pts)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aspwJFiGsoUC" + }, + "source": [ + "### Building a network\n", "\n", - "# def make_play_env():\n", - "# env = gym.make(ENV_NAME)\n", - "# env = PrimaryAtariWrap(env)\n", - "# # in torch imgs have shape [c, h, w] instead of common [h, w, c]\n", - "# env = atari_wrappers.AntiTorchWrapper(env)\n", - "# return env\n", + "We now need to build a neural network that can map images to state q-values. This network will be called on every agent's step so it better not be resnet-152 unless you have an array of GPUs. Instead, you can use strided convolutions with a small number of features to save time and memory.\n", "\n", - "# play(make_play_env(), zoom=10, fps=3)" + "You can build any architecture you want, but you can find a couple of examples on diagrams below." 
] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "YbZIucfksoUC" + }, "source": [ - "### Frame buffer\n", + "**Dueling network:**\\\n", + "Paper: https://arxiv.org/pdf/1511.06581.pdf\n", + "$$Q_{\\theta}(s, a) = V_{\\eta}(f_{\\xi}(s)) + A_{\\psi}(f_{\\xi}(s), a) - \\frac{\\sum_{a'}A_{\\psi}(f_{\\xi}(s), a')}{N_{actions}},$$\n", + "where $\\xi$, $\\eta$, and $\\psi$ are, respectively, the parameters of the\n", + "shared encoder $f_{\\xi}$, of the value stream $V_{\\eta}$, and of the advantage stream $A_{\\psi}$; and $\\theta = \\{\\xi, \\eta, \\psi\\}$ is their concatenation.\n", + "\n", + "This is what it looks like:\n", + "\n", + "Simple, expects height=width=64\n", "\n", - "Our agent can only process one observation at a time, so we gotta make sure it contains enough information to find optimal actions. For instance, agent has to react to moving objects so he must be able to measure object's velocity.\n", + "\n", + "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/week04_approx_rl/img/dueling_basic.png)\n", "\n", - "To do so, we introduce a buffer that stores 4 last images. 
This time everything is pre-implemented for you, not really by the staff of the course :)" + "Nature DQN ([2]), expects height=width=84\n", + "\n", + "\n", + "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/week04_approx_rl/img/dueling_nature.png)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "SPPmY6wIsoUC", + "outputId": "717e2355-008e-4994-b5f2-1c8cf98ac445" + }, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", + "device" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These constants will be useful" ] }, { @@ -368,20 +604,7 @@ "metadata": {}, "outputs": [], "source": [ - "from framebuffer import FrameBuffer\n", - "\n", - "def make_env(clip_rewards=True, seed=None):\n", - " env = gym.make(ENV_NAME) # create raw env\n", - " if seed is not None:\n", - " env.seed(seed)\n", - " env = PrimaryAtariWrap(env, clip_rewards)\n", - " env = FrameBuffer(env, n_frames=4, dim_order='pytorch')\n", - " return env\n", - "\n", - "env = make_env()\n", - "env.reset()\n", - "n_actions = env.action_space.n\n", - "state_shape = env.observation_space.shape" + "N_ACTIONS, N_FRAMES_STACKED" ] }, { @@ -390,56 +613,139 @@ "metadata": {}, "outputs": [], "source": [ - "for _ in range(12):\n", - " obs, _, _, _ = env.step(env.action_space.sample())\n", + "class ConvBackbone(nn.Sequential):\n", + " \"\"\"\n", + " The convolutional part of a DQN model.\n", + " Please, don't think about input scaling here: it will be implemented below.\n", + " \"\"\"\n", + " def __init__(self, c_in: int = N_FRAMES_STACKED) -> None:\n", + " super().__init__(\n", + " nn.Conv2d(...),\n", + " ...\n", + " # your code,\n", + " nn.Flatten(),\n", + " )\n", "\n", - "plt.figure(figsize=[12,10])\n", - "plt.title(\"Game image\")\n", - 
"plt.imshow(env.render(\"rgb_array\"))\n", - "plt.show()\n", "\n", - "plt.figure(figsize=[15,15])\n", - "plt.title(\"Agent observation (4 frames top to bottom)\")\n", - "plt.imshow(utils.img_by_obs(obs, state_shape), cmap='gray')\n", - "plt.show()" + "class DuelingDqnHead(nn.Module):\n", + " \"\"\"\n", + " Implenets the Dueling DQN logic.\n", + " Please, don't think about gradient scaling here (if you know what it is about): it will be implemented below.\n", + " \"\"\"\n", + " def __init__(self, n_actions, inp_size=64 * 7 * 7, hidden_size=512) -> None:\n", + " super().__init__()\n", + " self.adv_stream = nn.Sequential(\n", + " # your code\n", + " )\n", + " self.value_stream = nn.Sequential(\n", + " # your code\n", + " )\n", + "\n", + " def forward(self, x: torch.Tensor) -> torch.Tensor:\n", + " assert x.ndim == 2, x.shape # (batch_size, n_features)\n", + " # your code\n", + " # When calculating the mean advantage, please, remember, x is a batched input!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## DQN as it is (4 pts)" + "Let's make a simple test for the network architecture:" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "### Building a network\n", + "@torch.no_grad()\n", + "def test_network_part_shapes(backbone, head):\n", + " batch_size = 3\n", + " env = make_final_env()\n", + " s, _ = env.reset()\n", + " inp = torch.rand(batch_size, *s.shape)\n", "\n", - "We now need to build a neural network that can map images to state q-values. This network will be called on every agent's step so it better not be resnet-152 unless you have an array of GPUs. 
Instead, you can use strided convolutions with a small number of features to save time and memory.\n", + " features = backbone(inp)\n", + " qvalues = head(features)\n", "\n", - "You can build any architecture you want, but for reference, here's something that will more or less work:" + " assert features.ndim == 2, features.shape\n", + " assert features.shape[0] == batch_size, features.shape\n", + " \n", + " assert qvalues.ndim == 2, qvalues.shape\n", + " assert qvalues.shape[0] == batch_size, qvalues.shape\n", + " assert qvalues.shape[1] == N_ACTIONS, qvalues.shape\n", + "\n", + " print(\"Test passed!\")\n", + "\n", + "test_network_part_shapes(\n", + " backbone=ConvBackbone(N_FRAMES_STACKED),\n", + " head=DuelingDqnHead(N_ACTIONS),\n", + ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/yet_another_week/_resource/dqn_arch.png)" + "**Now let's build a full model.**" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "**Dueling network: (+2 pts)**\n", - "$$Q_{\\theta}(s, a) = V_{\\eta}(f_{\\xi}(s)) + A_{\\psi}(f_{\\xi}(s), a) - \\frac{\\sum_{a'}A_{\\psi}(f_{\\xi}(s), a')}{N_{actions}},$$\n", - "where $\\xi$, $\\eta$, and $\\psi$ are, respectively, the parameters of the\n", - "shared encoder $f_ξ$ , of the value stream $V_\\eta$ , and of the advan\n", - "tage stream $A_\\psi$; and $\\theta = \\{\\xi, \\eta, \\psi\\}$ is their concatenation.\n", + "MAX_UINT_8 = 2 ** 8 - 1\n", + "\n", + "\n", + "class InputScaler(nn.Module):\n", + " def __init__(self, mult=1 / MAX_UINT_8):\n", + " super().__init__()\n", + " self.mult = mult\n", "\n", - "For the architecture on the image $V$ and $A$ heads can follow the dense layer instead of $Q$. Please don't worry that the model becomes a little bigger." 
+ " def forward(self, x: torch.Tensor) -> torch.Tensor:\n", + " return x * self.mult\n", + "\n", + "\n", + "class GradScalerFunctional(torch.autograd.Function):\n", + " \"\"\"\n", + " A torch.autograd.Function works as Identity on forward pass\n", + " and scales the gradient by scale_factor on backward pass.\n", + " \"\"\"\n", + " @staticmethod\n", + " def forward(ctx, input, scale_factor):\n", + " ctx.scale_factor = scale_factor\n", + " return input\n", + "\n", + " @staticmethod\n", + " def backward(ctx, grad_output):\n", + " scale_factor = ctx.scale_factor\n", + " grad_input = grad_output * scale_factor\n", + " return grad_input, None\n", + "\n", + "\n", + "class GradScaler(nn.Module):\n", + " \"\"\"\n", + " An nn.Module encapsulating GradScalerFunctional\n", + " \"\"\"\n", + " def __init__(self, scale_factor: float):\n", + " super().__init__()\n", + " self.scale_factor = scale_factor\n", + "\n", + " def forward(self, x):\n", + " return GradScalerFunctional.apply(x, self.scale_factor)\n", + "\n", + "\n", + "class DQNetworkDueling(nn.Sequential):\n", + " def __init__(self, c_in: int, n_actions: int) -> None:\n", + " input_scaler = InputScaler() # the inputs come from the uint8 range\n", + " backbone = ConvBackbone(c_in=c_in) # your code\n", + " grad_scaler = GradScaler(1 / 2**0.5) # Dueling DQN suggests to scale the gradient by 1 / sqrt(2)\n", + " head = DuelingDqnHead(n_actions=n_actions)\n", + " super().__init__(input_scaler, backbone, grad_scaler, head)" ] }, { @@ -448,28 +754,30 @@ "metadata": {}, "outputs": [], "source": [ - "import torch\n", - "import torch.nn as nn\n", - "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", - "# those who have a GPU but feel unfair to use it can uncomment:\n", - "# device = torch.device('cpu')\n", - "device" + "@torch.no_grad()\n", + "def test_network_shapes(model):\n", + " batch_size = 3\n", + " env = make_final_env()\n", + " s, _ = env.reset()\n", + " inp = torch.rand(batch_size, *s.shape)\n", + "\n", 
+ " qvalues = model(inp)\n", + " \n", + " assert qvalues.ndim == 2, qvalues.shape\n", + " assert qvalues.shape[0] == batch_size, qvalues.shape\n", + " assert qvalues.shape[1] == N_ACTIONS, qvalues.shape\n", + "\n", + " print(\"Test passed!\")\n", + "\n", + "test_network_shapes(model=DQNetworkDueling(N_FRAMES_STACKED, N_ACTIONS))" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "def conv2d_size_out(size, kernel_size, stride):\n", - " \"\"\"\n", - " common use case:\n", - " cur_layer_img_w = conv2d_size_out(cur_layer_img_w, kernel_size, stride)\n", - " cur_layer_img_h = conv2d_size_out(cur_layer_img_h, kernel_size, stride)\n", - " to understand the shape for dense layer's input\n", - " \"\"\"\n", - " return (size - (kernel_size - 1) - 1) // stride + 1\n" + "**Now let's wrap our model into an Agent class.** \n", + "It will implement epsilon-greedy policy on numpy ndarrays." ] }, { @@ -479,52 +787,57 @@ "outputs": [], "source": [ "class DQNAgent(nn.Module):\n", - " def __init__(self, state_shape, n_actions, epsilon=0):\n", + " \"\"\"\n", + " Epsilon-greedy policy with a torch.nn.Module Q-value estimator.\n", + " \"\"\"\n", "\n", + " def __init__(self, q_network: nn.Module, epsilon=1) -> None:\n", " super().__init__()\n", " self.epsilon = epsilon\n", - " self.n_actions = n_actions\n", - " self.state_shape = state_shape\n", - "\n", - " # Define your network body here. 
Please make sure agent is fully contained here\n", - " # nn.Flatten() can be useful\n", - " \n", - " \n", + " self.q_network = q_network\n", "\n", " def forward(self, state_t):\n", " \"\"\"\n", " takes agent's observation (tensor), returns qvalues (tensor)\n", " :param state_t: a batch of 4-frame buffers, shape = [batch_size, 4, h, w]\n", " \"\"\"\n", - " # Use your network to compute qvalues for given state\n", - " qvalues = \n", - "\n", - " assert qvalues.requires_grad, \"qvalues must be a torch tensor with grad\"\n", - " assert len(\n", - " qvalues.shape) == 2 and qvalues.shape[0] == state_t.shape[0] and qvalues.shape[1] == n_actions\n", - "\n", + " # Use your network to compute qvalues for given state\n", + " qvalues = # your code\n", " return qvalues\n", "\n", - " def get_qvalues(self, states):\n", + " @torch.no_grad() # we don't need autograd here, so let's save the computations\n", + " def get_qvalues(self, states: np.ndarray) -> np.ndarray:\n", " \"\"\"\n", " like forward, but works on numpy arrays, not tensors\n", " \"\"\"\n", " model_device = next(self.parameters()).device\n", - " states = torch.tensor(states, device=model_device, dtype=torch.float)\n", - " qvalues = self.forward(states)\n", - " return qvalues.data.cpu().numpy()\n", + " states_pt = torch.tensor(\n", + " np.array(states), device=model_device, dtype=torch.float32\n", + " )\n", + " # Use your network to compute qvalues for given state\n", + " qvalues_pt = # your code\n", + " qvalues = qvalues_pt.data.cpu().numpy()\n", + " return qvalues\n", "\n", - " def sample_actions(self, qvalues):\n", - " \"\"\"pick actions given qvalues. Uses epsilon-greedy exploration strategy. \"\"\"\n", - " epsilon = self.epsilon\n", + " def sample_actions_by_qvalues(self, qvalues: np.ndarray, greedy: bool = False) -> np.ndarray:\n", + " \"\"\"pick actions given qvalues. 
Uses epsilon-greedy exploration strategy.\"\"\"\n", " batch_size, n_actions = qvalues.shape\n", + " # greedy_actions = # your code\n", + " greedy_actions = qvalues.argmax(axis=-1) # your code\n", + " if greedy:\n", + " return greedy_actions\n", + "\n", + " random_actions = np.random.randint(\"your code\")\n", + " should_explore = np.random.binomial(\"your code\")\n", + " epsilon_greedy_actions = np.where(\n", + " \"your code\"\n", + " )\n", + " return epsilon_greedy_actions\n", "\n", - " random_actions = np.random.choice(n_actions, size=batch_size)\n", - " best_actions = qvalues.argmax(axis=-1)\n", - "\n", - " should_explore = np.random.choice(\n", - " [0, 1], batch_size, p=[1-epsilon, epsilon])\n", - " return np.where(should_explore, random_actions, best_actions)" + " def sample_actions(self, states: np.ndarray) -> np.ndarray:\n", + " qvalues = self.get_qvalues(states)\n", + " actions = self.sample_actions_by_qvalues(qvalues)\n", + " return actions" ] }, { @@ -533,12 +846,30 @@ "metadata": {}, "outputs": [], "source": [ - "agent = DQNAgent(state_shape, n_actions, epsilon=0.5).to(device)" + "test_network_shapes(\n", + " model=DQNAgent(DQNetworkDueling(N_FRAMES_STACKED, N_ACTIONS))\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BUFMLKX1soUC" + }, + "outputs": [], + "source": [ + "agent = DQNAgent(\n", + " DQNetworkDueling(N_FRAMES_STACKED, N_ACTIONS),\n", + " epsilon=0.5\n", + ").to(device)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "XbsIT2EdsoUC" + }, "source": [ "Now let's try out our agent to see if it raises any errors." ] @@ -546,49 +877,49 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "pZR3qE2esoUC" + }, "outputs": [], "source": [ - "def evaluate(env, agent, n_games=1, greedy=False, t_max=10000):\n", + "def evaluate(env, agent, n_games=1, greedy=False, t_max=10000, seed=None):\n", " \"\"\" Plays n_games full games. 
If greedy, picks actions as argmax(qvalues). Returns mean reward. \"\"\"\n", " rewards = []\n", " for _ in range(n_games):\n", - " s = env.reset()\n", + " s, _ = env.reset(seed=seed)\n", " reward = 0\n", " for _ in range(t_max):\n", - " qvalues = agent.get_qvalues([s])\n", - " action = qvalues.argmax(axis=-1)[0] if greedy else agent.sample_actions(qvalues)[0]\n", - " s, r, done, _ = env.step(action)\n", + " action = agent.sample_actions(np.array(s)[None], greedy=greedy)[0]\n", + " s, r, terminated, truncated, _ = env.step(action)\n", " reward += r\n", - " if done:\n", + " if terminated or truncated:\n", " break\n", "\n", " rewards.append(reward)\n", - " return np.mean(rewards)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "evaluate(env, agent, n_games=1)" + " return np.mean(rewards)\n", + "\n", + "print(evaluate(env, agent, n_games=1, greedy=False))\n", + "print(evaluate(env, agent, n_games=1, greedy=True))" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "2BiRixA-soUC" + }, "source": [ "### Experience replay\n", - "For this assignment, we provide you with experience replay buffer. 
If you implemented experience replay buffer in last week's assignment, you can copy-paste it here **to get 2 bonus points**.\n", + "For this assignment, we provide you with experience replay buffer.\n", "\n", - "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/yet_another_week/_resource/exp_replay.png)" + "\n", + "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/yet_another_week/_resource/exp_replay.png)\n" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "jTBZo5BVsoUC" + }, "source": [ "#### The interface is fairly simple:\n", "* `exp_replay.add(obs, act, rw, next_obs, done)` - saves (s,a,r,s',done) tuple into the buffer\n", @@ -599,33 +930,112 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "Ydi0KK9LsoUC" + }, "outputs": [], "source": [ - "from replay_buffer import ReplayBuffer\n", + "from dqn.replay_buffer import ReplayBuffer\n", "exp_replay = ReplayBuffer(10)\n", "\n", "for _ in range(30):\n", - " exp_replay.add(env.reset(), env.action_space.sample(),\n", - " 1.0, env.reset(), done=False)\n", + " exp_replay.add(env.reset()[0], env.action_space.sample(), 1.0, env.reset()[0], done=False)\n", "\n", - "obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(\n", - " 5)\n", + "obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(5)\n", "\n", "assert len(exp_replay) == 10, \"experience replay size should be 10 because that's what maximum capacity is\"" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**The `play_and_record` function, defined below is the main way the agent will interact with the environment during training.**\n", + "\n", + "Previously we used to train RL algorithms on entire episodes. 
\n", + "\n", + "This time we keep the environment constantly running and will be getting small portions of interactions with it.\n", + "\n", + "The agent takes several actions (4 actions in [2] and [3]), the corresponding (s, a, r, s', terminated) tuples are put into the replay buffer. \n", + "Whenever an episode finishes (i.e. `truncated or terminated`), the environment is reset and the procedure continues as usual. \n", + "\n", + "To make the first step in a constantly running environment, the agent needs to know the state of the environment. This is the meaning of the `initial_state` argument of the function.\n", + "\n", + "It's worth noting, the agent does not train on the fresh tuples immediately. The agent trains on samples which are sampled from the buffer.\n", + "\n", + "**Implementation note:**\n", + "We define an `ActionSampler` protocol. The goal of it is to let the function `play_and_record` accept not only `DQNAgent` class instances, but any object that can sample actions. " + ] + }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "cEXv69KWsoUC" + }, "outputs": [], "source": [ - "def play_and_record(initial_state, agent, env, exp_replay, n_steps=1):\n", + "from typing import Protocol\n", + "\n", + "\n", + "class ActionSampler(Protocol):\n", + " \"\"\"\n", + " A protocol which defines a Callable which samples actions from states\n", + " \"\"\"\n", + "\n", + " def __call__(\n", + " self, state: gym.wrappers.stateful_observation.FrameStackObservation\n", + " ) -> int: ...\n", + "\n", + "\n", + "class RandomActionSampler:\n", + " \"\"\"\n", + " We will need this guy to fill the buffer with initial 50-200K observations from a random policy.\n", " \"\"\"\n", - " Play the game for exactly n steps, record every (s,a,r,s', done) to replay buffer. 
\n", - " Whenever game ends, add record with done=True and reset the game.\n", - " It is guaranteed that env has done=False when passed to this function.\n", + "\n", + " def __init__(self, action_space) -> None:\n", + " self.action_space = action_space\n", + "\n", + " def __call__(\n", + " self, state: gym.wrappers.stateful_observation.FrameStackObservation\n", + " ) -> int:\n", + " action = self.action_space.sample()\n", + " return action\n", + "\n", + "\n", + "class DqnActionSampler:\n", + " \"\"\"\n", + " DQNAgent works on batched np.ndarray inputs.\n", + " This class uses a DQNAgent to sample actions from single LazyFrames observations.\n", + "\n", + " This will be an epsilon-greedy sampler.\n", + " A greedy sampler can be defined as well, but we won't need it.\n", + " \"\"\"\n", + "\n", + " def __init__(self, agent: DQNAgent):\n", + " self.agent = agent\n", + "\n", + " def __call__(\n", + " self, state: gym.wrappers.stateful_observation.FrameStackObservation\n", + " ) -> int:\n", + " state_batched = np.array(state)[None]\n", + " action_batched = # \n", + " action = action_batched.item()\n", + " return action\n", + "\n", + "\n", + "@torch.no_grad()\n", + "def play_and_record(\n", + " initial_state: gym.wrappers.stateful_observation.FrameStackObservation,\n", + " action_sampler: ActionSampler,\n", + " env,\n", + " exp_replay,\n", + " n_steps=1,\n", + "):\n", + " \"\"\"\n", + " Play the game for exactly n_steps, record every (s,a,r,s', done) to replay buffer.\n", + " Whenever game ends due to termination or truncation, add record with done=terminated and reset the game.\n", + " It is guaranteed that env has terminated=False when passed to this function.\n", "\n", " PLEASE DO NOT RESET ENV UNLESS IT IS \"DONE\"\n", "\n", @@ -637,7 +1047,7 @@ " # Play the game for n_steps as per instructions above\n", " \n", "\n", - " return sum_rewards, s" + " return sum_rewards, s\n" ] }, { @@ -646,43 +1056,55 @@ "metadata": {}, "outputs": [], "source": [ - "# testing your 
code.\n", - "exp_replay = ReplayBuffer(2000)\n", - "\n", - "state = env.reset()\n", - "play_and_record(state, agent, env, exp_replay, n_steps=1000)\n", - "\n", - "# if you're using your own experience replay buffer, some of those tests may need correction.\n", - "# just make sure you know what your code does\n", - "assert len(exp_replay) == 1000, \"play_and_record should have added exactly 1000 steps, \"\\\n", - " \"but instead added %i\" % len(exp_replay)\n", - "is_dones = list(zip(*exp_replay._storage))[-1]\n", - "\n", - "assert 0 < np.mean(is_dones) < 0.1, \"Please make sure you restart the game whenever it is 'done' and record the is_done correctly into the buffer.\"\\\n", - " \"Got %f is_done rate over %i steps. [If you think it's your tough luck, just re-run the test]\" % (\n", - " np.mean(is_dones), len(exp_replay))\n", - "\n", - "for _ in range(100):\n", - " obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(\n", - " 10)\n", - " assert obs_batch.shape == next_obs_batch.shape == (10,) + state_shape\n", - " assert act_batch.shape == (\n", - " 10,), \"actions batch should have shape (10,) but is instead %s\" % str(act_batch.shape)\n", - " assert reward_batch.shape == (\n", - " 10,), \"rewards batch should have shape (10,) but is instead %s\" % str(reward_batch.shape)\n", - " assert is_done_batch.shape == (\n", - " 10,), \"is_done batch should have shape (10,) but is instead %s\" % str(is_done_batch.shape)\n", - " assert [int(i) in (0, 1)\n", - " for i in is_dones], \"is_done should be strictly True or False\"\n", - " assert [\n", - " 0 <= a < n_actions for a in act_batch], \"actions should be within [0, n_actions)\"\n", + "def test_play_and_record(action_sampler):\n", + " exp_replay = ReplayBuffer(10_000)\n", "\n", - "print(\"Well done!\")" + " state, _ = env.reset()\n", + " \n", + " # action_sampler = RandomActionSampler(env.action_space)\n", + " # action_sampler = DqnActionSampler(agent)\n", + " play_and_record(state, 
action_sampler, env, exp_replay, n_steps=1000);\n", + " \n", + " # if you're using your own experience replay buffer, some of those tests may need correction.\n", + " # just make sure you know what your code does\n", + " assert len(exp_replay) == 1000, \\\n", + " \"play_and_record should have added exactly 1000 steps, \" \\\n", + " \"but instead added %i\" % len(exp_replay)\n", + " is_dones = list(zip(*exp_replay._storage))[-1]\n", + " \n", + " assert 0 < np.mean(is_dones) < 0.1, \\\n", + " \"Please make sure you restart the game whenever it is 'done' and \" \\\n", + " \"record the is_done correctly into the buffer. Got %f is_done rate over \" \\\n", + " \"%i steps. [If you think it's your tough luck, just re-run the test]\" % (\n", + " np.mean(is_dones), len(exp_replay))\n", + " \n", + " for _ in range(100):\n", + " obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(10)\n", + " assert obs_batch.shape == next_obs_batch.shape == (10,) + STATE_SHAPE\n", + " assert act_batch.shape == (10,), \\\n", + " \"actions batch should have shape (10,) but is instead %s\" % str(act_batch.shape)\n", + " assert reward_batch.shape == (10,), \\\n", + " \"rewards batch should have shape (10,) but is instead %s\" % str(reward_batch.shape)\n", + " assert is_done_batch.shape == (10,), \\\n", + " \"is_done batch should have shape (10,) but is instead %s\" % str(is_done_batch.shape)\n", + " assert [int(i) in (0, 1) for i in is_dones], \\\n", + " \"is_done should be strictly True or False\"\n", + " assert [0 <= a < N_ACTIONS for a in act_batch], \"actions should be within [0, n_actions)\"\n", + " \n", + " print(\"Well done!\")\n", + "\n", + "\n", + "print(\"Random:\")\n", + "test_play_and_record(RandomActionSampler(env.action_space))\n", + "print(\"DQN:\")\n", + "test_play_and_record(DqnActionSampler(agent))" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "y5zyryPOsoUF" + }, "source": [ "### Target networks\n", "\n", @@ -701,14 
+1123,15 @@ "metadata": {}, "outputs": [], "source": [ - "target_network = DQNAgent(agent.state_shape, agent.n_actions, epsilon=0.5).to(device)\n", - "# This is how you can load weights from agent into target network\n", - "target_network.load_state_dict(agent.state_dict())" + "target_network = DQNetworkDueling(N_FRAMES_STACKED, N_ACTIONS).to(device)\n", + "target_network.load_state_dict(agent.q_network.state_dict())" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "2idY8QX0soUF" + }, "source": [ "### Learning with... Q-learning\n", "Here we write a function similar to `agent.update` from tabular q-learning." @@ -716,7 +1139,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "k46MPwwwsoUF" + }, "source": [ "Compute Q-learning TD error:\n", "\n", @@ -727,19 +1152,87 @@ "$$ Q_{reference}(s,a) = r(s,a) + \\gamma \\cdot max_{a'} Q_{target}(s', a') $$\n", "\n", "Where\n", - "* $Q_{target}(s',a')$ denotes q-value of next state and next action predicted by __target_network__\n", + "* $Q_{target}(s',a')$ denotes Q-value of next state and next action predicted by __target_network__\n", "* $s, a, r, s'$ are current state, action, reward and next state respectively\n", "* $\\gamma$ is a discount factor defined two cells above.\n", "\n", "\n", "__Note 1:__ there's an example input below. Feel free to experiment with it before you write the function.\n", "\n", - "__Note 2:__ compute_td_loss is a source of 99% of bugs in this homework. If reward doesn't improve, it often helps to go through it line by line [with a rubber duck](https://rubberduckdebugging.com/).\n", + "__Note 2:__ compute_td_loss is a major source of bugs in this homework. 
We tried to cover it with tests, but if reward doesn't improve, it often helps to go through it line by line [with a rubber duck](https://rubberduckdebugging.com/).\n", "\n", - "**Double DQN (+2 pts)**\n", + "**Double DQN**\n", "\n", "$$ Q_{reference}(s,a) = r(s, a) + \\gamma \\cdot\n", - "Q_{target}(s',argmax_{a'}Q_\\theta(s', a')) $$" + "Q_{target}(s',argmax_{a'}Q_\\theta(s', a')) $$\n", + "\n", + "We will use Double DQN for training, but **we ask you to implement both** of the methods to experience the difference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "V02HcUYasoUG", + "outputId": "7a11e3d3-d030-40be-8f14-59b5481749fb" + }, + "outputs": [], + "source": [ + "def compute_td_loss_on_tensors(\n", + " states: torch.Tensor, # (batch_size, *state_shape)\n", + " actions: torch.Tensor, # (batch_size,)\n", + " rewards: torch.Tensor, # (batch_size,)\n", + " next_states: torch.Tensor, # (batch_size, *state_shape)\n", + " is_done: torch.Tensor, # (batch_size,), torch.bool\n", + " agent: nn.Module,\n", + " target_network: nn.Module,\n", + " gamma: float = 0.99,\n", + " check_shapes=False,\n", + "):\n", + " predicted_qvalues = agent(states) # shape: [batch_size, n_actions]\n", + " assert is_done.dtype is torch.bool\n", + "\n", + " # compute q-values for all actions in next states\n", + " with torch.no_grad():\n", + " predicted_next_qvalues_target = # your code; # shape: [batch_size, n_actions]\n", + "\n", + " # select q-values for chosen actions\n", + " predicted_qvalues_for_actions = predicted_qvalues[\n", + " range(len(actions)), actions\n", + " ] # shape: [batch_size]\n", + "\n", + " # compute V*(next_states) using predicted next q-values\n", + " next_state_values = # your code\n", + "\n", + " if check_shapes:\n", + " assert (\n", + " next_state_values.dim() == 1\n", + " and next_state_values.shape[0] == states.shape[0]\n", + " ), \"must predict one value per 
state\"\n", + " assert not next_state_values.requires_grad\n", + "\n", + " # compute \"target q-values\" for loss - it's what's inside square parentheses in the above formula.\n", + " # at the last state use the simplified formula: Q(s,a) = r(s,a) since s' doesn't exist\n", + " target_qvalues_for_actions = # your code\n", + "\n", + " # mean squared error loss to minimize\n", + " loss = torch.mean((predicted_qvalues_for_actions - target_qvalues_for_actions) ** 2)\n", + "\n", + " if check_shapes:\n", + " assert (\n", + " predicted_next_qvalues_target.data.dim() == 2\n", + " ), \"make sure you predicted q-values for all actions in next state\"\n", + " assert (\n", + " next_state_values.data.dim() == 1\n", + " ), \"make sure you computed V(s') as maximum over just the actions axis and not all axes\"\n", + " assert (\n", + " target_qvalues_for_actions.data.dim() == 1\n", + " ), \"there's something wrong with target q-values, they must be a vector\"\n", + "\n", + " return loss" ] }, { @@ -748,67 +1241,89 @@ "metadata": {}, "outputs": [], "source": [ - "def compute_td_loss(states, actions, rewards, next_states, is_done,\n", - " agent, target_network,\n", - " gamma=0.99,\n", - " check_shapes=False,\n", - " device=device):\n", - " \"\"\" Compute td loss using torch operations only. Use the formulae above. 
\"\"\"\n", - " states = torch.tensor(states, device=device, dtype=torch.float) # shape: [batch_size, *state_shape]\n", - "\n", - " # for some torch reason should not make actions a tensor\n", - " actions = torch.tensor(actions, device=device, dtype=torch.long) # shape: [batch_size]\n", - " rewards = torch.tensor(rewards, device=device, dtype=torch.float) # shape: [batch_size]\n", - " # shape: [batch_size, *state_shape]\n", - " next_states = torch.tensor(next_states, device=device, dtype=torch.float)\n", - " is_done = torch.tensor(\n", - " is_done.astype('float32'),\n", - " device=device,\n", - " dtype=torch.float\n", - " ) # shape: [batch_size]\n", - " is_not_done = 1 - is_done\n", + "from test_td_loss.compute_td_loss import test_is_done_is_used, test_compute_td_loss_vanilla\n", "\n", - " # get q-values for all actions in current states\n", - " predicted_qvalues = agent(states)\n", + "test_compute_td_loss_vanilla(compute_td_loss_on_tensors)\n", + "print(\"Well done!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_td_loss_on_tensors_double(\n", + " states: torch.Tensor, # (batch_size, *state_shape)\n", + " actions: torch.Tensor, # (batch_size,)\n", + " rewards: torch.Tensor, # (batch_size,)\n", + " next_states: torch.Tensor, # (batch_size, *state_shape)\n", + " is_done: torch.Tensor, # (batch_size,), torch.bool\n", + " agent: nn.Module,\n", + " target_network: nn.Module,\n", + " gamma: float = 0.99,\n", + " check_shapes=False,\n", + "):\n", + " predicted_qvalues = agent(states) # shape: [batch_size, n_actions]\n", + " assert is_done.dtype is torch.bool\n", "\n", " # compute q-values for all actions in next states\n", - " predicted_next_qvalues = target_network(next_states)\n", - " \n", + " with torch.no_grad():\n", + " predicted_next_qvalues_target = # your code; # shape: [batch_size, n_actions]\n", + "\n", " # select q-values for chosen actions\n", - " predicted_qvalues_for_actions = 
predicted_qvalues[range(\n", - " len(actions)), actions]\n", + " predicted_qvalues_for_actions = predicted_qvalues[\n", + " range(len(actions)), actions\n", + " ] # shape: [batch_size]\n", "\n", " # compute V*(next_states) using predicted next q-values\n", - " next_state_values = \n", + " next_state_values = # your code\n", "\n", - " assert next_state_values.dim(\n", - " ) == 1 and next_state_values.shape[0] == states.shape[0], \"must predict one value per state\"\n", + " if check_shapes:\n", + " assert (\n", + " next_state_values.dim() == 1\n", + " and next_state_values.shape[0] == states.shape[0]\n", + " ), \"must predict one value per state\"\n", + " assert not next_state_values.requires_grad\n", "\n", " # compute \"target q-values\" for loss - it's what's inside square parentheses in the above formula.\n", " # at the last state use the simplified formula: Q(s,a) = r(s,a) since s' doesn't exist\n", - " # you can multiply next state values by is_not_done to achieve this.\n", - " target_qvalues_for_actions = \n", + " target_qvalues_for_actions = # your code\n", "\n", " # mean squared error loss to minimize\n", - " loss = torch.mean((predicted_qvalues_for_actions -\n", - " target_qvalues_for_actions.detach()) ** 2)\n", + " loss = torch.mean((predicted_qvalues_for_actions - target_qvalues_for_actions) ** 2)\n", "\n", " if check_shapes:\n", - " assert predicted_next_qvalues.data.dim(\n", - " ) == 2, \"make sure you predicted q-values for all actions in next state\"\n", - " assert next_state_values.data.dim(\n", - " ) == 1, \"make sure you computed V(s') as maximum over just the actions axis and not all axes\"\n", - " assert target_qvalues_for_actions.data.dim(\n", - " ) == 1, \"there's something wrong with target q-values, they must be a vector\"\n", + " assert (\n", + " predicted_next_qvalues_target.data.dim() == 2\n", + " ), \"make sure you predicted q-values for all actions in next state\"\n", + " assert (\n", + " next_state_values.data.dim() == 1\n", + " ), 
\"make sure you computed V(s') as maximum over just the actions axis and not all axes\"\n", + " assert (\n", + " target_qvalues_for_actions.data.dim() == 1\n", + " ), \"there's something wrong with target q-values, they must be a vector\"\n", "\n", " return loss" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from test_td_loss.compute_td_loss import test_compute_td_loss_double\n", + "\n", + "test_compute_td_loss_double(compute_td_loss_on_tensors_double)\n", + "print(\"Well done!\")" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Sanity checks" + "**The following function works on np.ndarrays: it converts its inputs to torch.Tensors and calls the torch-tensor function**" ] }, { @@ -817,24 +1332,88 @@ "metadata": {}, "outputs": [], "source": [ - "obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(\n", - " 10)\n", + "def compute_td_loss(\n", + " states,\n", + " actions,\n", + " rewards,\n", + " next_states,\n", + " is_done,\n", + " agent,\n", + " target_network,\n", + " gamma=0.99,\n", + " check_shapes=False,\n", + " device=None,\n", + " tensor_loss_evaluator=compute_td_loss_on_tensors_double,\n", + "):\n", + " \"\"\"Compute td loss using torch operations only. 
Use the formulae above.\"\"\"\n", + "\n", + " if device is None:\n", + " device = next(agent.parameters()).device\n", + " states = torch.tensor(\n", + " states, device=device, dtype=torch.float32\n", + " ) # shape: [batch_size, *state_shape]\n", + " actions = torch.tensor(\n", + " actions, device=device, dtype=torch.int64\n", + " ) # shape: [batch_size]\n", + " rewards = torch.tensor(\n", + " rewards, device=device, dtype=torch.float32\n", + " ) # shape: [batch_size]\n", + " # shape: [batch_size, *state_shape]\n", + " next_states = torch.tensor(next_states, device=device, dtype=torch.float)\n", + " is_done = torch.tensor(\n", + " is_done, device=device, dtype=torch.bool\n", + " ) # shape: [batch_size]\n", + "\n", + " return tensor_loss_evaluator(\n", + " states=states,\n", + " actions=actions,\n", + " rewards=rewards,\n", + " next_states=next_states,\n", + " is_done=is_done,\n", + " agent=agent,\n", + " target_network=target_network,\n", + " gamma=gamma,\n", + " check_shapes=check_shapes,\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "x8AvquAtsoUG" + }, + "source": [ + "Sanity checks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5nRoOn30soUG" + }, + "outputs": [], + "source": [ + "obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(10)\n", "\n", "loss = compute_td_loss(obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch,\n", " agent, target_network,\n", " gamma=0.99, check_shapes=True)\n", "loss.backward()\n", "\n", - "assert loss.requires_grad and tuple(loss.data.size()) == (\n", - " ), \"you must return scalar loss - mean over batch\"\n", - "assert np.any(next(agent.parameters()).grad.data.cpu().numpy() !=\n", - " 0), \"loss must be differentiable w.r.t. 
network weights\"\n", - "assert np.all(next(target_network.parameters()).grad is None), \"target network should not have grads\"" + "assert loss.requires_grad and tuple(loss.data.size()) == (), \\\n", + " \"you must return scalar loss - mean over batch\"\n", + "assert np.any(next(agent.parameters()).grad.data.cpu().numpy() != 0), \\\n", + " \"loss must be differentiable w.r.t. network weights\"\n", + "assert np.all(next(target_network.parameters()).grad is None), \\\n", + " \"target network should not have grads\"" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "KIplL0hSsoUG" + }, "source": [ "## Main loop (3 pts)\n", "\n", @@ -847,10 +1426,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "-JV-ulB-soUG" + }, "outputs": [], "source": [ - "from tqdm import trange\n", + "from tqdm.auto import trange\n", "from IPython.display import clear_output\n", "import matplotlib.pyplot as plt" ] @@ -861,7 +1442,9 @@ "metadata": {}, "outputs": [], "source": [ - "seed = \n", + "import random\n", + "\n", + "seed = # your favourite seed\n", "random.seed(seed)\n", "np.random.seed(seed)\n", "torch.manual_seed(seed)" @@ -870,46 +1453,73 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-eurxA-_soUG", + "outputId": "4eb73eb0-771c-442e-a086-50bae3ebd9d8" + }, "outputs": [], "source": [ - "env = make_env(seed)\n", - "state_shape = env.observation_space.shape\n", - "n_actions = env.action_space.n\n", - "state = env.reset()\n", + "env = make_final_env()\n", + "\n", + "state, _ = env.reset(seed=seed)\n", "\n", - "agent = DQNAgent(state_shape, n_actions, epsilon=1).to(device)\n", - "target_network = DQNAgent(state_shape, n_actions).to(device)\n", - "target_network.load_state_dict(agent.state_dict())" + "agent = DQNAgent(\n", + " DQNetworkDueling(N_FRAMES_STACKED, N_ACTIONS),\n", + " epsilon=1\n", + ").to(device)\n", + 
"target_network = DQNetworkDueling(N_FRAMES_STACKED, N_ACTIONS).to(device)\n", + "target_network.load_state_dict(agent.q_network.state_dict())\n", + "\n", + "action_sampler = DqnActionSampler(agent)\n", + "action_sampler_random = RandomActionSampler(env.action_space)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "WZg25kIasoUG" + }, "source": [ - "Buffer of size $10^4$ fits into 5 Gb RAM.\n", + "Buffer of size $10^4$ can probably pass the threshold for this assignment.\n", "\n", - "Larger sizes ($10^5$ and $10^6$ are common) can be used. It can improve the learning, but $10^4$ is quiet enough. $10^2$ will probably fail learning." + "Larger sizes ($10^5$ and $10^6$ are common) can show a much higher score." ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hWyMxfN4soUG", + "outputId": "a0d4147b-56b2-4f69-802a-0da87ad82bdb", + "scrolled": true + }, "outputs": [], "source": [ - "exp_replay = ReplayBuffer(10**4)\n", - "for i in range(100):\n", - " if not utils.is_enough_ram(min_available_gb=0.1):\n", + "from dqn.utils import is_enough_ram\n", + "\n", + "REPLAY_BUFFER_SIZE = 10**6\n", + "# INITIAL_BUFFER_FILL = 50_000 # Nature DQN Extended Data Table 1\n", + "INITIAL_BUFFER_FILL = 200_000 # Rainbow without prioritization\n", + "_n_steps = 100\n", + "\n", + "exp_replay = ReplayBuffer(REPLAY_BUFFER_SIZE)\n", + "for i in trange(INITIAL_BUFFER_FILL // _n_steps):\n", + " if not is_enough_ram(min_available_gb=0.1):\n", " print(\"\"\"\n", - " Less than 100 Mb RAM available. 
\n", + " Less than 100 Mb RAM available.\n", " Make sure the buffer size in not too huge.\n", " Also check, maybe other processes consume RAM heavily.\n", " \"\"\"\n", " )\n", " break\n", - " play_and_record(state, agent, env, exp_replay, n_steps=10**2)\n", - " if len(exp_replay) == 10**4:\n", + " play_and_record(state, action_sampler_random, env, exp_replay, n_steps=_n_steps)\n", + " if len(exp_replay) >= INITIAL_BUFFER_FILL:\n", " break\n", "print(len(exp_replay))" ] @@ -920,21 +1530,32 @@ "metadata": {}, "outputs": [], "source": [ - "timesteps_per_epoch = 1\n", - "batch_size = 16\n", - "total_steps = 3 * 10**6\n", - "decay_steps = 10**6\n", + "len(exp_replay)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_ca9vbW4soUG" + }, + "outputs": [], + "source": [ + "update_frequency = 4 # n_steps for play_and_record; Nature DQN Extended Data Table 1 + Rainbow Table 4: Additional hyper-parameters\n", + "batch_size = 32 # Nature DQN Extended Data Table 1 + Table 4: Additional hyper-parameters\n", + "total_steps = 10 * 10**6 # this can be long, feel free to stop the training when the target score is reached\n", + "decay_steps = 10**6 # Nature DQN Extended Data Table 1\n", "\n", - "opt = torch.optim.Adam(agent.parameters(), lr=1e-4)\n", + "opt = torch.optim.Adam(agent.parameters(), lr=6.25e-05, eps=1.4e-4) # Rainbow\n", "\n", - "init_epsilon = 1\n", - "final_epsilon = 0.1\n", + "init_epsilon = 1 # Nature DQN\n", + "final_epsilon = 0.1 # Nature DQN\n", "\n", - "loss_freq = 50\n", - "refresh_target_network_freq = 5000\n", - "eval_freq = 5000\n", + "loss_freq = 100\n", + "refresh_target_network_freq = 10_000 # Nature DQN\n", + "eval_freq = 10_000\n", "\n", - "max_grad_norm = 50\n", + "max_grad_norm = 10 # Dueling DQN\n", "\n", "n_lives = 5" ] @@ -942,98 +1563,149 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "oJWs0q-6soUG" + }, "outputs": [], "source": [ - "mean_rw_history = []\n", - 
"td_loss_history = []\n", - "grad_norm_history = []\n", - "initial_state_v_history = []\n", "step = 0" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "675-JU0hsoUG" + }, "outputs": [], "source": [ - "state = env.reset()\n", - "for step in trange(step, total_steps + 1):\n", - " if not utils.is_enough_ram():\n", - " print('less that 100 Mb RAM available, freezing')\n", - " print('make sure everythin is ok and make KeyboardInterrupt to continue')\n", - " try:\n", - " while True:\n", - " pass\n", - " except KeyboardInterrupt:\n", - " pass\n", + "import time\n", "\n", - " agent.epsilon = utils.linear_decay(init_epsilon, final_epsilon, step, decay_steps)\n", - "\n", - " # play\n", - " _, state = play_and_record(state, agent, env, exp_replay, timesteps_per_epoch)\n", + "def wait_for_keyboard_interrupt():\n", + " try:\n", + " while True:\n", + " time.sleep(1)\n", + " except KeyboardInterrupt:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dqn.utils import linear_decay, is_enough_ram\n", + "from torch.utils.tensorboard import SummaryWriter\n", + "from dqn.logger import Logger" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "use_tensorboard = True # Set to False to use simple matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "logger = Logger(use_tensorboard=use_tensorboard)\n", "\n", - " # train\n", - " \n", + "if use_tensorboard:\n", + " %load_ext tensorboard\n", + " %tensorboard --logdir runs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "FgQ1vK3CsoUG", + "scrolled": true + }, + "outputs": [], + "source": [ + "env = make_final_env()\n", + "state, _ = env.reset()\n", "\n", - " loss = \n", + "with trange(step, total_steps + 1) as progress_bar:\n", + " for step 
in progress_bar:\n", + " if not is_enough_ram():\n", + " print('Less than 100 MB RAM available, freezing.')\n", + " print('Ensure everything is okay and use KeyboardInterrupt to continue.')\n", + " wait_for_keyboard_interrupt()\n", "\n", - " loss.backward()\n", - " grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)\n", - " opt.step()\n", - " opt.zero_grad()\n", + " agent.epsilon = linear_decay(init_epsilon, final_epsilon, step, decay_steps)\n", "\n", - " if step % loss_freq == 0:\n", - " td_loss_history.append(loss.data.cpu().item())\n", - " grad_norm_history.append(grad_norm)\n", + " # Play\n", + " _, state = play_and_record(state, action_sampler, env, exp_replay, n_steps=update_frequency)\n", "\n", - " if step % refresh_target_network_freq == 0:\n", - " # Load agent weights into target_network\n", - " \n", + " # Train\n", + " s, a, r, s_next, done = exp_replay.sample(batch_size)\n", + " # \n", "\n", - " if step % eval_freq == 0:\n", - " mean_rw_history.append(evaluate(\n", - " make_env(clip_rewards=True, seed=step), agent, n_games=3 * n_lives, greedy=True)\n", - " )\n", - " initial_state_q_values = agent.get_qvalues(\n", - " [make_env(seed=step).reset()]\n", - " )\n", - " initial_state_v_history.append(np.max(initial_state_q_values))\n", + " loss = compute_td_loss(s, a, r, s_next, done, agent, target_network, device=device) # Ensure 'device' is defined\n", "\n", - " clear_output(True)\n", - " print(\"buffer size = %i, epsilon = %.5f\" %\n", - " (len(exp_replay), agent.epsilon))\n", + " loss.backward()\n", + " grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)\n", + " opt.step()\n", + " opt.zero_grad()\n", "\n", - " plt.figure(figsize=[16, 9])\n", + " if step % loss_freq == 0:\n", + " loss_value = loss.data.cpu().item()\n", + " grad_norm_value = grad_norm.cpu().item()\n", + " logger.log_loss(loss_value, step)\n", + " logger.log_grad_norm(grad_norm_value, step)\n", "\n", - " plt.subplot(2, 2, 1)\n", - " plt.title(\"Mean 
reward per life\")\n", - " plt.plot(mean_rw_history)\n", - " plt.grid()\n", + " if step % refresh_target_network_freq == 0:\n", + " # Load agent weights into target_network\n", + " target_network.load_state_dict(agent.q_network.state_dict())\n", + " torch.save(agent.state_dict(), \"last_state_dict.pt\")\n", "\n", - " assert not np.isnan(td_loss_history[-1])\n", - " plt.subplot(2, 2, 2)\n", - " plt.title(\"TD loss history (smoothened)\")\n", - " plt.plot(utils.smoothen(td_loss_history))\n", - " plt.grid()\n", + " if step % eval_freq == 0:\n", + " mean_reward = evaluate(\n", + " make_final_env(), agent, n_games=3 * n_lives, greedy=True, seed=step\n", + " )\n", + " initial_state_q_values = agent.get_qvalues(\n", + " [make_final_env().reset(seed=step)[0]]\n", + " )\n", + " initial_v = np.max(initial_state_q_values).item()\n", "\n", - " plt.subplot(2, 2, 3)\n", - " plt.title(\"Initial state V\")\n", - " plt.plot(initial_state_v_history)\n", - " plt.grid()\n", + " logger.log_mean_reward(mean_reward, step)\n", + " logger.log_initial_state_v(initial_v, step)\n", "\n", - " plt.subplot(2, 2, 4)\n", - " plt.title(\"Grad norm history (smoothened)\")\n", - " plt.plot(utils.smoothen(grad_norm_history))\n", - " plt.grid()\n", + " clear_output(True)\n", + " print(\"Buffer size = %i, Epsilon = %.5f\" % (len(exp_replay), agent.epsilon))\n", "\n", - " plt.show()" + " if not use_tensorboard:\n", + " # If using matplotlib, plot the metrics\n", + " logger.plot()" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], + "source": [ + "agent = DQNAgent(\n", + " DQNetworkDueling(N_FRAMES_STACKED, N_ACTIONS),\n", + " epsilon=1\n", + ").to(device)\n", + "agent.load_state_dict(torch.load(\"last_state_dict.pt\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZEDQhQrdsoUG" + }, "source": [ "Agent is evaluated for 1 life, not for a whole episode of 5 lives. Rewards in evaluation are also truncated. 
Cuz this is what environment the agent is learning in and in this way mean rewards per life can be compared with initial state value\n", "\n", @@ -1042,7 +1714,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "s0jLjYGwsoUG" + }, "source": [ "Final scoring is done on a whole episode with all 5 lives." ] @@ -1050,28 +1724,32 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "xTGVrwwQsoUG" + }, "outputs": [], "source": [ "final_score = evaluate(\n", - " make_env(clip_rewards=False, seed=9),\n", - " agent, n_games=30, greedy=True, t_max=10 * 1000\n", - ") * n_lives\n", + " make_final_env(),\n", + " agent, n_games=30, greedy=True, t_max=10 * 1000, seed=9\n", + ")\n", "print('final score:', final_score)\n", - "assert final_score >= 15, 'not as cool as DQN can'\n", + "assert final_score >= 3, 'not as cool as DQN can'\n", "print('Cool!')" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "ovaG8N4lsoUH" + }, "source": [ "## How to interpret plots:\n", "\n", - "This aint no supervised learning so don't expect anything to improve monotonously. \n", + "This aint no supervised learning so don't expect anything to improve monotonously.\n", "* **TD loss** is the MSE between agent's current Q-values and target Q-values. It may slowly increase or decrease, it's ok. The \"not ok\" behavior includes going NaN or stayng at exactly zero before agent has perfect performance.\n", "* **grad norm** just shows the intensivity of training. Not ok is growing to values of about 100 (or maybe even 50) though it depends on network architecture.\n", - "* **mean reward** is the expected sum of r(s,a) agent gets over the full game session. It will oscillate, but on average it should get higher over time (after a few thousand iterations...). \n", + "* **mean reward** is the expected sum of r(s,a) agent gets over the full game session. 
It will oscillate, but on average it should get higher over time (after a few thousand iterations...).\n", " * In basic q-learning implementation it takes about 40k steps to \"warm up\" agent before it starts to get better.\n", "* **Initial state V** is the expected discounted reward for episode in the oppinion of the agent. It should behave more smoothly than **mean reward**. It should get higher over time but sometimes can experience drawdowns because of the agaent's overestimates.\n", "* **buffer size** - this one is simple. It should go up and cap at max size.\n", @@ -1088,11 +1766,13 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "kVV72AB-soUH" + }, "source": [ "## About hyperparameters:\n", "\n", - "The task has something in common with supervised learning: loss is optimized through the buffer (instead of Train dataset). But the distribution of states and actions in the buffer **is not stationary** and depends on the policy that generated it. It can even happen that the mean TD error across the buffer is very low but the performance is extremely poor (imagine the agent collecting data to the buffer always manages to avoid the ball).\n", + "The task has something in common with supervised learning: loss is optimized through the buffer (instead of Train dataset). But the distribution of states and actions in the buffer **is not stationary** and depends on the policy it was generated by. It can even happen that the mean TD error across the buffer is very low but the performance is extremely poor (imagine the agent collecting data to the buffer always manages to avoid the ball).\n", "\n", "* Total timesteps and training time: It seems to be so huge, but actually it is normal for RL.\n", "\n", @@ -1102,15 +1782,14 @@ "\n", "* lr: $10^{-3}$ would probably be too huge\n", "\n", - "* batch size: This one can be very important: if it is too small the agent can fail to learn. Huge batch takes more time to process. 
If batch of size 8 can not be processed on the hardware you use take 2 (or even 4) batches of size 4, divide the loss on them by 2 (or 4) and make optimization step after both backward() calls in torch.\n", - "\n", - "* target network update frequency: has something in common with learning rate. Too frequent updates can lead to divergence. Too rare can lead to slow leraning. For millions of total timesteps thousands of inner steps seem ok. One iteration of target network updating is an iteration of the (this time approximate) $\\gamma$-compression that stands behind Q-learning. The more inner steps it makes the more accurate is the compression.\n", - "* max_grad_norm - just huge enough. In torch clip_grad_norm also evaluates the norm before clipping and it can be convenient for logging." + "* target network update frequency: has something in common with learning rate. Too frequent updates can lead to divergence. Too infrequent updates can lead to slow learning. For millions of total timesteps thousands of inner steps seem ok. One iteration of target network updating is an iteration of the (this time approximate) $\\gamma$-compression that stands behind Q-learning. The more inner steps it makes the more accurate is the compression."
] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "Plp8WC_esoUH" + }, "source": [ "### Video" ] @@ -1118,58 +1797,64 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "DdExc_AssoUH" + }, "outputs": [], "source": [ - "# Record sessions\n", + "# record sessions\n", + "from gymnasium.wrappers import RecordVideo\n", "\n", - "import gym.wrappers\n", - "\n", - "with gym.wrappers.Monitor(make_env(), directory=\"videos\", force=True) as env_monitor:\n", - " sessions = [evaluate(env_monitor, agent, n_games=n_lives, greedy=True) for _ in range(10)]" + "with make_final_env() as env, RecordVideo(\n", + " env=env, video_folder=\"./videos\", episode_trigger=lambda episode_number: True\n", + ") as env_monitor:\n", + " sessions = [\n", + " evaluate(env_monitor, agent, n_games=n_lives, greedy=True) for _ in range(10)\n", + " ]\n" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "lt6xg1n_soUH" + }, "outputs": [], "source": [ "# Show video. This may not work in some setups. 
If it doesn't\n", "# work for you, you can download the videos and view them locally.\n", "\n", "from pathlib import Path\n", + "from base64 import b64encode\n", "from IPython.display import HTML\n", "\n", - "video_names = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", + "video_paths = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", + "video_path = video_paths[-1] # You can also try other indices\n", + "\n", + "if 'google.colab' in sys.modules:\n", + " # https://stackoverflow.com/a/57378660/1214547\n", + " with video_path.open('rb') as fp:\n", + " mp4 = fp.read()\n", + " data_url = 'data:video/mp4;base64,' + b64encode(mp4).decode()\n", + "else:\n", + " data_url = str(video_path)\n", "\n", "HTML(\"\"\"\n", "\n", - "\"\"\".format(video_names[-1])) # You can also try other indices" + "\"\"\".format(data_url))" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "fLPx2aI7soUH" + }, "source": [ - "## Let's have a closer look at this.\n", + "## Let's have a closer look at this. Interpretation (2 pts).\n", "\n", - "If average episode score is below 200 using all 5 lives, then probably DQN has not converged fully. But anyway let's make a more complete record of an episode." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "eval_env = make_env(clip_rewards=False)\n", - "record = utils.play_and_log_episode(eval_env, agent)\n", - "print('total reward for life:', np.sum(record['rewards']))\n", - "for key in record:\n", - " print(key)" + "Let's play 5 episodes (note that the game has 5 lives) and log some stats:" ] }, { @@ -1178,60 +1863,52 @@ "metadata": {}, "outputs": [], "source": [ - "fig = plt.figure(figsize=(5, 5))\n", - "ax = fig.add_subplot(1, 1, 1)\n", + "from dqn.analysis import play_and_log_episode\n", "\n", - "ax.scatter(record['v_mc'], record['v_agent'])\n", - "ax.plot(sorted(record['v_mc']), sorted(record['v_mc']),\n", - " 'black', linestyle='--', label='x=y')\n", + "env = make_final_env()\n", + "stats = play_and_log_episode(env, agent)\n", "\n", - "ax.grid()\n", - "ax.legend()\n", - "ax.set_title('State Value Estimates')\n", - "ax.set_xlabel('Monte-Carlo')\n", - "ax.set_ylabel('Agent')\n", - "\n", - "plt.show()" + "print(\"Keys:\", list(stats.keys()))\n", + "print(\"Shapes:\")\n", + "for key in [\"states\", \"qvalues\", \"actions\", \"rewards\"]:\n", + " print(f\"{key}: {stats[key].shape}\")\n", + "print(\"terminated:\", stats[\"episode_finished\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "$\\hat V_{Monte-Carlo}(s_t) = \\sum_{\\tau=0}^{episode~end} \\gamma^{\\tau-t}r_t$" + "Let's plot rewards:" ] }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Is there a big bias? It's ok, anyway it works." 
- ] - }, - { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "## Bonus I (2 pts)" + "plt.plot(stats[\"rewards\"])\n", + "plt.grid()\n", + "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**1.** Plot several (say 3) states with high and low spreads of Q estimate by actions i.e.\n", - "$$\\max_a \\hat Q(s,a) - \\min_a \\hat Q(s,a)\\$$\n", - "Please take those states from different episodes to make sure that the states are really different.\n", - "\n", - "What should high and low spread mean at least in the world of perfect Q-fucntions?\n", + "Your task is to evaluate the following quantities from the logs:\n", + "1. Discounted returns: $G[t] = \\sum_{t'=t}^T \\gamma ^ {t' - t}r[t']$, where $T$ is the total time of an episode.\n", + "2. State Values estimated by the agent: $V_{agent}[t] = \\max_{a}Q_{agent}(s[t], a)$. \n", + "3. Q-spread: $\\Delta Q[t] = \\max_{a}Q_{agent}(s[t], a) - \\min_{a}Q_{agent}(s[t], a)$\n", "\n", - "Comment the states you like most.\n", + "Create a new env: `env = make_final_env()`, play for 5 episodes (a full game has 5 lives, so it will be 1 full game). \n", + "Plot rewards and the evaluated quantities for each of them. \n", + "Using the plots, can you find points where the ball hits the wall? \n", + "Where the ball hits the platform? \n", + "Probably, you won't need all the quantities for that, but it's still good to check the behavior of a model.\n", "\n", - "**2.** Plot several (say 3) states with high td-error and several states with high values of\n", - "$$| \\hat V_{Monte-Carlo}(s) - \\hat V_{agent}(s)|,$$ \n", - "$$\\hat V_{agent}(s)=\\max_a \\hat Q(s,a).$$ Please take those states from different episodes to make sure that the states are really different. From what part (i.e. beginning, middle, end) of an episode did these states come from?\n", - "\n", - "Comment the states you like most."
+ "The `merge_frame_stack_to_plot` function may be useful." ] }, { @@ -1240,75 +1917,91 @@ "metadata": {}, "outputs": [], "source": [ - "from utils import play_and_log_episode, img_by_obs\n", + "def get_discounted_returns(rewards: Reversible[float], gamma: float) -> list[float]:\n", + " \"\"\"\n", + " Calculates G[t] for each t, given rewards and gamma.\n", "\n", - "" + " Tip: Iterate backwards through rewards and use the following relationship:\n", + " G[t] = r[t] + gamma * G[t + 1]\n", + " \"\"\"\n", + " returns_reverse: list[float] = []\n", + " last_return: float = 0\n", + " for r in reversed(rewards):\n", + " # your code\n", + " returns = list(reversed(returns_reverse))\n", + " return returns\n" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "e36bU0u8soUH" + }, "source": [ - "## Bonus II (1-5 pts). Get High Score!\n", + "## Bonus I (2 pts). Get High Score!\n", "\n", - "1 point to you for each 50 points of your agent. Truncated by 5 points. Starting with 50 points, **not** 50 + threshold.\n", - "\n", - "One way is to train for several days and use heavier hardware (why not actually).\n", - "\n", - "Another way is to apply modifications (see **Bonus III**)." + "Get mean reward per life:\n", + "1. $\\geq 20$: +1 pt\n", + "2. $\\geq 40$: +2 pts" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "78e4nRoSsoUH" + }, "source": [ - "## Bonus III (2+ pts). Apply modifications to DQN.\n", - "\n", - "For inspiration see [Rainbow](https://arxiv.org/abs/1710.02298) - a version of q-learning that combines lots of them.\n", + "## Bonus II (2+ pts). Apply modifications to DQN.\n", "\n", - "Points for Bonus II and Bonus III fully stack. So if modified agent gets score 250+ you get 5 pts for Bonus II + points for modifications. 
If the final score is 40 then you get the points for modifications.\n", - "\n", - "\n", - "Some modifications:\n", - "* [Prioritized experience replay](https://arxiv.org/abs/1511.05952) (5 pts for your own implementation, 3 pts for using a ready one)\n", - "* [double q-learning](https://arxiv.org/abs/1509.06461) (2 pts)\n", - "* [dueling q-learning](https://arxiv.org/abs/1511.06581) (2 pts)\n", - "* multi-step heuristics (see [Rainbow](https://arxiv.org/abs/1710.02298)) (3 pts)\n", - "* [Noisy Nets](https://arxiv.org/abs/1706.10295) (3 pts)\n", - "* [distributional RL](https://arxiv.org/abs/1707.06887)(distributional and distributed stand for different things here) (5 pts)\n", + "* [Prioritized experience replay](https://arxiv.org/abs/1511.05952) (2 pts, please check for effective open-source implementations)\n", + "* [Noisy Nets](https://arxiv.org/abs/1706.10295) (2 pts, please, pay attention to the exploration policy and whether epsilon-greedy policy is used)\n", + "* [distributional RL](https://arxiv.org/abs/1707.06887)(distributional and distributed stand for different things here) (3 pts)\n", "* Other modifications (2+ pts depending on complexity)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "j_3RXboysoUH" + }, "source": [ - "## Bonus IV (4+ pts). Distributed RL.\n", + "## Bonus III (2 pts). Distributed RL.\n", "\n", - "Solve the task in a distributed way. It can strongly speed up learning. See [article](https://arxiv.org/pdf/1602.01783.pdf) or some guides." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**As usual bonus points for all the tasks fully stack.**" + "https://gymnasium.farama.org/api/vector/#gymnasium.vector.AsyncVectorEnv" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "f9X5aB56soUI" + }, "outputs": [], "source": [] } ], "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "pygments_lexer": "ipython3" + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } diff --git a/week04_approx_rl/homework_tf.ipynb b/week04_approx_rl/homework_tf.ipynb index b7023c487..c95dc2355 100644 --- a/week04_approx_rl/homework_tf.ipynb +++ b/week04_approx_rl/homework_tf.ipynb @@ -20,10 +20,12 @@ " %tensorflow_version 1.x\n", " \n", " if not os.path.exists('.setup_complete'):\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", "\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/week04_approx_rl/framebuffer.py\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/week04_approx_rl/replay_buffer.py\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/framebuffer.py\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/replay_buffer.py\n", + " !wget -q 
https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/atari_wrappers.py\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/utils.py\n", "\n", " !touch .setup_complete\n", "\n", @@ -41,6 +43,17 @@ "__Frameworks__ - we'll accept this homework in any deep learning framework. This particular notebook was designed for tensorflow, but you will find it easy to adapt it to almost any python-based deep learning framework." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "import numpy as np\n", + "import utils" + ] + }, { "cell_type": "code", "execution_count": null, @@ -49,7 +62,6 @@ "source": [ "import gym\n", "import numpy as np\n", - "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline" ] @@ -64,6 +76,50 @@ "This time we're gonna apply approximate q-learning to an Atari game called Breakout. It's not the hardest thing out there, but it's definitely way more complex than anything we tried before.\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ENV_NAME = \"BreakoutNoFrameskip-v4\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocessing (3 pts)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see what observations look like." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "env = gym.make(ENV_NAME)\n", + "env.reset()\n", + "\n", + "n_cols = 5\n", + "n_rows = 2\n", + "fig = plt.figure(figsize=(16, 9))\n", + "\n", + "for row in range(n_rows):\n", + " for col in range(n_cols):\n", + " ax = fig.add_subplot(n_rows, n_cols, row * n_cols + col + 1)\n", + " ax.imshow(env.render('rgb_array'))\n", + " env.step(env.action_space.sample())\n", + "plt.show()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -75,7 +131,9 @@ "We can thus save a lot of time by preprocessing game image, including\n", "* Resizing to a smaller shape, 64 x 64\n", "* Converting to grayscale\n", - "* Cropping irrelevant image parts (top & bottom)" + "* Cropping irrelevant image parts (top & bottom)\n", + "\n", + "Tip: You can implement your own grayscale converter and assign a huge weight to the red channel. This dirty trick is not necessary but it will speed up learning." ] }, { @@ -90,15 +148,15 @@ "import cv2\n", "\n", "\n", - "class PreprocessAtari(ObservationWrapper):\n", + "class PreprocessAtariObs(ObservationWrapper):\n", " def __init__(self, env):\n", " \"\"\"A gym wrapper that crops, scales image into the desired shapes and optionally grayscales it.\"\"\"\n", - " ObservationWrapper.__init__(self, env)\n", + " super().__init__(env)\n", "\n", " self.img_size = (64, 64, 1)\n", " self.observation_space = Box(0.0, 1.0, self.img_size)\n", "\n", - " def _to_gray_scale(self, rgb_image, channel_weights=[0.8, 0.1, 0.1]):\\n\",\n", + " def _to_gray_scale(self, rgb, channel_weights=[0.8, 0.1, 0.1]):\n", " \n", "\n", " def observation(self, img):\n", @@ -124,8 +182,8 @@ "source": [ "import gym\n", "# spawn game instance for tests\n", - "env = gym.make(\"BreakoutNoFrameskip-v4\") # create raw env\n", - "env = PreprocessAtari(env)\n", + "env = gym.make(ENV_NAME) # create raw env\n", + "env = PreprocessAtariObs(env)\n", "\n", "observation_shape = 
env.observation_space.shape\n", "n_actions = env.action_space.n\n", @@ -135,17 +193,78 @@ "\n", "# test observation\n", "assert obs.ndim == 3, \"observation must be [height, width, channels] even if there's just one channel\"\n", - "assert obs.shape == observation_shape\n", - "assert obs.dtype == 'float32'\n", + "assert obs.shape == observation_shape, (obs.shape, observation_shape)\n", + "assert obs.dtype == np.float32\n", "assert len(np.unique(obs)) > 2, \"your image must not be binary\"\n", "assert 0 <= np.min(obs) and np.max(obs) <= 1, \"convert image pixels to (0,1) range\"\n", "assert np.max(obs) >= 0.5, \"It would be easier to see a brighter observation\"\n", "assert np.mean(obs) >= 0.1, \"It would be easier to see a brighter observation\"\n", "\n", + "assert np.max(obs) >= 0.5, \"It would be easier to see a brighter observation\"\n", + "assert np.mean(obs) >= 0.1, \"It would be easier to see a brighter observation\"\n", + "\n", "print(\"Formal tests seem fine. Here's an example of what you'll get.\")\n", "\n", - "plt.title(\"what your network gonna see\")\n", - "plt.imshow(obs, interpolation='none', cmap='gray')" + "\n", + "n_cols = 5\n", + "n_rows = 2\n", + "fig = plt.figure(figsize=(16, 9))\n", + "obs = env.reset()\n", + "for row in range(n_rows):\n", + " for col in range(n_cols):\n", + " ax = fig.add_subplot(n_rows, n_cols, row * n_cols + col + 1)\n", + " ax.imshow(obs[:, :, 0], interpolation='none', cmap='gray')\n", + " obs, _, _, _ = env.step(env.action_space.sample())\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Wrapping." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**About the game:** You have 5 lives and get points for breaking the wall. Higher bricks cost more than the lower ones. There are 4 actions: start game (should be called at the beginning and after each life is lost), move left, move right and do nothing. 
There are some common wrappers used for Atari environments." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import atari_wrappers\n", + "\n", + "def PrimaryAtariWrap(env, clip_rewards=True):\n", + " assert 'NoFrameskip' in env.spec.id\n", + "\n", + " # This wrapper holds the same action for `skip` frames and outputs\n", + " # the maximal pixel value of 2 last frames (to handle blinking\n", + " # in some envs)\n", + " env = atari_wrappers.MaxAndSkipEnv(env, skip=4)\n", + "\n", + " # This wrapper sends done=True when each life is lost\n", + " # (not all the 5 lives that are given by the game rules).\n", + " # It should make it easier for the agent to understand that losing is bad.\n", + " env = atari_wrappers.EpisodicLifeEnv(env)\n", + "\n", + " # This wrapper launches the ball when an episode starts.\n", + " # Without it the agent has to learn this action, too.\n", + " # Actually it can but learning would take longer.\n", + " env = atari_wrappers.FireResetEnv(env)\n", + "\n", + " # This wrapper transforms rewards to {-1, 0, 1} according to their sign\n", + " if clip_rewards:\n", + " env = atari_wrappers.ClipRewardEnv(env)\n", + "\n", + " # This wrapper is yours :)\n", + " env = PreprocessAtariObs(env)\n", + " return env" ] }, { @@ -168,9 +287,11 @@ "from framebuffer import FrameBuffer\n", "\n", "\n", - "def make_env():\n", - " env = gym.make(\"BreakoutNoFrameskip-v4\")\n", - " env = PreprocessAtari(env)\n", + "def make_env(clip_rewards=True, seed=None):\n", + " env = gym.make(ENV_NAME) # create raw env\n", + " if seed is not None:\n", + " env.seed(seed)\n", + " env = PrimaryAtariWrap(env, clip_rewards)\n", " env = FrameBuffer(env, n_frames=4, dim_order='tensorflow')\n", " return env\n", "\n", @@ -178,7 +299,7 @@ "env = make_env()\n", "env.reset()\n", "n_actions = env.action_space.n\n", - "state_dim = env.observation_space.shape" + "state_shape = env.observation_space.shape" ] }, { @@ -191,11 +312,22 @@ " 
obs, _, _, _ = env.step(env.action_space.sample())\n", "\n", "\n", + "plt.figure()\n", "plt.title(\"Game image\")\n", "plt.imshow(env.render(\"rgb_array\"))\n", "plt.show()\n", + "\n", + "plt.figure(figsize=[12,10])\n", "plt.title(\"Agent observation (4 frames left to right)\")\n", - "plt.imshow(obs.transpose([0, 2, 1]).reshape([state_dim[0], -1]))" + "plt.imshow(obs.transpose([0, 2, 1]).reshape([state_shape[0], -1]))\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DQN as it is (4 pts)" ] }, { @@ -216,6 +348,19 @@ "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/yet_another_week/_resource/dqn_arch.png)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Dueling network: (+2 pts)**\n", + "$$Q_{\\theta}(s, a) = V_{\\eta}(f_{\\xi}(s)) + A_{\\psi}(f_{\\xi}(s), a) - \\frac{\\sum_{a'}A_{\\psi}(f_{\\xi}(s), a')}{N_{actions}},$$\n", + "where $\\xi$, $\\eta$, and $\\psi$ are, respectively, the parameters of the\n", + "shared encoder $f_\\xi$, of the value stream $V_\\eta$, and of the\n", + "advantage stream $A_\\psi$; and $\\theta = \\{\\xi, \\eta, \\psi\\}$ is their concatenation.\n", + "\n", + "For the architecture in the image, the $V$ and $A$ heads can follow the dense layer instead of $Q$. Please don't worry that the model becomes a little bigger." 
+ ] + }, { "cell_type": "code", "execution_count": null, @@ -233,7 +378,8 @@ "metadata": {}, "outputs": [], "source": [ - "from keras.layers import Conv2D, Dense, Flatten\n", + "from tensorflow.keras.layers import Conv2D, Dense, Flatten\n", + "from tensorflow.keras import models\n", "\n", "\n", "class DQNAgent:\n", @@ -283,7 +429,7 @@ "metadata": {}, "outputs": [], "source": [ - "agent = DQNAgent(\"dqn_agent\", state_dim, n_actions, epsilon=0.5)\n", + "agent = DQNAgent(\"dqn_agent\", state_shape, n_actions, epsilon=1)\n", "sess.run(tf.global_variables_initializer())" ] }, @@ -370,7 +516,7 @@ "metadata": {}, "outputs": [], "source": [ - "def play_and_record(agent, env, exp_replay, n_steps=1):\n", + "def play_and_record(initial_state, agent, env, exp_replay, n_steps=1):\n", " \"\"\"\n", " Play the game for exactly n steps, record every (s,a,r,s', done) to replay buffer. \n", " Whenever game ends, add record with done=True and reset the game.\n", @@ -378,13 +524,16 @@ "\n", " PLEASE DO NOT RESET ENV UNLESS IT IS \"DONE\"\n", "\n", - " :returns: return sum of rewards over time\n", + " :returns: return sum of rewards over time and the state in which the env stays\n", " \"\"\"\n", " # initial state\n", - " s = env.framebuffer\n", + " s = initial_state\n", + " sum_rewards = 0\n", "\n", " # Play the game for n_steps as per instructions above\n", - " " + " \n", + "\n", + " return sum_rewards, s" ] }, { @@ -393,33 +542,34 @@ "metadata": {}, "outputs": [], "source": [ - "# testing your code. 
This may take a minute...\n", - "exp_replay = ReplayBuffer(20000)\n", + "# testing your code.\n", + "exp_replay = ReplayBuffer(2000)\n", "\n", - "play_and_record(agent, env, exp_replay, n_steps=10000)\n", + "state = env.reset()\n", + "play_and_record(state, agent, env, exp_replay, n_steps=1000)\n", "\n", "# if you're using your own experience replay buffer, some of those tests may need correction.\n", "# just make sure you know what your code does\n", - "assert len(exp_replay) == 10000, (\n", - " \"play_and_record should have added exactly 10000 steps, \" +\n", - " \"but instead added %i\") % len(exp_replay)\n", + "assert len(exp_replay) == 1000, \"play_and_record should have added exactly 1000 steps, \"\\\n", + " \"but instead added %i\" % len(exp_replay)\n", "is_dones = list(zip(*exp_replay._storage))[-1]\n", "\n", - "assert 0 < np.mean(is_dones) < 0.1, (\n", - " \"Please make sure you restart the game whenever it is 'done' \" +\n", - " \"and record the is_done correctly into the buffer.\" +\n", - " \"Got %f is_done rate over %i steps. [If you think it's your tough luck, just re-run the test]\"\n", - ") % (np.mean(is_dones), len(exp_replay))\n", + "assert 0 < np.mean(is_dones) < 0.1, \"Please make sure you restart the game whenever it is 'done' and record the is_done correctly into the buffer.\"\\\n", + " \"Got %f is_done rate over %i steps. 
[If you think it's your tough luck, just re-run the test]\" % (\n", + " np.mean(is_dones), len(exp_replay))\n", "\n", "for _ in range(100):\n", " obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(\n", " 10)\n", - " assert obs_batch.shape == next_obs_batch.shape == (10,) + state_dim\n", - " assert act_batch.shape == (10,), \"actions batch should have shape (10,) but is instead %s\" % str(act_batch.shape)\n", - " assert reward_batch.shape == (10,), \"rewards batch should have shape (10,) but is instead %s\" % str(reward_batch.shape)\n", - " assert is_done_batch.shape == (10,), \"is_done batch should have shape (10,) but is instead %s\" % str(is_done_batch.shape)\n", + " assert obs_batch.shape == next_obs_batch.shape == (10,) + state_shape\n", + " assert act_batch.shape == (10,), \\\n", + " \"actions batch should have shape (10,) but is instead %s\" % str(act_batch.shape)\n", + " assert reward_batch.shape == (10,), \\\n", + " \"rewards batch should have shape (10,) but is instead %s\" % str(reward_batch.shape)\n", + " assert is_done_batch.shape == (10,), \\\n", + " \"is_done batch should have shape (10,) but is instead %s\" % str(is_done_batch.shape)\n", " assert [int(i) in (0, 1) for i in is_dones], \"is_done should be strictly True or False\"\n", - " assert [0 <= a <= n_actions for a in act_batch], \"actions should be within [0, n_actions]\"\n", + " assert [0 <= a < n_actions for a in act_batch], \"actions should be within [0, n_actions)\"\n", "\n", "print(\"Well done!\")" ] @@ -446,7 +596,7 @@ "metadata": {}, "outputs": [], "source": [ - "target_network = DQNAgent(\"target_network\", state_dim, n_actions)" + "target_network = DQNAgent(\"target_network\", state_shape, n_actions)" ] }, { @@ -493,10 +643,10 @@ "outputs": [], "source": [ "# placeholders that will be fed with exp_replay.sample(batch_size)\n", - "obs_ph = tf.placeholder(tf.float32, shape=(None,) + state_dim)\n", + "obs_ph = tf.placeholder(tf.float32, shape=(None,) + 
state_shape)\n", "actions_ph = tf.placeholder(tf.int32, shape=[None])\n", "rewards_ph = tf.placeholder(tf.float32, shape=[None])\n", - "next_obs_ph = tf.placeholder(tf.float32, shape=(None,) + state_dim)\n", + "next_obs_ph = tf.placeholder(tf.float32, shape=(None,) + state_shape)\n", "is_done_ph = tf.placeholder(tf.float32, shape=[None])\n", "\n", "is_not_done = 1 - is_done_ph\n", @@ -535,7 +685,17 @@ "Where\n", "* $Q_{target}(s',a')$ denotes q-value of next state and next action predicted by __target_network__\n", "* $s, a, r, s'$ are current state, action, reward and next state respectively\n", - "* $\\gamma$ is a discount factor defined two cells above." + "* $\\gamma$ is a discount factor defined two cells above.\n", + "\n", + "\n", + "__Note 1:__ there's an example input below. Feel free to experiment with it before you write the function.\n", + "\n", + "__Note 2:__ compute_td_loss is a source of 99% of bugs in this homework. If reward doesn't improve, it often helps to go through it line by line [with a rubber duck](https://rubberduckdebugging.com/).\n", + "\n", + "**Double DQN (+2 pts)**\n", + "\n", + "$$ Q_{reference}(s,a) = r(s, a) + \\gamma \\cdot\n", + "Q_{target}(s',argmax_{a'}Q_\\theta(s', a')) $$" ] }, { @@ -586,7 +746,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Main loop\n", + "## Main loop (3 pts)\n", + "\n", + "**If deadline is tonight and it has not converged:** It is ok. Send the notebook today and when it converges send it again.\n", + "If the code is exactly the same points will not be discounted.\n", "\n", "It's time to put everything together and see if it learns anything." 
] @@ -601,13 +764,27 @@ "import pandas as pd\n", "from IPython.display import clear_output\n", "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "seed = \n", + "random.seed(seed)\n", + "np.random.seed(seed)\n", + "tf.set_random_seed(seed)\n", "\n", - "def moving_average(x, span=100, **kw):\n", - " return pd.DataFrame({'x': np.asarray(x)}).x.ewm(span=span, **kw).mean().values\n", + "env = make_env(seed)\n", + "state_shape = env.observation_space.shape\n", + "n_actions = env.action_space.n\n", + "state = env.reset()\n", "\n", - "mean_rw_history = []\n", - "td_loss_history = []" + "sess.run(tf.global_variables_initializer())\n", + "sess.run(copy_step)" ] }, { @@ -616,8 +793,10 @@ "metadata": {}, "outputs": [], "source": [ - "exp_replay = ReplayBuffer(10**5)\n", - "play_and_record(agent, env, exp_replay, n_steps=10000)\n", + "REPLAY_BUFFER_SIZE = 10**5\n", + "\n", + "exp_replay = ReplayBuffer(REPLAY_BUFFER_SIZE)\n", + "_, state = play_and_record(state, agent, env, exp_replay, n_steps=10000)\n", "\n", "\n", "def sample_batch(exp_replay, batch_size):\n", @@ -637,41 +816,19 @@ "metadata": {}, "outputs": [], "source": [ - "for i in trange(10**5):\n", - " # play\n", - " play_and_record(agent, env, exp_replay, 10)\n", - "\n", - " # train\n", - " _, loss_t = sess.run([train_step, td_loss], sample_batch(exp_replay, batch_size=64))\n", - " td_loss_history.append(loss_t)\n", - "\n", - " # adjust agent parameters\n", - " if i % 500 == 0:\n", - " # You could think that loading weights onto a target network is simply\n", - " # load_weigths_into_target_network(agent, target_network)\n", - " # but actually calling this function repeatedly creates a TF copy operator\n", - " # again and again, which bloats memory consumption with each training step.\n", - " # Instead, you should create 'copy_step' once.\n", - " sess.run(copy_step)\n", - " 
agent.epsilon = max(agent.epsilon * 0.99, 0.01)\n", - " mean_rw_history.append(evaluate(make_env(), agent, n_games=3))\n", + "timesteps_per_epoch = 10\n", + "batch_size = 64\n", + "total_steps = 3 * 10**5\n", + "decay_steps = 10**5\n", "\n", - " if i % 100 == 0:\n", - " clear_output(True)\n", - " print(\"buffer size = %i, epsilon = %.5f\" % (len(exp_replay), agent.epsilon))\n", + "init_epsilon = 1\n", + "final_epsilon = 0.1\n", "\n", - " plt.subplot(1, 2, 1)\n", - " plt.title(\"mean reward per game\")\n", - " plt.plot(mean_rw_history)\n", - " plt.grid()\n", + "loss_freq = 5\n", + "refresh_target_network_freq = 500\n", + "eval_freq = 500\n", "\n", - " assert not np.isnan(loss_t)\n", - " plt.figure(figsize=[12, 4])\n", - " plt.subplot(1, 2, 2)\n", - " plt.title(\"TD loss history (moving average)\")\n", - " plt.plot(moving_average(np.array(td_loss_history), span=100, min_periods=100))\n", - " plt.grid()\n", - " plt.show()" + "n_lives = 5" ] }, { @@ -680,30 +837,123 @@ "metadata": {}, "outputs": [], "source": [ - "assert np.mean(mean_rw_history[-10:]) > 10.\n", - "print(\"That's good enough for tutorial.\")" + "mean_rw_history = []\n", + "td_loss_history = []\n", + "initial_state_v_history = []\n", + "step = 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "state = env.reset()\n", + "\n", + "with trange(step, total_steps) as progress_bar:\n", + " for step in progress_bar:\n", + " agent.epsilon = utils.linear_decay(init_epsilon, final_epsilon, step, decay_steps)\n", + " \n", + " # play\n", + " _, state = play_and_record(state, agent, env, exp_replay, timesteps_per_epoch)\n", + "\n", + " # train\n", + " _, loss_t = sess.run([train_step, td_loss], sample_batch(exp_replay, batch_size=batch_size))\n", + " \n", + " if step % loss_freq == 0:\n", + " td_loss_history.append(loss_t)\n", + " \n", + " if step % refresh_target_network_freq == 0:\n", + " # Load agent weights into target_network\n", + " 
sess.run(copy_step)\n", + "\n", + " if step % eval_freq == 0:\n", + " # eval the agent\n", + " mean_rw_history.append(evaluate(\n", + " make_env(clip_rewards=True, seed=step), agent, n_games=3 * n_lives, greedy=True)\n", + " )\n", + " initial_state_q_values = agent.get_qvalues(\n", + " [make_env(seed=step).reset()]\n", + " )\n", + " initial_state_v_history.append(np.max(initial_state_q_values))\n", + "\n", + " clear_output(True)\n", + " print(\"buffer size = %i, epsilon = %.5f\" %\n", + " (len(exp_replay), agent.epsilon))\n", + "\n", + " plt.figure(figsize=[16, 9])\n", + "\n", + " plt.subplot(2, 2, 1)\n", + " plt.title(\"Mean reward per life\")\n", + " plt.plot(mean_rw_history)\n", + " plt.grid()\n", + "\n", + " assert not np.isnan(td_loss_history[-1])\n", + " plt.subplot(2, 2, 2)\n", + " plt.title(\"TD loss history (smoothened)\")\n", + " plt.plot(utils.smoothen(td_loss_history))\n", + " plt.grid()\n", + "\n", + " plt.subplot(2, 2, 3)\n", + " plt.title(\"Initial state V\")\n", + " plt.plot(initial_state_v_history)\n", + " plt.grid()\n", + "\n", + " plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "__ How to interpret plots: __\n", + "Agent is evaluated for 1 life, not for a whole episode of 5 lives. Rewards in evaluation are also truncated. Cuz this is what environment the agent is learning in and in this way mean rewards per life can be compared with initial state value\n", "\n", + "**The goal is to get 15 points in the real env**. So 3 or better 4 points in the preprocessed one will probably be enough. You can interrupt learning then." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Final scoring is done on a whole episode with all 5 lives." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "final_score = evaluate(\n", + " make_env(clip_rewards=False, seed=9),\n", + " agent, n_games=30, greedy=True, t_max=10 * 1000\n", + ") * n_lives\n", + "print('final score:', final_score)\n", + "assert final_score >= 15, 'not as cool as DQN can'\n", + "print('Cool!')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How to interpret plots:\n", "\n", "This aint no supervised learning so don't expect anything to improve monotonously. \n", - "* __ TD loss __ is the MSE between agent's current Q-values and target Q-values. It may slowly increase or decrease, it's ok. The \"not ok\" behavior includes going NaN or stayng at exactly zero before agent has perfect performance.\n", - "* __ mean reward__ is the expected sum of r(s,a) agent gets over the full game session. It will oscillate, but on average it should get higher over time (after a few thousand iterations...). \n", - " * In basic q-learning implementation it takes 5-10k steps to \"warm up\" agent before it starts to get better.\n", - "* __ buffer size__ - this one is simple. It should go up and cap at max size.\n", - "* __ epsilon__ - agent's willingness to explore. If you see that agent's already at 0.01 epsilon before it's average reward is above 0 - __ it means you need to increase epsilon__. Set it back to some 0.2 - 0.5 and decrease the pace at which it goes down.\n", - "* Also please ignore first 100-200 steps of each plot - they're just oscillations because of the way moving average works.\n", + "* **TD loss** is the MSE between agent's current Q-values and target Q-values. It may slowly increase or decrease, it's ok. The \"not ok\" behavior includes going NaN or stayng at exactly zero before agent has perfect performance.\n", + "* **mean reward** is the expected sum of r(s,a) agent gets over the full game session. 
It will oscillate, but on average it should get higher over time (after a few thousand iterations...). \n", + " * In basic q-learning implementation it takes about 40k steps to \"warm up\" agent before it starts to get better.\n", + "* **Initial state V** is the expected discounted reward for episode in the opinion of the agent. It should behave more smoothly than **mean reward**. It should get higher over time but sometimes can experience drawdowns because of the agent's overestimates.\n", + "* **buffer size** - this one is simple. It should go up and cap at max size.\n", + "* **epsilon** - agent's willingness to explore. If you see that agent's already at 0.01 epsilon before its average reward is above 0 - it means you need to increase epsilon. Set it back to some 0.2 - 0.5 and decrease the pace at which it goes down.\n", + "* Smoothing of plots is done with a gaussian kernel\n", "\n", "At first your agent will lose quickly. Then it will learn to suck less and at least hit the ball a few times before it loses. Finally it will learn to actually score points.\n", "\n", - "__Training will take time.__ A lot of it actually. An optimistic estimate is to say it's gonna start winning (average reward > 10) after 10k steps. \n", + "**Training will take time.** A lot of it actually. Probably you will not see any improvement during the first **150k** time steps (note that by default in this notebook agent is evaluated every 5000 time steps).\n", "\n", - "But hey, look on the bright side of things:\n", + "But hey, long training time isn't _that_ bad:\n", "\n", "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/yet_another_week/_resource/training.png)" ] @@ -712,7 +962,30 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Video" + "## About hyperparameters:\n", + "\n", + "The task has something in common with supervised learning: loss is optimized through the buffer (instead of Train dataset). 
But the distribution of states and actions in the buffer **is not stationary** and depends on the policy that generated it. It can even happen that the mean TD error across the buffer is very low but the performance is extremely poor (imagine the agent collecting data to the buffer always manages to avoid the ball).\n", + "\n", + "* Total timesteps and training time: It seems to be so huge, but actually it is normal for RL.\n", + "\n", + "* $\\epsilon$ decay schedule was taken from the original paper and is typical for epsilon-greedy policies. At the beginning of the training the agent's greedy policy is poor so many random actions should be taken.\n", + "\n", + "* Optimizer: In the original paper RMSProp was used (they did not have Adam in 2013) and it can work no worse than Adam. For us Adam was default and it worked.\n", + "\n", + "* lr: $10^{-3}$ would probably be too large\n", + "\n", + "* batch size: This one can be very important: if it is too small the agent can fail to learn. Huge batch takes more time to process.\n", + "\n", + "* target network update frequency: has something in common with learning rate. Too frequent updates can lead to divergence. Too rare can lead to slow learning. For millions of total timesteps thousands of inner steps seem ok. One iteration of target network updating is an iteration of the (this time approximate) $\\gamma$-compression that stands behind Q-learning. The more inner steps it makes the more accurate is the compression." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Let's have a closer look at this.\n", + "\n", + "If average episode score is below 200 using all 5 lives, then probably DQN has not converged fully. But anyway let's make a more complete record of an episode." 
] }, { @@ -721,8 +994,54 @@ "metadata": {}, "outputs": [], "source": [ - "# Don't forget to reset epsilon back to previous value if you want to go on training\n", - "agent.epsilon = 0" + "eval_env = make_env(clip_rewards=False)\n", + "record = utils.play_and_log_episode(eval_env, agent)\n", + "print('total reward for life:', np.sum(record['rewards']))\n", + "for key in record:\n", + " print(key)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = plt.figure(figsize=(5, 5))\n", + "ax = fig.add_subplot(1, 1, 1)\n", + "\n", + "ax.scatter(record['v_mc'], record['v_agent'])\n", + "ax.plot(sorted(record['v_mc']), sorted(record['v_mc']),\n", + " 'black', linestyle='--', label='x=y')\n", + "\n", + "ax.grid()\n", + "ax.legend()\n", + "ax.set_title('State Value Estimates')\n", + "ax.set_xlabel('Monte-Carlo')\n", + "ax.set_ylabel('Agent')\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$\\hat V_{Monte-Carlo}(s_t) = \\sum_{\\tau=t}^{episode~end} \\gamma^{\\tau-t}r_\\tau$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Is there a big bias? It's ok, anyway it works." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Video" ] }, { @@ -736,7 +1055,7 @@ "import gym.wrappers\n", "\n", "with gym.wrappers.Monitor(make_env(), directory=\"videos\", force=True) as env_monitor:\n", - " sessions = [evaluate(env_monitor, agent, n_games=1) for _ in range(100)]" + " sessions = [evaluate(env_monitor, agent, n_games=n_lives, greedy=True) for _ in range(10)]" ] }, { @@ -749,88 +1068,117 @@ "# work for you, you can download the videos and view them locally.\n", "\n", "from pathlib import Path\n", + "from base64 import b64encode\n", "from IPython.display import HTML\n", "\n", - "video_names = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", + "video_paths = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", + "video_path = video_paths[-1] # You can also try other indices\n", + "\n", + "if 'google.colab' in sys.modules:\n", + " # https://stackoverflow.com/a/57378660/1214547\n", + " with video_path.open('rb') as fp:\n", + " mp4 = fp.read()\n", + " data_url = 'data:video/mp4;base64,' + b64encode(mp4).decode()\n", + "else:\n", + " data_url = str(video_path)\n", "\n", "HTML(\"\"\"\n", "\n", - "\"\"\".format(video_names[-1])) # You can also try other indices" + "\"\"\".format(data_url))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Assignment part I (5 pts)\n", - "\n", - "We'll start by implementing target network to stabilize training.\n", - "\n", - "To do that you should use TensorFlow functionality. \n", - "\n", - "We recommend thoroughly debugging your code on simple tests before applying it in Atari dqn." 
+ "## Bonus I (2 pts)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Bonus I (2+ pts)\n", + "**1.** Plot several (say 3) states with high and low spreads of Q estimate by actions i.e.\n", + "$$\\max_a \\hat Q(s,a) - \\min_a \\hat Q(s,a)$$\n", + "Please take those states from different episodes to make sure that the states are really different.\n", "\n", - "Implement and train double q-learning.\n", + "What should high and low spread mean at least in the world of perfect Q-functions?\n", "\n", - "This task contains of\n", - "* Implementing __double q-learning__ or __dueling q-learning__ or both (see tips below)\n", - "* Training a network till convergence\n", - " * Full points will be awwarded if your network gets average score of >=10 (see \"evaluating results\")\n", - " * Higher score = more points as usual\n", - " * If you're running out of time, it's okay to submit a solution that hasn't converged yet and updating it when it converges. _Lateness penalty will not increase for second submission_, so submitting first one in time gets you no penalty.\n", + "Comment on the states you like most.\n", "\n", + "**2.** Plot several (say 3) states with high td-error and several states with high values of\n", + "$$| \\hat V_{Monte-Carlo}(s) - \\hat V_{agent}(s)|,$$ \n", + "$$\\hat V_{agent}(s)=\\max_a \\hat Q(s,a).$$ Please take those states from different episodes to make sure that the states are really different. Which part (i.e. beginning, middle, end) of an episode did these states come from?\n", "\n", - "#### Tips:\n", - "* Implementing __double q-learning__ shouldn't be a problem if you've already have target networks in place.\n", - " * You will probably need `tf.argmax` to select best actions\n", - " * Here's an original [article](https://arxiv.org/abs/1509.06461)\n", + "Comment on the states you like most." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from utils import play_and_log_episode, img_by_obs\n", "\n", - "* __Dueling__ architecture is also quite straightforward if you have standard DQN.\n", - " * You will need to change network architecture, namely the q-values layer\n", - " * It must now contain two heads: V(s) and A(s,a), both dense layers\n", - " * You should then add them up via elemwise sum layer.\n", - " * Here's an [article](https://arxiv.org/pdf/1511.06581.pdf)" + "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Bonus II (5+ pts): Prioritized experience replay\n", + "## Bonus II (1-5 pts). Get High Score!\n", + "\n", + "1 point to you for each 50 points of your agent. Truncated by 5 points. Starting with 50 points, **not** 50 + threshold.\n", "\n", - "In this section, you're invited to implement prioritized experience replay\n", + "One way is to train for several days and use heavier hardware (why not actually).\n", + "\n", + "Another way is to apply modifications (see **Bonus III**)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bonus III (2+ pts). Apply modifications to DQN.\n", "\n", - "* You will probably need to provide a custom data structure\n", - "* Once pool.update is called, collect the pool.experience_replay.observations, actions, rewards and is_alive and store them in your data structure\n", - "* You can now sample such transitions in proportion to the error (see [article](https://arxiv.org/abs/1511.05952)) for training.\n", + "For inspiration see [Rainbow](https://arxiv.org/abs/1710.02298) - a version of q-learning that combines lots of them.\n", "\n", - "It's probably more convenient to explicitly declare inputs for \"sample observations\", \"sample actions\" and so on to plug them into q-learning.\n", + "Points for Bonus II and Bonus III fully stack. 
So if modified agent gets score 250+ you get 5 pts for Bonus II + points for modifications. If the final score is 40 then you get the points for modifications.\n", "\n", - "Prioritized (and even normal) experience replay should greatly reduce amount of game sessions you need to play in order to achieve good performance. \n", "\n", - "While it's effect on runtime is limited for atari, more complicated envs (further in the course) will certainly benefit for it.\n", + "Some modifications:\n", + "* [Prioritized experience replay](https://arxiv.org/abs/1511.05952) (5 pts for your own implementation, 3 pts for using a ready one)\n", + "* [double q-learning](https://arxiv.org/abs/1509.06461) (2 pts)\n", + "* [dueling q-learning](https://arxiv.org/abs/1511.06581) (2 pts)\n", + "* multi-step heuristics (see [Rainbow](https://arxiv.org/abs/1710.02298)) (3 pts)\n", + "* [Noisy Nets](https://arxiv.org/abs/1706.10295) (3 pts)\n", + "* [distributional RL](https://arxiv.org/abs/1707.06887)(distributional and distributed stand for different things here) (5 pts)\n", + "* Other modifications (2+ pts depending on complexity)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bonus IV (4+ pts). Distributed RL.\n", "\n", - "Prioritized experience replay only supports off-policy algorithms, so pls enforce `n_steps=1` in your q-learning reference computation (default is 10)." + "Solve the task in a distributed way. It can strongly speed up learning. See [article](https://arxiv.org/pdf/1602.01783.pdf) or some guides." 
] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "**As usual bonus points for all the tasks fully stack.**" + ] } ], "metadata": { + "accelerator": "GPU", "language_info": { "name": "python", "pygments_lexer": "ipython3" diff --git a/week04_approx_rl/img/dqn_arch.png b/week04_approx_rl/img/dqn_arch.png new file mode 100644 index 000000000..47a82a18a Binary files /dev/null and b/week04_approx_rl/img/dqn_arch.png differ diff --git a/week04_approx_rl/img/dueling.png b/week04_approx_rl/img/dueling.png new file mode 100644 index 000000000..b72a9ad2d Binary files /dev/null and b/week04_approx_rl/img/dueling.png differ diff --git a/week04_approx_rl/img/dueling_basic.png b/week04_approx_rl/img/dueling_basic.png new file mode 100644 index 000000000..13b86ef38 Binary files /dev/null and b/week04_approx_rl/img/dueling_basic.png differ diff --git a/week04_approx_rl/img/dueling_nature.png b/week04_approx_rl/img/dueling_nature.png new file mode 100644 index 000000000..3ad0ec1e9 Binary files /dev/null and b/week04_approx_rl/img/dueling_nature.png differ diff --git a/week04_approx_rl/img/dueling_single.png b/week04_approx_rl/img/dueling_single.png new file mode 100644 index 000000000..297701529 Binary files /dev/null and b/week04_approx_rl/img/dueling_single.png differ diff --git a/week04_approx_rl/img/exp_replay.png b/week04_approx_rl/img/exp_replay.png new file mode 100644 index 000000000..cded84fb5 Binary files /dev/null and b/week04_approx_rl/img/exp_replay.png differ diff --git a/week04_approx_rl/requirements.txt b/week04_approx_rl/requirements.txt new file mode 100644 index 000000000..69414565d --- /dev/null +++ b/week04_approx_rl/requirements.txt @@ -0,0 +1,13 @@ +gymnasium[atari,accept-rom-license,classic-control] +ale_py +ipython +ipywidgets +notebook +matplotlib +moviepy +numpy +opencv-python +scipy +tensorboard +torch +torchvision diff --git 
a/week04_approx_rl/seminar_lasagne.ipynb b/week04_approx_rl/seminar_lasagne.ipynb deleted file mode 100644 index d80665ed7..000000000 --- a/week04_approx_rl/seminar_lasagne.ipynb +++ /dev/null @@ -1,350 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Approximate q-learning\n", - "\n", - "In this notebook you will teach a lasagne neural network to do Q-learning." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__Frameworks__ - we'll accept this homework in any deep learning framework. For example, it translates to TensorFlow almost line-to-line. However, we recommend you to stick to theano/lasagne unless you're certain about your skills in the framework of your choice." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%env THEANO_FLAGS = 'floatX=float32'\n", - "import os\n", - "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", - " !bash ../xvfb start\n", - " os.environ['DISPLAY'] = ':1'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import gym\n", - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "env = gym.make(\"CartPole-v0\").env\n", - "env.reset()\n", - "n_actions = env.action_space.n\n", - "state_dim = env.observation_space.shape\n", - "\n", - "plt.imshow(env.render(\"rgb_array\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Approximate (deep) Q-learning: building the network\n", - "\n", - "In this section we will build and train naive Q-learning with theano/lasagne" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First step is initializing input variables" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import theano\n", - "import theano.tensor as T\n", - "\n", - "# create input variables. We'll support multiple states at once\n", - "\n", - "\n", - "current_states = T.matrix(\"states[batch,units]\")\n", - "actions = T.ivector(\"action_ids[batch]\")\n", - "rewards = T.vector(\"rewards[batch]\")\n", - "next_states = T.matrix(\"next states[batch,units]\")\n", - "is_end = T.ivector(\"vector[batch] where 1 means that session just ended\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import lasagne\n", - "from lasagne.layers import *\n", - "\n", - "# input layer\n", - "l_states = InputLayer((None,)+state_dim)\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "# output layer\n", - "l_qvalues = DenseLayer( , num_units=n_actions, nonlinearity=None)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Predicting Q-values for `current_states`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get q-values for ALL actions in current_states\n", - "predicted_qvalues = get_output(l_qvalues, {l_states: current_states})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# compiling agent's \"GetQValues\" function\n", - "get_qvalues = " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# select q-values for chosen actions\n", - "predicted_qvalues_for_actions = predicted_qvalues[T.arange(\n", - " actions.shape[0]), actions]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Loss function and `update`\n", - "Here we write a function similar to `agent.update`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# predict q-values for next states\n", - "predicted_next_qvalues = get_output(l_qvalues, {l_states: < theano input with for states> })\n", - "\n", - "\n", - "# Computing target q-values under\n", - "gamma = 0.99\n", - "target_qvalues_for_actions = \n", - "\n", - "# zero-out q-values at the end\n", - "target_qvalues_for_actions = (1-is_end)*target_qvalues_for_actions\n", - "\n", - "# don't compute gradient over target q-values (consider constant)\n", - "target_qvalues_for_actions = theano.gradient.disconnected_grad(\n", - " target_qvalues_for_actions)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# mean squared error loss function\n", - "loss = " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# all network weights\n", - "all_weights = get_all_params(l_qvalues, trainable=True)\n", - "\n", - "# network updates. 
Note the small learning rate (for stability)\n", - "updates = lasagne.updates.sgd(loss, all_weights, learning_rate=1e-4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Training function that resembles agent.update(state,action,reward,next_state)\n", - "# with 1 more argument meaning is_end\n", - "train_step = theano.function([current_states, actions, rewards, next_states, is_end],\n", - " updates=updates)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Playing the game" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "epsilon = 0.25 # initial epsilon\n", - "\n", - "\n", - "def generate_session(t_max=1000):\n", - " \"\"\"play env with approximate q-learning agent and train it at the same time\"\"\"\n", - "\n", - " total_reward = 0\n", - " s = env.reset()\n", - "\n", - " for t in range(t_max):\n", - "\n", - " # get action q-values from the network\n", - " q_values = get_qvalues([s])[0]\n", - "\n", - " a = \n", - "\n", - " new_s, r, done, info = env.step(a)\n", - "\n", - " # train agent one step. 
Note that we use one-element arrays instead of scalars\n", - " # because that's what function accepts.\n", - " train_step([s], [a], [r], [new_s], [done])\n", - "\n", - " total_reward += r\n", - "\n", - " s = new_s\n", - " if done:\n", - " break\n", - "\n", - " return total_reward" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for i in range(100):\n", - "\n", - " rewards = [generate_session() for _ in range(100)] # generate new sessions\n", - "\n", - " epsilon *= 0.95\n", - "\n", - " print(\"mean reward:%.3f\\tepsilon:%.5f\" % (np.mean(rewards), epsilon))\n", - "\n", - " if np.mean(rewards) > 300:\n", - " print(\"You Win!\")\n", - " break\n", - "\n", - " assert epsilon != 0, \"Please explore environment\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Video" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "epsilon = 0 # Don't forget to reset epsilon back to initial value if you want to go on training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# record sessions\n", - "import gym.wrappers\n", - "\n", - "env = gym.wrappers.Monitor(gym.make(\"CartPole-v0\"),\n", - " directory=\"videos\", force=True)\n", - "sessions = [generate_session() for _ in range(100)]\n", - "env.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# show video\n", - "from IPython.display import HTML\n", - "import os\n", - "\n", - "video_names = list(\n", - " filter(lambda s: s.endswith(\".mp4\"), os.listdir(\"./videos/\")))\n", - "\n", - "HTML(\"\"\"\n", - "\n", - "\"\"\".format(\"./videos/\" + video_names[-1])) # this may or may not be _last_ video. 
Try other indices" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/week04_approx_rl/seminar_pytorch.ipynb b/week04_approx_rl/seminar_pytorch.ipynb index 75c4fda6f..1cab93585 100644 --- a/week04_approx_rl/seminar_pytorch.ipynb +++ b/week04_approx_rl/seminar_pytorch.ipynb @@ -1,382 +1,454 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Approximate q-learning\n", - "\n", - "In this notebook you will teach a __PyTorch__ neural network to do Q-learning." - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "kr_aKWMGEmh-" + }, + "source": [ + "# Approximate q-learning\n", + "\n", + "In this notebook you will teach a __PyTorch__ neural network to do Q-learning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oaMu65ONEmh_" + }, + "outputs": [], + "source": [ + "import sys, os\n", + "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", + " !touch .setup_complete\n", + "\n", + "# This code creates a virtual display to draw game images on.\n", + "# It will have no effect if your machine has a monitor.\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " os.environ['DISPLAY'] = ':1'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "avILCRKkEpaX" + }, + "outputs": [], + "source": [ + "!pip install gymnasium[classic_control]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "K_SRk2ASEmh_" + }, + "outputs": [], + "source": [ + "import gymnasium as gym\n", + "import numpy 
as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "x2YvkgprEmh_" + }, + "outputs": [], + "source": [ + "env = gym.make(\"CartPole-v0\", render_mode=\"rgb_array\").env\n", + "env.reset()\n", + "n_actions = env.action_space.n\n", + "state_dim = env.observation_space.shape\n", + "\n", + "plt.imshow(env.render())\n", + "env.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sI8W19CwEmh_" + }, + "source": [ + "# Approximate Q-learning: building the network\n", + "\n", + "To train a neural network policy one must have a neural network policy. Let's build it.\n", + "\n", + "\n", + "Since we're working with pre-extracted features (cart positions, angles and velocities), we don't need a complicated network yet. In fact, let's build something like this for starters:\n", + "\n", + "![img](https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/yet_another_week/_resource/qlearning_scheme.png)\n", + "\n", + "For your first run, please only use linear layers (`nn.Linear`) and activations. Stuff like batch normalization or dropout may ruin everything if used haphazardly.\n", + "\n", + "Also please avoid using nonlinearities like sigmoid & tanh: since agent's observations are not normalized, sigmoids might be saturated at initialization. Instead, use non-saturating nonlinearities like ReLU.\n", + "\n", + "Ideally you should start small with maybe 1-2 hidden layers with < 200 neurons and then increase network size if the agent doesn't beat the target score."
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "YdWXv8WJEmiA" + }, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "y2-PcaIQEmiA" + }, + "outputs": [], + "source": [ + "network = nn.Sequential()\n", + "\n", + "network.add_module('layer1', )\n", + "\n", + "\n", + "\n", + "# hint: use state_dim[0] as input size" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "id": "8xuWPGriEmiA" + }, + "outputs": [], + "source": [ + "def get_action(state, epsilon=0):\n", + " \"\"\"\n", + " sample actions with epsilon-greedy policy\n", + " recap: with p = epsilon pick random action, else pick action with highest Q(s,a)\n", + " \"\"\"\n", + " state = torch.tensor(state[None], dtype=torch.float32)\n", + " q_values = network(state).detach().numpy()\n", + "\n", + " \n", + "\n", + " return int( )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wroEfSRNEmiA" + }, + "outputs": [], + "source": [ + "s, _ = env.reset()\n", + "assert tuple(network(torch.tensor([s]*3, dtype=torch.float32)).size()) == (\n", + " 3, n_actions), \"please make sure your model maps state s -> [Q(s,a0), ..., Q(s, a_last)]\"\n", + "assert isinstance(list(network.modules(\n", + "))[-1], nn.Linear), \"please make sure you predict q-values without nonlinearity (ignore if you know what you're doing)\"\n", + "assert isinstance(get_action(s), int), \"get_action(s) must return int, not %s. 
try int(action)\" % (type(get_action(s)))\n", + "\n", + "# test epsilon-greedy exploration\n", + "for eps in [0., 0.1, 0.5, 1.0]:\n", + " state_frequencies = np.bincount(\n", + " [get_action(s, epsilon=eps) for i in range(10000)], minlength=n_actions)\n", + " best_action = state_frequencies.argmax()\n", + " assert abs(state_frequencies[best_action] -\n", + " 10000 * (1 - eps + eps / n_actions)) < 200\n", + " for other_action in range(n_actions):\n", + " if other_action != best_action:\n", + " assert abs(state_frequencies[other_action] -\n", + " 10000 * (eps / n_actions)) < 200\n", + " print('e=%.1f tests passed' % eps)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "f88ovLBQEmiA" + }, + "source": [ + "### Q-learning via gradient descent\n", + "\n", + "We shall now train our agent's Q-function by minimizing the TD loss:\n", + "$$ L = { 1 \\over N} \\sum_i (Q_{\\theta}(s,a) - [r(s,a) + \\gamma \\cdot \\max_{a'} Q_{-}(s', a')]) ^2 $$\n", + "\n", + "\n", + "Where\n", + "* $s, a, r, s'$ are current state, action, reward and next state respectively\n", + "* $\\gamma$ is the discount factor (the `gamma` argument of `compute_td_loss` in the next cell).\n", + "\n", + "The tricky part is with $Q_{-}(s',a')$. From an engineering standpoint, it's the same as $Q_{\\theta}$ - the output of your neural network policy. However, when doing gradient descent, __we won't propagate gradients through it__ to make training more stable (see lectures).\n", + "\n", + "To do so, we shall use the `x.detach()` function, which basically says \"consider this thing constant when doing backprop\"." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "id": "bOIpO142EmiB" + }, + "outputs": [], + "source": [ + "def compute_td_loss(states, actions, rewards, next_states, is_done, gamma=0.99, check_shapes=False):\n", + " \"\"\" Compute td loss using torch operations only. Use the formula above. 
\"\"\"\n", + " states = torch.tensor(\n", + " states, dtype=torch.float32) # shape: [batch_size, state_size]\n", + " actions = torch.tensor(actions, dtype=torch.long) # shape: [batch_size]\n", + " rewards = torch.tensor(rewards, dtype=torch.float32) # shape: [batch_size]\n", + " # shape: [batch_size, state_size]\n", + " next_states = torch.tensor(next_states, dtype=torch.float32)\n", + " is_done = torch.tensor(is_done, dtype=torch.uint8) # shape: [batch_size]\n", + "\n", + " # get q-values for all actions in current states\n", + " predicted_qvalues = network(states) # shape: [batch_size, n_actions]\n", + "\n", + " # select q-values for chosen actions\n", + " predicted_qvalues_for_actions = predicted_qvalues[ # shape: [batch_size]\n", + " range(states.shape[0]), actions\n", + " ]\n", + "\n", + " # compute q-values for all actions in next states\n", + " predicted_next_qvalues = \n", + "\n", + " # compute V*(next_states) using predicted next q-values\n", + " next_state_values = \n", + " assert next_state_values.dtype == torch.float32\n", + "\n", + " # compute \"target q-values\" for loss - it's what's inside square parentheses in the above formula.\n", + " target_qvalues_for_actions = \n", + "\n", + " # at the last state we shall use simplified formula: Q(s,a) = r(s,a) since s' doesn't exist\n", + " target_qvalues_for_actions = torch.where(\n", + " is_done, rewards, target_qvalues_for_actions)\n", + "\n", + " # mean squared error loss to minimize\n", + " loss = torch.mean((predicted_qvalues_for_actions -\n", + " target_qvalues_for_actions.detach()) ** 2)\n", + "\n", + " if check_shapes:\n", + " assert predicted_next_qvalues.data.dim(\n", + " ) == 2, \"make sure you predicted q-values for all actions in next state\"\n", + " assert next_state_values.data.dim(\n", + " ) == 1, \"make sure you computed V(s') as maximum over just the actions axis and not all axes\"\n", + " assert target_qvalues_for_actions.data.dim(\n", + " ) == 1, \"there's something wrong with target 
q-values, they must be a vector\"\n", + "\n", + " return loss" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "id": "lKi6AK3DEmiB" + }, + "outputs": [], + "source": [ + "# sanity checks\n", + "s, _ = env.reset()\n", + "a = env.action_space.sample()\n", + "next_s, r, terminated, _, _ = env.step(a)\n", + "loss = compute_td_loss([s], [a], [r], [next_s], [terminated], check_shapes=True)\n", + "loss.backward()\n", + "\n", + "assert len(loss.size()) == 0, \"you must return scalar loss - mean over batch\"\n", + "assert np.any(next(network.parameters()).grad.detach().numpy() !=\n", + " 0), \"loss must be differentiable w.r.t. network weights\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LgL6G5lFEmiB" + }, + "source": [ + "### Playing the game" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "id": "zsHb_fjjEmiB" + }, + "outputs": [], + "source": [ + "opt = torch.optim.Adam(network.parameters(), lr=1e-4)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "id": "nJ_-xtsjEmiB" + }, + "outputs": [], + "source": [ + "def generate_session(env, t_max=1000, epsilon=0, train=False):\n", + " \"\"\"play env with approximate q-learning agent and train it at the same time\"\"\"\n", + " total_reward = 0\n", + " s, _ = env.reset()\n", + "\n", + " for t in range(t_max):\n", + " a = get_action(s, epsilon=epsilon)\n", + " next_s, r, terminated, truncated, _ = env.step(a)\n", + "\n", + " if train:\n", + " opt.zero_grad()\n", + " compute_td_loss([s], [a], [r], [next_s], [terminated]).backward()\n", + " opt.step()\n", + "\n", + " total_reward += r\n", + " s = next_s\n", + " if terminated or truncated:\n", + " break\n", + "\n", + " return total_reward" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "id": "40mKYuVIEmiB" + }, + "outputs": [], + "source": [ + "epsilon = 0.5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"id": "EXy8ij00EmiB" + }, + "outputs": [], + "source": [ + "for i in range(1000):\n", + " session_rewards = [generate_session(env, epsilon=epsilon, train=True) for _ in range(100)]\n", + " print(\"epoch #{}\\tmean reward = {:.3f}\\tepsilon = {:.3f}\".format(i, np.mean(session_rewards), epsilon))\n", + "\n", + " epsilon *= 0.99\n", + " assert epsilon >= 1e-4, \"Make sure epsilon is always nonzero during training\"\n", + "\n", + " if np.mean(session_rewards) > 300:\n", + " print(\"You Win!\")\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XJPoF9XtEmiB" + }, + "source": [ + "### How to interpret results\n", + "\n", + "\n", + "Welcome to the f.. world of deep f...n reinforcement learning. Don't expect agent's reward to smoothly go up. Hope for it to increase eventually. If it deems you worthy.\n", + "\n", + "Seriously though,\n", + "* __mean reward__ is the average reward per game. For a correct implementation it may stay low for some 10 epochs, then start growing while oscillating insanely and converge by ~50-100 steps depending on the network architecture.\n", + "* If it never reaches target score by the end of the for loop, try increasing the number of hidden neurons or look at the epsilon.\n", + "* __epsilon__ - agent's willingness to explore. If you see that agent's already at < 0.01 epsilon before the mean reward is at least 200, just reset it back to 0.1 - 0.5." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lhKiN-qOEmiB" + }, + "source": [ + "### Record videos\n", + "\n", + "As usual, we now use `gymnasium.wrappers.RecordVideo` to record a video of our agent playing the game. Unlike our previous attempts with state binarization, this time we expect our agent to act ~~(or fail)~~ more smoothly since there's no more binarization error at play.\n", + "\n", + "As you already did with tabular q-learning, we set epsilon=0 for final evaluation to prevent agent from exploring himself to death."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2yqPkj6HEmiB" + }, + "outputs": [], + "source": [ + "# Record sessions\n", + "\n", + "from gymnasium.wrappers import RecordVideo\n", + "\n", + "with gym.make(\"CartPole-v0\", render_mode=\"rgb_array\") as record_env, RecordVideo(\n", + " record_env, video_folder=\"videos\"\n", + ") as env_monitor:\n", + " sessions = [\n", + " generate_session(env_monitor, epsilon=0, train=False) for _ in range(100)\n", + " ]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "afqi2qomEmiC" + }, + "outputs": [], + "source": [ + "# Show video. This may not work in some setups. If it doesn't\n", + "# work for you, you can download the videos and view them locally.\n", + "\n", + "from pathlib import Path\n", + "from base64 import b64encode\n", + "from IPython.display import HTML\n", + "\n", + "video_paths = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", + "video_path = video_paths[-1] # You can also try other indices\n", + "\n", + "if 'google.colab' in sys.modules:\n", + " # https://stackoverflow.com/a/57378660/1214547\n", + " with video_path.open('rb') as fp:\n", + " mp4 = fp.read()\n", + " data_url = 'data:video/mp4;base64,' + b64encode(mp4).decode()\n", + "else:\n", + " data_url = str(video_path)\n", + "\n", + "HTML(\"\"\"\n", + "\n", + "\"\"\".format(data_url))" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys, os\n", - "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n", - " !touch .setup_complete\n", - "\n", - "# This code creates a virtual 
display to draw game images on.\n", - "# It will have no effect if your machine has a monitor.\n", - "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", - " !bash ../xvfb start\n", - " os.environ['DISPLAY'] = ':1'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import gym\n", - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "env = gym.make(\"CartPole-v0\").env\n", - "env.reset()\n", - "n_actions = env.action_space.n\n", - "state_dim = env.observation_space.shape\n", - "\n", - "plt.imshow(env.render(\"rgb_array\"))\n", - "env.close()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Approximate Q-learning: building the network\n", - "\n", - "To train a neural network policy one must have a neural network policy. Let's build it.\n", - "\n", - "\n", - "Since we're working with a pre-extracted features (cart positions, angles and velocities), we don't need a complicated network yet. In fact, let's build something like this for starters:\n", - "\n", - "![img](https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/yet_another_week/_resource/qlearning_scheme.png)\n", - "\n", - "For your first run, please only use linear layers (`nn.Linear`) and activations. Stuff like batch normalization or dropout may ruin everything if used haphazardly. \n", - "\n", - "Also please avoid using nonlinearities like sigmoid & tanh: since agent's observations are not normalized, sigmoids might be saturated at initialization. Instead, use non-saturating nonlinearities like ReLU.\n", - "\n", - "Ideally you should start small with maybe 1-2 hidden layers with < 200 neurons and then increase network size if agent doesn't beat the target score." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "network = nn.Sequential()\n", - "\n", - "network.add_module('layer1', )\n", - "\n", - "\n", - "\n", - "# hint: use state_dim[0] as input size" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_action(state, epsilon=0):\n", - " \"\"\"\n", - " sample actions with epsilon-greedy policy\n", - " recap: with p = epsilon pick random action, else pick action with highest Q(s,a)\n", - " \"\"\"\n", - " state = torch.tensor(state[None], dtype=torch.float32)\n", - " q_values = network(state).detach().numpy()\n", - "\n", - " \n", - "\n", - " return int( )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "s = env.reset()\n", - "assert tuple(network(torch.tensor([s]*3, dtype=torch.float32)).size()) == (\n", - " 3, n_actions), \"please make sure your model maps state s -> [Q(s,a0), ..., Q(s, a_last)]\"\n", - "assert isinstance(list(network.modules(\n", - "))[-1], nn.Linear), \"please make sure you predict q-values without nonlinearity (ignore if you know what you're doing)\"\n", - "assert isinstance(get_action(\n", - " s), int), \"get_action(s) must return int, not %s. 
try int(action)\" % (type(get_action(s)))\n", - "\n", - "# test epsilon-greedy exploration\n", - "for eps in [0., 0.1, 0.5, 1.0]:\n", - " state_frequencies = np.bincount(\n", - " [get_action(s, epsilon=eps) for i in range(10000)], minlength=n_actions)\n", - " best_action = state_frequencies.argmax()\n", - " assert abs(state_frequencies[best_action] -\n", - " 10000 * (1 - eps + eps / n_actions)) < 200\n", - " for other_action in range(n_actions):\n", - " if other_action != best_action:\n", - " assert abs(state_frequencies[other_action] -\n", - " 10000 * (eps / n_actions)) < 200\n", - " print('e=%.1f tests passed' % eps)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Q-learning via gradient descent\n", - "\n", - "We shall now train our agent's Q-function by minimizing the TD loss:\n", - "$$ L = { 1 \\over N} \\sum_i (Q_{\\theta}(s,a) - [r(s,a) + \\gamma \\cdot max_{a'} Q_{-}(s', a')]) ^2 $$\n", - "\n", - "\n", - "Where\n", - "* $s, a, r, s'$ are current state, action, reward and next state respectively\n", - "* $\\gamma$ is a discount factor defined two cells above.\n", - "\n", - "The tricky part is with $Q_{-}(s',a')$. From an engineering standpoint, it's the same as $Q_{\\theta}$ - the output of your neural network policy. However, when doing gradient descent, __we won't propagate gradients through it__ to make training more stable (see lectures).\n", - "\n", - "To do so, we shall use `x.detach()` function which basically says \"consider this thing constant when doingbackprop\"." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def compute_td_loss(states, actions, rewards, next_states, is_done, gamma=0.99, check_shapes=False):\n", - " \"\"\" Compute td loss using torch operations only. Use the formula above. 
\"\"\"\n", - " states = torch.tensor(\n", - " states, dtype=torch.float32) # shape: [batch_size, state_size]\n", - " actions = torch.tensor(actions, dtype=torch.long) # shape: [batch_size]\n", - " rewards = torch.tensor(rewards, dtype=torch.float32) # shape: [batch_size]\n", - " # shape: [batch_size, state_size]\n", - " next_states = torch.tensor(next_states, dtype=torch.float32)\n", - " is_done = torch.tensor(is_done, dtype=torch.uint8) # shape: [batch_size]\n", - "\n", - " # get q-values for all actions in current states\n", - " predicted_qvalues = network(states)\n", - "\n", - " # select q-values for chosen actions\n", - " predicted_qvalues_for_actions = predicted_qvalues[\n", - " range(states.shape[0]), actions\n", - " ]\n", - "\n", - " # compute q-values for all actions in next states\n", - " predicted_next_qvalues = \n", - "\n", - " # compute V*(next_states) using predicted next q-values\n", - " next_state_values = \n", - " assert next_state_values.dtype == torch.float32\n", - "\n", - " # compute \"target q-values\" for loss - it's what's inside square parentheses in the above formula.\n", - " target_qvalues_for_actions = \n", - "\n", - " # at the last state we shall use simplified formula: Q(s,a) = r(s,a) since s' doesn't exist\n", - " target_qvalues_for_actions = torch.where(\n", - " is_done, rewards, target_qvalues_for_actions)\n", - "\n", - " # mean squared error loss to minimize\n", - " loss = torch.mean((predicted_qvalues_for_actions -\n", - " target_qvalues_for_actions.detach()) ** 2)\n", - "\n", - " if check_shapes:\n", - " assert predicted_next_qvalues.data.dim(\n", - " ) == 2, \"make sure you predicted q-values for all actions in next state\"\n", - " assert next_state_values.data.dim(\n", - " ) == 1, \"make sure you computed V(s') as maximum over just the actions axis and not all axes\"\n", - " assert target_qvalues_for_actions.data.dim(\n", - " ) == 1, \"there's something wrong with target q-values, they must be a vector\"\n", - "\n", - " return 
loss" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# sanity checks\n", - "s = env.reset()\n", - "a = env.action_space.sample()\n", - "next_s, r, done, _ = env.step(a)\n", - "loss = compute_td_loss([s], [a], [r], [next_s], [done], check_shapes=True)\n", - "loss.backward()\n", - "\n", - "assert len(loss.size()) == 0, \"you must return scalar loss - mean over batch\"\n", - "assert np.any(next(network.parameters()).grad.detach().numpy() !=\n", - " 0), \"loss must be differentiable w.r.t. network weights\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Playing the game" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "opt = torch.optim.Adam(network.parameters(), lr=1e-4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def generate_session(env, t_max=1000, epsilon=0, train=False):\n", - " \"\"\"play env with approximate q-learning agent and train it at the same time\"\"\"\n", - " total_reward = 0\n", - " s = env.reset()\n", - "\n", - " for t in range(t_max):\n", - " a = get_action(s, epsilon=epsilon)\n", - " next_s, r, done, _ = env.step(a)\n", - "\n", - " if train:\n", - " opt.zero_grad()\n", - " compute_td_loss([s], [a], [r], [next_s], [done]).backward()\n", - " opt.step()\n", - "\n", - " total_reward += r\n", - " s = next_s\n", - " if done:\n", - " break\n", - "\n", - " return total_reward" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "epsilon = 0.5" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for i in range(1000):\n", - " session_rewards = [generate_session(env, epsilon=epsilon, train=True) for _ in range(100)]\n", - " print(\"epoch #{}\\tmean reward = {:.3f}\\tepsilon = {:.3f}\".format(i, 
np.mean(session_rewards), epsilon))\n", - "\n", - " epsilon *= 0.99\n", - " assert epsilon >= 1e-4, \"Make sure epsilon is always nonzero during training\"\n", - "\n", - " if np.mean(session_rewards) > 300:\n", - " print(\"You Win!\")\n", - " break" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### How to interpret results\n", - "\n", - "\n", - "Welcome to the f.. world of deep f...n reinforcement learning. Don't expect agent's reward to smoothly go up. Hope for it to go increase eventually. If it deems you worthy.\n", - "\n", - "Seriously though,\n", - "* __ mean reward__ is the average reward per game. For a correct implementation it may stay low for some 10 epochs, then start growing while oscilating insanely and converges by ~50-100 steps depending on the network architecture. \n", - "* If it never reaches target score by the end of for loop, try increasing the number of hidden neurons or look at the epsilon.\n", - "* __ epsilon__ - agent's willingness to explore. If you see that agent's already at < 0.01 epsilon before it's is at least 200, just reset it back to 0.1 - 0.5." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Record videos\n", - "\n", - "As usual, we now use `gym.wrappers.Monitor` to record a video of our agent playing the game. Unlike our previous attempts with state binarization, this time we expect our agent to act ~~(or fail)~~ more smoothly since there's no more binarization error at play.\n", - "\n", - "As you already did with tabular q-learning, we set epsilon=0 for final evaluation to prevent agent from exploring himself to death." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Record sessions\n", - "\n", - "import gym.wrappers\n", - "\n", - "with gym.wrappers.Monitor(gym.make(\"CartPole-v0\"), directory=\"videos\", force=True) as env_monitor:\n", - " sessions = [generate_session(env_monitor, epsilon=0, train=False) for _ in range(100)]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Show video. This may not work in some setups. If it doesn't\n", - "# work for you, you can download the videos and view them locally.\n", - "\n", - "from pathlib import Path\n", - "from IPython.display import HTML\n", - "\n", - "video_names = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", - "\n", - "HTML(\"\"\"\n", - "\n", - "\"\"\".format(video_names[-1])) # You can also try other indices" - ] - } - ], - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 1 + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/week04_approx_rl/seminar_tf.ipynb b/week04_approx_rl/seminar_tf.ipynb index 70c51945a..4f69a60d4 100644 --- a/week04_approx_rl/seminar_tf.ipynb +++ b/week04_approx_rl/seminar_tf.ipynb @@ -27,7 +27,7 @@ " %tensorflow_version 1.x\n", " \n", " if not os.path.exists('.setup_complete'):\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", "\n", " !touch .setup_complete\n", "\n", @@ -99,6 +99,18 @@ "keras.backend.set_session(sess)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert not tf.test.is_gpu_available(), \\\n", + " \"Please complete this assignment without a GPU. 
If you use a GPU, the code \" \\\n", + " \"will run a lot slower due to a lot of copying to and from GPU memory. \" \\\n", + " \"To disable the GPU in Colab, go to Runtime → Change runtime type → None.\"" + ] + }, { "cell_type": "code", "execution_count": null, @@ -123,7 +135,7 @@ " recap: with p = epsilon pick random action, else pick action with highest Q(s,a)\n", " \"\"\"\n", " \n", - " q_values = network(state[None])[0]\n", + " q_values = network.predict(state[None])[0]\n", " \n", " \n", "\n", @@ -368,15 +380,25 @@ "# work for you, you can download the videos and view them locally.\n", "\n", "from pathlib import Path\n", + "from base64 import b64encode\n", "from IPython.display import HTML\n", "\n", - "video_names = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", + "video_paths = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", + "video_path = video_paths[-1] # You can also try other indices\n", + "\n", + "if 'google.colab' in sys.modules:\n", + " # https://stackoverflow.com/a/57378660/1214547\n", + " with video_path.open('rb') as fp:\n", + " mp4 = fp.read()\n", + " data_url = 'data:video/mp4;base64,' + b64encode(mp4).decode()\n", + "else:\n", + " data_url = str(video_path)\n", "\n", "HTML(\"\"\"\n", "\n", - "\"\"\".format(video_names[-1])) # You can also try other indices" + "\"\"\".format(data_url))" ] } ], diff --git a/week04_approx_rl/test_td_loss/compute_td_loss.py b/week04_approx_rl/test_td_loss/compute_td_loss.py new file mode 100644 index 000000000..6c418ba18 --- /dev/null +++ b/week04_approx_rl/test_td_loss/compute_td_loss.py @@ -0,0 +1,214 @@ +import torch +import torch.nn as nn +from typing import Protocol + + +class ComputeTdLossProtocol(Protocol): + """ + An Protocol which the compute_td_loss function should match. 
+ """ + + def __call__( + self, + states: torch.Tensor, + actions: torch.Tensor, + rewards: torch.Tensor, + next_states: torch.Tensor, + is_done: torch.Tensor, + agent: nn.Module, + target_network: nn.Module, + gamma: float, + ): + pass + + +class MockAgent(nn.Module): + """ + An nn.Module, which outputs a value which does not depend on its input. + Designed to be used for testing the compute_td_loss function. + """ + + def __init__(self, output_q_values: torch.Tensor): + super().__init__() + assert output_q_values.dtype == torch.float, output_q_values.dtype + assert output_q_values.ndim == 2, output_q_values.shape + self.output_q_values = nn.Parameter(output_q_values) + + def forward(self, state): + return torch.clone(self.output_q_values) + + +@torch.no_grad() +def test_is_done_is_used(compute_td_loss: ComputeTdLossProtocol): + """ + Tries to catch the error when compute_td_loss ignores its is_done argument. + """ + + states = torch.empty(1) + actions = torch.tensor([0]) + rewards = torch.tensor([1], dtype=torch.float) + is_done_first = torch.tensor([True]) + is_done_second = torch.tensor([False]) + next_states = torch.empty(1) + gamma = 0.99 + + q_values_agent = torch.tensor([[1, 1, 1]], dtype=torch.float) + q_values_target_network = torch.tensor([[1, 1, 1]], dtype=torch.float) + agent = MockAgent(q_values_agent) + target_network = MockAgent(q_values_target_network) + + loss_kwargs = dict( + states=states, + actions=actions, + rewards=rewards, + next_states=next_states, + agent=agent, + target_network=target_network, + gamma=gamma, + ) + + loss_first = compute_td_loss(is_done=is_done_first, **loss_kwargs).item() + loss_second = compute_td_loss(is_done=is_done_second, **loss_kwargs).item() + + abs_diff = abs(loss_first - loss_second) + # msg is defined unconditionally: it is needed exactly when the check below fails + msg = "compute_td_loss returned close values for different is_done inputs" + + assert abs_diff > 0.5, msg + + +@torch.no_grad() +def test_compute_td_loss_vanilla(compute_td_loss: 
ComputeTdLossProtocol): + """ + Checks compute_td_loss on manually precomputed examples. + Note: this is a test for vanilla compute_td_loss + and it should NOT be used for double_dqn + """ + + samples = [ + { + "q_agent": [0, 1, 2], + "action": 1, + "is_done": False, + "q_target": [0, 1, 2], + "gamma": 0.5, + "reward": 5, + "answer": 25, + }, + { + "q_agent": [0, 1, 2], + "action": 1, + "is_done": False, + "q_target": [2, 0, 1], + "gamma": 0.5, + "reward": 5, + "answer": 25, + }, + { + "q_agent": [3, 1, 2], + "action": 1, + "is_done": True, + "q_target": [0, 1, 2], + "gamma": 0.5, + "reward": 5, + "answer": 16, + }, + { + "q_agent": [0, 1, 2], + "action": 0, + "is_done": False, + "q_target": [0, 1, 2], + "gamma": 0.5, + "reward": 5, + "answer": 36, + }, + ] + + for sample in samples: + agent = MockAgent(torch.tensor(sample["q_agent"], dtype=torch.float)[None]) + tn = MockAgent(torch.tensor(sample["q_target"], dtype=torch.float)[None]) + ans = compute_td_loss( + states=torch.empty(1), + actions=torch.tensor(sample["action"])[None], + rewards=torch.tensor(sample["reward"])[None], + next_states=torch.empty(1), + is_done=torch.tensor(sample["is_done"])[None], + agent=agent, + target_network=tn, + gamma=sample["gamma"], + ).item() + abs_diff = abs(ans - sample["answer"]) + assert abs_diff < 1e-8, abs_diff + + +@torch.no_grad() +def test_compute_td_loss_double(compute_td_loss: ComputeTdLossProtocol): + """ + Checks compute_td_loss on manually precomputed examples. 
+ Note: this is a test for the double_dqn variant of compute_td_loss + and it should NOT be used for vanilla compute_td_loss + """ + + samples = [ + { + "q_agent": [0, 1, 2], + "action": 1, + "is_done": False, + "q_target": [0, 1, 2], + "gamma": 0.5, + "reward": 5, + "answer": 25, + }, + { + "q_agent": [0, 1, 2], + "action": 1, + "is_done": False, + "q_target": [2, 0, 1], + "gamma": 0.5, + "reward": 5, + "answer": 20.25, + }, + { + "q_agent": [3, 1, 2], + "action": 1, + "is_done": False, + "q_target": [-1, 1, 2], + "gamma": 0.5, + "reward": 5, + "answer": 12.25, + }, + { + "q_agent": [3, 1, 2], + "action": 1, + "is_done": True, + "q_target": [-1, 1, 2], + "gamma": 0.5, + "reward": 5, + "answer": 16, + }, + { + "q_agent": [0, 1, 2], + "action": 0, + "is_done": False, + "q_target": [0, 1, 2], + "gamma": 0.5, + "reward": 5, + "answer": 36, + }, + ] + + for sample in samples: + agent = MockAgent(torch.tensor(sample["q_agent"], dtype=torch.float)[None]) + tn = MockAgent(torch.tensor(sample["q_target"], dtype=torch.float)[None]) + ans = compute_td_loss( + states=torch.empty(1), + actions=torch.tensor(sample["action"])[None], + rewards=torch.tensor(sample["reward"])[None], + next_states=torch.empty(1), + is_done=torch.tensor(sample["is_done"])[None], + agent=agent, + target_network=tn, + gamma=sample["gamma"], + ).item() + abs_diff = abs(ans - sample["answer"]) + assert abs_diff < 1e-8, abs_diff diff --git a/week04_approx_rl/utils.py b/week04_approx_rl/utils.py deleted file mode 100644 index 0229b3007..000000000 --- a/week04_approx_rl/utils.py +++ /dev/null @@ -1,90 +0,0 @@ -import numpy as np -import psutil -from scipy.signal import convolve, gaussian -import torch -from torch import nn -import os - - -def get_cum_discounted_rewards(rewards, gamma): - """ - evaluates cumulative discounted rewards: - r_t + gamma * r_{t+1} + gamma^2 * r_{t_2} + ... 
- """ - cum_rewards = [] - cum_rewards.append(rewards[-1]) - for r in reversed(rewards[:-1]): - cum_rewards.insert(0, r + gamma * cum_rewards[0]) - return cum_rewards - - -def play_and_log_episode(env, agent, gamma=0.99, t_max=10000): - """ - always greedy - """ - states = [] - v_mc = [] - v_agent = [] - q_spreads = [] - td_errors = [] - rewards = [] - - s = env.reset() - for step in range(t_max): - states.append(s) - qvalues = agent.get_qvalues([s]) - max_q_value, min_q_value = np.max(qvalues), np.min(qvalues) - v_agent.append(max_q_value) - q_spreads.append(max_q_value - min_q_value) - if step > 0: - td_errors.append( - np.abs(rewards[-1] + gamma * v_agent[-1] - v_agent[-2])) - - action = qvalues.argmax(axis=-1)[0] - - s, r, done, _ = env.step(action) - rewards.append(r) - if done: - break - td_errors.append(np.abs(rewards[-1] + gamma * v_agent[-1] - v_agent[-2])) - - v_mc = get_cum_discounted_rewards(rewards, gamma) - - return_pack = { - 'states': np.array(states), - 'v_mc': np.array(v_mc), - 'v_agent': np.array(v_agent), - 'q_spreads': np.array(q_spreads), - 'td_errors': np.array(td_errors), - 'rewards': np.array(rewards), - 'episode_finished': np.array(done) - } - - return return_pack - - -def img_by_obs(obs, state_dim): - """ - Unwraps obs by channels. 
- observation is of shape [c, h=w, w=h] - """ - return obs.reshape([-1, state_dim[2]]) - - -def is_enough_ram(min_available_gb=0.1): - mem = psutil.virtual_memory() - return mem.available >= min_available_gb * (1024 ** 3) - - -def linear_decay(init_val, final_val, cur_step, total_steps): - if cur_step >= total_steps: - return final_val - return (init_val * (total_steps - cur_step) + - final_val * cur_step) / total_steps - - -def smoothen(values): - kernel = gaussian(100, std=100) - # kernel = np.concatenate([np.arange(100), np.arange(99, -1, -1)]) - kernel = kernel / np.sum(kernel) - return convolve(values, kernel, 'valid') diff --git a/week05_explore/README.md b/week05_explore/README.md index bb31b3f9d..5cbb50c25 100644 --- a/week05_explore/README.md +++ b/week05_explore/README.md @@ -1,4 +1,4 @@ -[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring20/week05_explore/week5.ipynb) +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/master/week05_explore/week5.ipynb) ### Slides - [here](https://yadi.sk/i/H0zVBROe3TWWHz) @@ -6,14 +6,14 @@ * [__main__] David Silver lecture on exploration and expoitation - [video](https://www.youtube.com/watch?v=sGuiWX07sKw) * Alternative lecture by J. Schulman - [video](https://www.youtube.com/watch?v=SfCa1HQMkuw) * Alternative lecture by N. 
de Freitas (with bayesian opt) - [video](https://www.youtube.com/watch?v=vz3D36VXefI) -* Our lectures (russian) +* Our lectures (russian) - "mathematical" lecture (by Alexander Vorobev) '17 - [slides](https://yadi.sk/i/JAeItALT3JmvCL), [video](https://yadi.sk/i/bVHmu9gt3Hi9Ym) - "practical" lecture '18 - [video](https://yadi.sk/i/_myWJ13O3TdzXo) - Seminar - [video](https://yadi.sk/i/du7FLXs13TdzZS) - - - -## More materials + + + +## More materials * Gittins Index - the less heuristical approach to bandit exploration - [article](http://www.ece.mcgill.ca/~amahaj1/projects/bandits/book/2013-bandit-computations.pdf) * "Deep" version: variational information maximizing exploration - [video](https://www.youtube.com/watch?v=sRIjxxjVrnY) * Same topics in russian - [video](https://yadi.sk/i/_2_0yqeW3HDbcn) @@ -22,17 +22,9 @@ * Same topics in russian - [video](https://www.youtube.com/watch?v=WCE9hhPbCmc) * Note: UCB-1 is not for bernoulli rewards, but for arbitrary r in [0,1], so you can just scale any reward to [0,1] to obtain a peace of mind. It's derived directly from Hoeffding's inequality. +* Very interesting blog post written by Lilian Weng that summarises this week's materials: [The Multi-Armed Bandit Problem and Its Solutions](https://lilianweng.github.io/posts/2018-01-23-multi-armed-bandit/) + ## Seminar -In this seminar, you'll be solvilg basic and contextual bandits with uncertainty-based exploration like Bayesian UCB and Thompson Sampling. - -You will also need Bayesian Neural Networks. You will need theano/lasagne for this one: -``` -# either -conda install Theano -# or -pip install --upgrade https://github.com/Theano/Theano/archive/master.zip -# and then lasagne -pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip -``` +In this seminar, you'll be solving basic and contextual bandits with uncertainty-based exploration like Bayesian UCB and Thompson Sampling. You will also get acquainted with Bayesian Neural Networks. 
Everything else is in the notebook :) diff --git a/week05_explore/action_rewards.npy b/week05_explore/action_rewards.npy deleted file mode 100644 index 231bcb18b..000000000 Binary files a/week05_explore/action_rewards.npy and /dev/null differ diff --git a/week05_explore/all_states.npy b/week05_explore/all_states.npy deleted file mode 100644 index 43940d9ba..000000000 Binary files a/week05_explore/all_states.npy and /dev/null differ diff --git a/week05_explore/bayes.py b/week05_explore/bayes.py deleted file mode 100644 index ffb9b9adc..000000000 --- a/week05_explore/bayes.py +++ /dev/null @@ -1,153 +0,0 @@ -""" -A single-file module that makes your lasagne network into a bayesian neural net. -Originally created by github.com/ferrine , rewritten by github.com/justheuristic for simplicity - -See example in the notebook -""" - -import numpy as np - -from theano import tensor as T -from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams - -import lasagne -from lasagne import init -from lasagne.random import get_rng - -from functools import wraps - -__all__ = ['NormalApproximation', 'get_var_cost', 'bbpwrap'] - - -class NormalApproximation(object): - def __init__(self, mu=0, std=np.exp(-3), seed=None): - """ - Approximation that samples network weights from factorized normal distribution. 
- - :param mu: prior mean for gaussian weights - :param std: prior std for gaussian weights - :param seed: random seed - """ - self.prior_mu = mu - self.prior_std = std - self.srng = RandomStreams(seed or get_rng().randint(1, 2147462579)) - - def log_normal(self, x, mean, std, eps=0.0): - """computes log-proba of normal distribution""" - std += eps - return - 0.5 * np.log(2 * np.pi) - T.log(T.abs_(std)) - \ - (x - mean) ** 2 / (2 * std ** 2) - - def log_prior(self, weights): - """ - Logarithm of prior probabilities for weights: - log P(weights) aka log P(theta) - """ - return self.log_normal(weights, self.prior_mu, self.prior_std) - - def log_posterior_approx(self, weights, mean, rho): - """ - Logarithm of ELBO on posterior probabilities: - log q(weights|learned mu and rho) aka log q(theta|x) - """ - std = T.log1p(T.exp(rho)) # rho to std - return self.log_normal(weights, mean, std) - - def __call__(self, layer, spec, shape, name=None, **tags): - # case when user uses default init specs - assert tags.get( - 'variational', False), "Please declare param as variational to avoid confusion" - - if not isinstance(spec, dict): - initial_rho = np.log(np.expm1(self.prior_std)) # std to rho - assert np.isfinite(initial_rho), "too small std to initialize correctly. Please pass explicit"\ - " initializer (dict with {'mu':mu_init, 'rho':rho_init})." 
- spec = {'mu': spec, 'rho': init.Constant(initial_rho)} - - mu_spec, rho_spec = spec['mu'], spec['rho'] - - rho = layer.add_param( - rho_spec, shape, name=( - name or 'unk') + '.rho', **tags) - mean = layer.add_param( - mu_spec, shape, name=( - name or 'unk') + '.mu', **tags) - - # Reparameterization trick - e = self.srng.normal(shape, std=1) - W = mean + T.log1p(T.exp(rho)) * e - - # KL divergence KL(q,p) = E_(w~q(w|x)) [log q(w|x) - log P(w)] aka - # variational cost - q_p = T.sum( - self.log_posterior_approx(W, mean, rho) - - self.log_prior(W) - ) - - # accumulate variational cost - layer._bbwrap_var_cost += q_p - return W - - -def get_var_cost(layer_or_layers, treat_as_input=None): - """ - Returns total variational cost aka KL(q(theta|x)||p(theta)) for all layers in the network - - :param layer_or_layers: top layer(s) of your network, just like with lasagne.layers.get_output - :param treat_as_input: don't accumulate over layers below these layers. See same param for lasagne.layers.get_all_layers - - Alternatively, one can manually get weights for one layer via layer.get_var_cost() - """ - cost = 0 - for layer in lasagne.layers.get_all_layers( - layer_or_layers, treat_as_input): - if hasattr(layer, 'get_var_cost'): - # if layer is bayesian or pretends so - cost += layer.get_var_cost() - return cost - - -def bbpwrap(approximation=NormalApproximation()): - """ - A decorator that makes arbitrary lasagne layer into a bayesian network layer: - BayesDenseLayer = bbwrap()(DenseLayer) - or more verbosely, - @bbpwrap(NormalApproximation(pstd=0.01)) - BayesDenseLayer(DenseLayer): - pass - - """ - - def decorator(cls): - def add_param_wrap(add_param): - @wraps(add_param) - def wrapped(self, spec, shape, name=None, **tags): - # we should take care about some user specification - # to avoid bbp hook just set tags['variational'] = True - if not tags.get('trainable', True) or \ - tags.get('variational', False): - return add_param(self, spec, shape, name, **tags) - else: - # 
we declare that params we add next - # are the ones we need to fit the distribution - # they don't need to be regularized, strictly - tags['variational'] = True - tags['regularizable'] = False - param = self.approximation(self, spec, shape, name, **tags) - return param - return wrapped - - def get_var_cost(self): - """ - Returns total variational cost aka KL(q(theta|x)||p(theta)) for this layer. - Alternatively, use function get_var_cost(layer) to get total cost for all layers below this one. - """ - return self._bbwrap_var_cost - - cls.approximation = approximation - cls._bbwrap_var_cost = 0 - cls.add_param = add_param_wrap(cls.add_param) - cls.get_var_cost = get_var_cost - return cls - - return decorator diff --git a/week05_explore/bnn.png b/week05_explore/bnn.png deleted file mode 100644 index 6ff8059fb..000000000 Binary files a/week05_explore/bnn.png and /dev/null differ diff --git a/week05_explore/deep_see.png b/week05_explore/deep_see.png new file mode 100644 index 000000000..a1601b725 Binary files /dev/null and b/week05_explore/deep_see.png differ diff --git a/week05_explore/q_learning_agent.py b/week05_explore/q_learning_agent.py new file mode 100644 index 000000000..f7f52fca7 --- /dev/null +++ b/week05_explore/q_learning_agent.py @@ -0,0 +1,112 @@ +from collections import defaultdict +import random +import math +import numpy as np + + +class QLearningAgent: + def __init__(self, alpha, epsilon, discount, get_legal_actions): + """ + Q-Learning Agent + based on https://inst.eecs.berkeley.edu/~cs188/sp19/projects.html + Instance variables you have access to + - self.epsilon (exploration prob) + - self.alpha (learning rate) + - self.discount (discount rate aka gamma) + + Functions you should use + - self.get_legal_actions(state) {state, hashable -> list of actions, each is hashable} + which returns legal actions for a state + - self.get_qvalue(state,action) + which returns Q(state,action) + - self.set_qvalue(state,action,value) + which sets Q(state,action) := 
value + !!!Important!!! + Note: please avoid using self._qValues directly. + There's a special self.get_qvalue/set_qvalue for that. + """ + + self.get_legal_actions = get_legal_actions + self._qvalues = defaultdict(lambda: defaultdict(lambda: 0)) + self.alpha = alpha + self.epsilon = epsilon + self.discount = discount + + def get_qvalue(self, state, action): + """ Returns Q(state,action) """ + return self._qvalues[state][action] + + def set_qvalue(self, state, action, value): + """ Sets the Qvalue for [state,action] to the given value """ + self._qvalues[state][action] = value + + def get_value(self, state): + """ + Compute your agent's estimate of V(s) using current q-values + V(s) = max_over_action Q(state,action) over possible actions. + Note: please take into account that q-values can be negative. + """ + possible_actions = self.get_legal_actions(state) + + # If there are no legal actions, return 0.0 + if len(possible_actions) == 0: + return 0.0 + + value = max([self.get_qvalue(state, a) for a in possible_actions]) + return value + + def update(self, state, action, reward, next_state, done): + """ + You should do your Q-Value update here: + Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s')) + """ + + # agent parameters + gamma = self.discount + learning_rate = self.alpha + + q = reward + gamma * (1 - done) * self.get_value(next_state) + q = (1 - learning_rate) * self.get_qvalue(state, action) + learning_rate * q + + self.set_qvalue(state, action, q) + + def get_best_action(self, state): + """ + Compute the best action to take in a state (using current q-values). + """ + possible_actions = self.get_legal_actions(state) + + # If there are no legal actions, return None + if len(possible_actions) == 0: + return None + + idx = np.argmax([self.get_qvalue(state, a) for a in possible_actions]) + + return possible_actions[idx] + + def get_action(self, state): + """ + Compute the action to take in the current state, including exploration. 
+ With probability self.epsilon, we should take a random action. + otherwise - the best policy action (self.get_best_action). + + Note: To pick randomly from a list, use random.choice(list). + To pick True or False with a given probablity, generate uniform number in [0, 1] + and compare it with your probability + """ + + # Pick Action + possible_actions = self.get_legal_actions(state) + action = None + + # If there are no legal actions, return None + if len(possible_actions) == 0: + return None + + # agent parameters: + epsilon = self.epsilon + + if np.random.rand() < epsilon: + return np.random.choice(possible_actions) + + return self.get_best_action(state) \ No newline at end of file diff --git a/week04_approx_rl/replay_buffer.py b/week05_explore/replay_buffer.py similarity index 99% rename from week04_approx_rl/replay_buffer.py rename to week05_explore/replay_buffer.py index 915a107b9..9136dd078 100644 --- a/week04_approx_rl/replay_buffer.py +++ b/week05_explore/replay_buffer.py @@ -72,3 +72,4 @@ def sample(self, batch_size): for _ in range(batch_size) ] return self._encode_sample(idxes) + diff --git a/week05_explore/river_swim.png b/week05_explore/river_swim.png deleted file mode 100644 index 233244c6c..000000000 Binary files a/week05_explore/river_swim.png and /dev/null differ diff --git a/week05_explore/und1.mp4 b/week05_explore/und1.mp4 new file mode 100644 index 000000000..d67190f54 Binary files /dev/null and b/week05_explore/und1.mp4 differ diff --git a/week05_explore/und2.mp4 b/week05_explore/und2.mp4 new file mode 100644 index 000000000..1e41469fe Binary files /dev/null and b/week05_explore/und2.mp4 differ diff --git a/week05_explore/week5.ipynb b/week05_explore/week5.ipynb index 8d9a270bc..b4c5f3538 100644 --- a/week05_explore/week5.ipynb +++ b/week05_explore/week5.ipynb @@ -2,21 +2,62 @@ "cells": [ { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 98, + "metadata": { + "scrolled": true + }, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Defaulting to user installation because normal site-packages is not writeable\n", + "Looking in indexes: https://pypi.yandex-team.ru/simple/\n", + "Requirement already satisfied: bsuite in /home/npytincev/.local/lib/python3.8/site-packages (0.3.5)\n", + "Requirement already satisfied: plotnine in /home/npytincev/.local/lib/python3.8/site-packages (from bsuite) (0.8.0)\n", + "Requirement already satisfied: matplotlib in /home/npytincev/.local/lib/python3.8/site-packages (from bsuite) (3.5.1)\n", + "Requirement already satisfied: pandas in /home/npytincev/.local/lib/python3.8/site-packages (from bsuite) (1.4.2)\n", + "Requirement already satisfied: termcolor in /home/npytincev/.local/lib/python3.8/site-packages (from bsuite) (1.1.0)\n", + "Requirement already satisfied: absl-py in /home/npytincev/.local/lib/python3.8/site-packages (from bsuite) (0.12.0)\n", + "Requirement already satisfied: scikit-image in /home/npytincev/.local/lib/python3.8/site-packages (from bsuite) (0.19.2)\n", + "Requirement already satisfied: numpy in /home/npytincev/.local/lib/python3.8/site-packages (from bsuite) (1.22.3)\n", + "Requirement already satisfied: six in /home/npytincev/.local/lib/python3.8/site-packages (from bsuite) (1.15.0)\n", + "Requirement already satisfied: immutabledict in /home/npytincev/.local/lib/python3.8/site-packages (from bsuite) (2.2.1)\n", + "Requirement already satisfied: dm-env in /home/npytincev/.local/lib/python3.8/site-packages (from bsuite) (1.5)\n", + "Requirement already satisfied: scipy in /home/npytincev/.local/lib/python3.8/site-packages (from bsuite) (1.6.0)\n", + "Requirement already satisfied: dm-tree in /home/npytincev/.local/lib/python3.8/site-packages (from dm-env->bsuite) (0.1.6)\n", + "Requirement already satisfied: packaging>=20.0 in /home/npytincev/.local/lib/python3.8/site-packages (from matplotlib->bsuite) (20.9)\n", + "Requirement already satisfied: 
python-dateutil>=2.7 in /home/npytincev/.local/lib/python3.8/site-packages (from matplotlib->bsuite) (2.8.2)\n", + "Requirement already satisfied: pyparsing>=2.2.1 in /home/npytincev/.local/lib/python3.8/site-packages (from matplotlib->bsuite) (2.4.7)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /home/npytincev/.local/lib/python3.8/site-packages (from matplotlib->bsuite) (4.31.2)\n", + "Requirement already satisfied: pillow>=6.2.0 in /home/npytincev/.local/lib/python3.8/site-packages (from matplotlib->bsuite) (8.4.0)\n", + "Requirement already satisfied: cycler>=0.10 in /home/npytincev/.local/lib/python3.8/site-packages (from matplotlib->bsuite) (0.10.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /home/npytincev/.local/lib/python3.8/site-packages (from matplotlib->bsuite) (1.3.1)\n", + "Requirement already satisfied: pytz>=2020.1 in /home/npytincev/.local/lib/python3.8/site-packages (from pandas->bsuite) (2022.1)\n", + "Requirement already satisfied: descartes>=1.1.0 in /home/npytincev/.local/lib/python3.8/site-packages (from plotnine->bsuite) (1.1.0)\n", + "Requirement already satisfied: patsy>=0.5.1 in /home/npytincev/.local/lib/python3.8/site-packages (from plotnine->bsuite) (0.5.1)\n", + "Requirement already satisfied: mizani>=0.7.3 in /home/npytincev/.local/lib/python3.8/site-packages (from plotnine->bsuite) (0.7.4)\n", + "Requirement already satisfied: statsmodels>=0.12.1 in /home/npytincev/.local/lib/python3.8/site-packages (from plotnine->bsuite) (0.12.2)\n", + "Requirement already satisfied: tifffile>=2019.7.26 in /home/npytincev/.local/lib/python3.8/site-packages (from scikit-image->bsuite) (2022.3.25)\n", + "Requirement already satisfied: PyWavelets>=1.1.1 in /home/npytincev/.local/lib/python3.8/site-packages (from scikit-image->bsuite) (1.3.0)\n", + "Requirement already satisfied: networkx>=2.2 in /home/npytincev/.local/lib/python3.8/site-packages (from scikit-image->bsuite) (2.5.1)\n", + "Requirement already satisfied: 
imageio>=2.4.1 in /home/npytincev/.local/lib/python3.8/site-packages (from scikit-image->bsuite) (2.16.1)\n", + "Requirement already satisfied: palettable in /home/npytincev/.local/lib/python3.8/site-packages (from mizani>=0.7.3->plotnine->bsuite) (3.3.0)\n", + "Requirement already satisfied: decorator<5,>=4.3 in /usr/lib/python3/dist-packages (from networkx>=2.2->scikit-image->bsuite) (4.4.2)\n" + ] + } + ], "source": [ "import sys, os\n", "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n", - "\n", - " !pip install --upgrade https://github.com/Theano/Theano/archive/master.zip\n", - " !pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip\n", - "\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/week05_explore/bayes.py\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/week05_explore/action_rewards.npy\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/week05_explore/all_states.npy\n", - "\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week05_explore/q_learning_agent.py\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week05_explore/replay_buffer.py\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week05_explore/und1.mp4\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week05_explore/und2.mp4\n", + "\n", + " !pip install -q gymnasium\n", + " !pip install -q shimmy[bsuite]\n", " !touch .setup_complete\n", "\n", "# This code creates a virtual display to draw game images on.\n", @@ -28,7 +69,7 @@ }, { "cell_type": "code", 
- "execution_count": null, + "execution_count": 99, "metadata": {}, "outputs": [], "source": [ @@ -39,12 +80,75 @@ "np.set_printoptions(precision=3)\n", "np.set_printoptions(suppress=True)\n", "\n", - "import pandas\n", + "import pandas as pd\n", "\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline" ] }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 121, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from IPython.display import HTML\n", + "\n", + "HTML(\"\"\"\n", + " \n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 122, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "HTML(\"\"\"\n", + " \n", + "\"\"\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -52,14 +156,12 @@ "## Contents\n", "* [1. Bernoulli Bandit](#Part-1.-Bernoulli-Bandit)\n", " * [Bonus 1.1. Gittins index (5 points)](#Bonus-1.1.-Gittins-index-%285-points%29.)\n", - " * [HW 1.1. Nonstationary Bernoulli bandit](#HW-1.1.-Nonstationary-Bernoulli-bandit)\n", - "* [2. Contextual bandit](#Part-2.-Contextual-bandit)\n", - " * [2.1 Bulding a BNN agent](#2.1-Bulding-a-BNN-agent)\n", - " * [2.2 Training the agent](#2.2-Training-the-agent)\n", - " * [HW 2.1 Better exploration](#HW-2.1-Better-exploration)\n", - "* [3. Exploration in MDP](#Part-3.-Exploration-in-MDP)\n", - " * [Bonus 3.1 Posterior sampling RL (3 points)](#Bonus-3.1-Posterior-sampling-RL-%283-points%29)\n", - " * [Bonus 3.2 Bootstrapped DQN (10 points)](#Bonus-3.2-Bootstrapped-DQN-%2810-points%29)\n" + " * [HW 1.1. Nonstationary Bernoulli bandit (2 points)](#HW-1.1.-Nonstationary-Bernoulli-bandit)\n", + "* [2. 
Exploration in MDP](#Part-2.-Exploration-in-MDP)\n", + " * [2.1 Epsilon-greedy q-learning](#2.1-Epsilon-greedy-q-learning)\n", + " * [2.2 Reward shaping](#2.2-Reward-shaping)\n", + " * [2.3 Curiosity-driven Exploration](#2.3-Curiosity-driven-Exploration)\n", + " * [HW 2.1 Random network distillation (3 points)](#HW-2.1:-Random-network-distillation)\n" ] }, { @@ -72,9 +174,9 @@ "\n", "The bandit has $K$ actions. Action produce 1.0 reward $r$ with probability $0 \\le \\theta_k \\le 1$ which is unknown to agent, but fixed over time. Agent's objective is to minimize regret over fixed number $T$ of action selections:\n", "\n", - "$$\\rho = T\\theta^* - \\sum_{t=1}^T r_t$$\n", + "$$\\rho = T\\theta^* - \\sum_{t=1}^T \\theta_{a_t}$$\n", "\n", - "Where $\\theta^* = \\max_k\\{\\theta_k\\}$\n", + "Where $\\theta^* = \\max_k\\{\\theta_k\\}$ and $\\theta_{a_t}$ corresponds to the chosen action $a_t$ on each step.\n", "\n", "**Real-world analogy:**\n", "\n", @@ -85,7 +187,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 100, "metadata": {}, "outputs": [], "source": [ @@ -107,6 +209,11 @@ " \"\"\"\n", " return np.max(self._probs)\n", "\n", + " def action_value(self, action):\n", + " \"\"\" Used for regret calculation\n", + " \"\"\"\n", + " return self._probs[action]\n", + "\n", " def step(self):\n", " \"\"\" Used in nonstationary version\n", " \"\"\"\n", @@ -119,7 +226,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 101, "metadata": {}, "outputs": [], "source": [ @@ -186,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ @@ -195,7 +302,10 @@ " self._epsilon = epsilon\n", "\n", " def get_action(self):\n", - " \n", + " if np.random.random() < self._epsilon:\n", + " return np.random.randint(len(self._successes))\n", + " else:\n", + " return np.argmax(self._successes / (self._successes + self._failures + 0.1))\n", "\n", " @property\n", " def 
name(self):\n", @@ -207,13 +317,13 @@ "metadata": {}, "source": [ "### UCB Agent\n", - "Epsilon-greedy strategy heve no preference for actions. It would be better to select among actions that are uncertain or have potential to be optimal. One can come up with idea of index for each action that represents otimality and uncertainty at the same time. One efficient way to do it is to use UCB1 algorithm:\n", + "Epsilon-greedy strategy has no preference for actions. It would be better to select among actions that are uncertain or have potential to be optimal. One can come up with the idea of an index for each action that represents optimality and uncertainty at the same time. One efficient way to do it is to use the UCB1 algorithm:\n", "\n", "**for** $t = 1,2,...$ **do**\n", "\n", "    **for** $k = 1,...,K$ **do**\n", "\n", - "       $w_k \\leftarrow \\alpha_k / (\\alpha_k + \\beta_k) + \\sqrt{2log\\ t \\ / \\ (\\alpha_k + \\beta_k)}$\n", + "       $w_k \\leftarrow \\alpha_k / (\\alpha_k + \\beta_k) + \\sqrt{2\\log(t) \\ / \\ (\\alpha_k + \\beta_k)}$\n", "\n", "    **end for** \n", "\n", @@ -226,20 +336,21 @@ "\n", "**end for**\n", "\n", - "__Note:__ in practice, one can multiply $\\sqrt{2log\\ t \\ / \\ (\\alpha_k + \\beta_k)}$ by some tunable parameter to regulate agent's optimism and wilingness to abandon non-promising actions.\n", + "__Note:__ in practice, one can multiply $\\sqrt{2\\log(t) \\ / \\ (\\alpha_k + \\beta_k)}$ by some tunable parameter to regulate agent's optimism and willingness to abandon non-promising actions.\n", "\n", "More versions and optimality analysis - https://homes.di.unimi.it/~cesabian/Pubblicazioni/ml-02.pdf" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 116, "metadata": {}, "outputs": [], "source": [ "class UCBAgent(AbstractAgent):\n", " def get_action(self):\n", - " " + " pulls = self._successes + self._failures + 0.1\n", + " return np.argmax(self._successes / pulls + np.sqrt(2 * np.log(self._total_pulls + 0.1) / pulls))" 
] }, { @@ -273,18 +384,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 117, "metadata": {}, "outputs": [], "source": [ "class ThompsonSamplingAgent(AbstractAgent):\n", " def get_action(self):\n", - " \n" + " return np.argmax(np.random.beta(self._successes + 1, self._failures + 1))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 118, "metadata": {}, "outputs": [], "source": [ @@ -308,7 +419,7 @@ " action = agent.get_action()\n", " reward = env.pull(action)\n", " agent.update(action, reward)\n", - " scores[agent.name][i] += optimal_reward - reward\n", + " scores[agent.name][i] += optimal_reward - env.action_value(action)\n", "\n", " env.step() # change bandit's state if it is unstationary\n", "\n", @@ -331,26 +442,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 120, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":4: RuntimeWarning: invalid value encountered in sqrt\n", + " return np.argmax(self._successes / pulls + np.sqrt(2 * np.log(self._total_pulls + 0.1) / pulls))\n" + ] + }, { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ - "" + "
" ] }, - "metadata": {}, + "metadata": { + "needs_background": "light" + }, "output_type": "display_data" } ], "source": [ "# Uncomment agents\n", "agents = [\n", - " # EpsilonGreedyAgent(),\n", - " # UCBAgent(),\n", - " # ThompsonSamplingAgent()\n", + " EpsilonGreedyAgent(),\n", + " UCBAgent(),\n", + " ThompsonSamplingAgent()\n", "]\n", "\n", "regret = get_regret(BernoulliBandit(), agents, n_steps=10000, n_trials=10)\n", @@ -363,7 +484,7 @@ "source": [ "# Bonus 1.1. Gittins index (5 points).\n", "\n", - "Bernoulli bandit problem has an optimal solution - Gittins index algorithm. Implement finite horizon version of the algorithm and demonstrate it's performance with experiments. some articles:\n", + "The Bernoulli bandit problem has an optimal solution - the Gittins index algorithm. Implement a finite horizon version of the algorithm and demonstrate its performance with experiments. Some articles:\n", "- Wikipedia article - https://en.wikipedia.org/wiki/Gittins_index\n", "- Different algorithms for index computation - http://www.ece.mcgill.ca/~amahaj1/projects/bandits/book/2013-bandit-computations.pdf (see \"Bernoulli\" section)\n", " " @@ -380,7 +501,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 300, "metadata": {}, "outputs": [], "source": [ @@ -429,17 +550,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 301, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ - "" + "
" ] }, - "metadata": {}, + "metadata": { + "needs_background": "light" + }, "output_type": "display_data" } ], @@ -452,7 +575,7 @@ " drifting_probs.append(drifting_env._probs)\n", "\n", "plt.figure(figsize=(17, 8))\n", - "plt.plot(pandas.DataFrame(drifting_probs).rolling(window=20).mean())\n", + "plt.plot(pd.DataFrame(drifting_probs).rolling(window=20).mean())\n", "\n", "plt.xlabel(\"steps\")\n", "plt.ylabel(\"Success probability\")\n", @@ -470,7 +593,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 302, "metadata": {}, "outputs": [], "source": [ @@ -479,17 +602,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 303, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":5: RuntimeWarning: invalid value encountered in sqrt\n", + " np.sqrt(2 * np.log(self._total_pulls + 0.1) / (self._successes + self._failures + 0.1)))\n" + ] + }, { "data": { - "image/png": "\n", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEGCAYAAACUzrmNAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAABUYElEQVR4nO3dd1hUV/rA8e+hIwqIoiKoFEWRIgrYu7H32DVGU3RjiimbbJJN3bRNYnqyiTE/U0zs3dhLNPYCigUsdAERBQRBOnN+f9yBoBFBZRjK+TwPD8Plzr3vzDDzck95j5BSoiiKoih3YmLsABRFUZTqTyULRVEUpVwqWSiKoijlUslCURRFKZdKFoqiKEq5zIwdgCE0btxYurq6GjsMRVGUGiUkJCRFSul4u9/VymTh6upKcHCwscNQFEWpUYQQcWX9TjVDKYqiKOVSyUJRFEUpl0oWiqIoSrlqZZ/F7RQUFJCQkEBubq6xQ1GUGs3KygoXFxfMzc2NHYpShepMskhISKBBgwa4uroihDB2OIpSI0kpSU1NJSEhATc3N2OHo1ShOtMMlZubS6NGjVSiUJT7IISgUaNG6gq9DqozyQJQiUJRKoF6H9VNdSpZKIqi1FqFeRDyM5z93SCHV8miiqSmpuLv74+/vz/NmjXD2dkZf39/7O3tad++vbHDu2s6nY65c+fi4+ODr68vQUFBxMTEGPScrq6upKSkANC9e/f7Pt6YMWPo2rXrfR+nLD///DOXLl0y2PEVBYCCHDj0LXzVEX5/Fs6sMchp6kwHt7E1atSI0NBQAN5++23q16/Piy++SGxsLCNGjDBucPdg+fLlXLp0iVOnTmFiYkJCQgI2NjZVdv6DBw/e1/3T09MJCQmhfv36REdH4+7uXkmR/eXnn3/Gx8eH5s2bV/qxFYVrcXBkPoQuhtwMaNUDRn0NHv0NcjqDX1kIIUyFECeEEBv1P7sJIY4IISKFEMuFEBb67Zb6nyP1v3ctdYxX9dvPCyEGG
zrmqlZUVMSsWbPw9vZm0KBB5OTkABAaGkrXrl3x8/Nj7NixXLt2DYC+ffvy/PPPExgYiJeXF8eOHePBBx+kTZs2vP766wDExsbSrl07pk2bhpeXF+PHjyc7OxuAV155hfbt2+Pn58eLL75Ysn///v3x8/NjwIABXLx4EYCZM2cyd+5cunfvjru7O6tWrQIgKSkJJycnTEy0PyEXFxcaNmwIwJw5cwgMDMTb25u33nqr5HG6urry6quv4u/vT2BgIMePH2fw4MF4eHgwf/58APbs2UPv3r0ZPnw4bdu25YknnkCn0/3tOatfv37J/n379mX8+PElj7d49cfNmzfTrl07AgICmDt37k1Jec2aNYwcOZLJkyezbNmyku1RUVF07doVX19fXn/99ZLzAMybN4+goCD8/PxKHldsbCxeXl5/e/1WrVpFcHAw06ZNw9/fv+Q1VZT7lnUV1j0FX/nD0R+g9QPwyFZ4ZDO0HgCG6lOSUhr0C3gBWAJs1P+8Apisvz0fmKO//SQwX397MrBcf7s9cBKwBNyAKMD0TucMCAiQtwoPDy+5/faGM3Li/IOV+vX2hjN/O2dZ3nrrLTlv3jwppZQxMTHS1NRUnjhxQkop5YQJE+Svv/4qpZTS19dX7tmzR0op5RtvvCGfffZZKaWUffr0kf/617+klFJ+8cUX0snJSV66dEnm5uZKZ2dnmZKSImNiYiQg9+/fL6WU8pFHHpHz5s2TKSkp0tPTU+p0OimllNeuXZNSSjlixAj5888/SymlXLhwoRw9erSUUsoZM2bI8ePHy6KiIhkWFiY9PDyklFLGx8fLVq1ayQ4dOsgXXnhBHj9+vOTxpaamSimlLCwslH369JEnT56UUkrZqlUr+e2330oppXzuueekr6+vvH79urxy5Yps0qSJlFLK3bt3S0tLSxkVFSULCwvlAw88IFeuXFly/6tXr0oppbSxsSnZ39bWVsbHx8uioiLZtWtXuW/fPpmTkyNdXFxkdHS0lFLKyZMny+HDh5fE+MADD8i9e/fK8+fPSx8fn5Ltw4cPl0uWLJFSSvndd9+VnGfbtm1y1qxZUqfTyaKiIjl8+HD5559/3vH169Onjzx27Fh5fw41Uun3k1JFdDopQ5dK+UELKf/TSMotr0iZnlCppwCCZRmfqwa9shBCuADDgf/T/yyA/sAq/S6/AGP0t0frf0b/+wH6/UcDy6SUeVLKGCAS6GzIuKuam5sb/v7+AAQEBBAbG0tGRgbp6en06dMHgBkzZrB3796S+4waNQoAX19fvL29cXJywtLSEnd3d+Lj4wFo0aIFPXr0AOChhx5i//792NnZYWVlxWOPPcaaNWuoV68eAIcOHWLq1KkATJ8+nf3795eca8yYMZiYmNC+fXuSk5MB7Uri/Pnz/Pe//8XExIQBAwawa9cuAFasWEGnTp3o2LEjYWFhhIeH3zbuLl260KBBAxwdHbG0tCQ9PR2Azp074+7ujqmpKVOmTLkpltvp3LkzLi4umJiY4O/vT2xsLOfOncPd3b1kLsCUKVNK9k9OTiYiIoKePXvi6emJubk5Z86cKXkeJkyYAFDyfABs376d7du307FjRzp16sS5c+eIiIgo8/VTlEqVdBJ+GQlr/wFN28OcgzDkv2DnXGUhGLrP4gvgX0AD/c+NgHQpZaH+5wSg+NE6A/EAUspCIUSGfn9n4HCpY5a+TwkhxGxgNkDLli3vGNRbI73v/pEYkKWlZcltU1PTCjVZFN/HxMTkpvubmJhQWKg9vbcOcRRCYGZmxtGjR9m1axerVq3im2++4Y8//qhwfFLfxFO8fejQoQwdOpSmTZuybt063N3d+eSTTzh27BgNGzZk5syZN43Jv9e4KxqfqalpyXHKsmLFCq5du1aSSK5fv87SpUt5//33y7yPlJJXX32Vf/zjHzdtj42NvafXT1EqRFcEB76A3f8FKzsY+jEEPQ4mplUeisGuLIQQI4ArUsoQQ52jNCnlAillo
JQy0NHxtuXYaxQ7OzsaNmzIvn37APj1119LrjIq6uLFixw6dAiAJUuW0LNnT7KyssjIyGDYsGF8/vnnnDx5EtBGFxW33S9evJhevXrd8djHjx8vGemj0+k4deoUrVq14vr169jY2GBnZ0dycjJbtmy5q5gBjh49SkxMDDqdjuXLl9OzZ8+7Pkbbtm2Jjo4u+S9/+fLlJb9bunQpW7duJTY2ltjYWEJCQkoee9euXVm9ejXATX0ZgwcP5scffyQrKwuAxMRErly5cscYGjRoQGZm5l3HriiA1oH92zjY9Q60Gw5PH4Mu/zBKogDDXln0AEYJIYYBVoAt8CVgL4Qw019duACJ+v0TgRZAghDCDLADUkttL1b6PrXaL7/8whNPPEF2djbu7u789NNPd3X/tm3b8r///Y9HH32U9u3bM2fOHDIyMhg9ejS5ublIKfnss88A+Prrr3nkkUeYN28ejo6O5Z7rypUrzJo1i7y8PEBrCnr66aexsrKiY8eOtGvX7qZmsLsRFBTE008/TWRkJP369WPs2LF3fQxra2u+/fZbhgwZgo2NDUFBQYB2JRAXF3fTkFk3Nzfs7Ow4cuQIX3zxBQ899BDvv/8+Q4YMwc7ODoBBgwZx9uxZunXrBmgd7L/99humpmW/cWfOnMkTTzyBtbU1hw4dwtra+q4fh1IHSQmnlsPml7TbIz6HgEcM13Fd8bgM28Gtb7boy18d3Cu5uYP7Sf3tp7i5g3uF/rY3N3dwR3OfHdx1QUxMjPT29jZ2GHdt9+7dN3VE34/MzEwppZQ6nU7OmTNHfvbZZ+Xe58aNGyWd/0uXLpWjRo2qlFhqm7r2fqoyN1KlXP6wlG/ZSrlwsJRpMVV6eu7QwW2MeRYvA8uEEO8BJ4CF+u0LgV+FEJFAGlrCQEoZJoRYAYQDhcBTUsqiqg9bqWl++OEHfvnlF/Lz8+nYsePf+htuJyQkhKeffhopJfb29vz4449VEKmiAFF/wNo5kJ0KA96CHs8arcnpdoQs1WFZWwQGBspbl1U9e/YsXl5eRopIUWoX9X6qRDod7PkA9s6Dxm1h3A/g1MEooQghQqSUgbf7nZrBrSiKYix5Wdpw2HMboeNDMHQeWNQzdlS3pZKFoiiKMdxIgV/HQHIYDP4vdJ1j/E7sO1DJQlEUpaplp8GiMZAaAVNXQJuBxo6oXCpZKIqiVKWcdPh1LKRcgKnLDFb4r7KpEuVVyNTUtKRMub+/Px9++OFdHyM4OJi5c+cCWlXTp59++p7jiYiIYMSIEXh4eBAQEEC/fv1uKilSmfr27cutgw5uZ926dQghOHfunEHiCA0NZfPmzX875zvvvFOp5xk2bFhJ+ZLSxQgrm5SSuXPn0rp1a/z8/Dh+/Pht9wsJCcHX15fWrVszd+7ckpn4K1euxNvbGxMTk5ten9OnTzNz5kyDxV1n5WXC4vFa09OkX2tMogCVLKqUtbU1oaGhJV+vvPLKXR8jMDCQr7766r5jyc3NZfjw4cyePZuoqChCQkL4+uuviY6O/tu+5ZXPqExLly6lZ8+eLF261CDHv12y+Pjjj3nyyScr9TybN2/G3t6+Uo95O1u2bCEiIoKIiAgWLFjAnDlzbrvfnDlz+OGHH0r23bp1KwA+Pj6sWbOG3r1737S/r68vCQkJJdWHlUqQfwMWT4DE4zDhJ/CsWQW0VbKoBlxdXfnXv/6Fr68vnTt3JjIyEtD+6/Px8aFDhw4lb+Y9e/bcdv2Luy0xvnjxYrp161ZS2A+0D47i/ybffvttpk+fTo8ePZg+fTpXr15l3LhxBAUFERQUxIEDBwC4ceMGjz76KJ07d6Zjx46sX78egJycHCZPnoyXlxdjx44tqZf0448/8txzz5Wc84cffuD5558HICsri/3797Nw4cKbSm3odDqefPJJ2rVrx8CBAxk2bFjJ4wgJCaFPnz4EBAQwePBgkpKSAO1K5uWXX6Zz5854enqyb98+8vPzefPNN1m+fDn+/v4sX
76cCxcuYGlpSePGjQHKfJzFz0e3bt1o06YNP/zwA6CVau/duzf+/v74+PiUlGcpvVBTMSklL730UsmCUcUlSO5UZr0869ev5+GHH0YIQdeuXUlPTy95DoolJSVx/fp1unbtihCChx9+mHXr1gHg5eVF27Ztb3vskSNH3vQ6KPehIAeWTob4IzDu/8BrpLEjumt1s89iyytw+XTlHrOZLwy9c7NSTk5OSXVSgFdffZVJkyYBWi2o06dPs2jRIp577jk2btzIO++8w7Zt23B2di5p0ijLM888w4wZM5gxYwY//vgjc+fOLflASEpKYv/+/Zw7d45Ro0Yxfvx4wsLC6NSp0x2PGR4ezv79+7G2tmbq1Kk8//zz9OzZk4sXLzJ48GDOnj3L+++/T//+/fnxxx9JT0+nc+fOPPDAA3z//ffUq1ePs2fPcurUqZJzTZw4kffff5958+Zhbm7OTz/9xPfffw9oH3xDhgzB09OTRo0aERISQkBAAGvWrCE2Npbw8HCuXLmCl5cXjz76KAUFBTzzzDOsX78eR0dHli9fzmuvvVYyka6wsJCjR4+yefNm/vOf/7Bz507eeecdgoOD+eabbwD46aefbnoenn322ds+ToBTp05x+PBhbty4QceOHRk+fDhLly5l8ODBvPbaaxQVFZWsGXI7a9asITQ0lJMnT5KSkkJQUFDJPwEnTpwgLCyM5s2b06NHDw4cOEDPnj15/vnn2b1799+ONXnyZF555RUSExNp0eKvajguLi4kJibi5ORUsi0xMREXF5e/7VOewMBAPvzwQ/71r3+Vu69yB9lpsHSKlijGfg8+Dxo7ontSN5OFkRQ3Q91OcQntKVOmlPyn3aNHD2bOnMnEiRN58ME7/4EdOnSINWu05RSnT59+0xv8diXGbzV27FgiIiLw9PQsOc6oUaNK6hnt3LnzplLj169fJysri+3bt7NhwwY++eQTQGveunjxInv37i3pW/Hz88PPzw/Q2u/79+/Pxo0b8fLyoqCgAF9fX0Brgnr22WcB7cNw6dKlBAQEsH//fiZMmICJiQnNmjWjX79+AJw/f54zZ84wcKA2kqSoqOimD8ni5+xOZcOTkpIoXXiyrMcJMHr0aKytrbG2tqZfv34cPXqUoKCgksQ1ZsyYm/4ZuNX+/fuZMmUKpqamNG3alD59+nDs2DFsbW1LyqwDJWXWe/bsyeeff17m8QytSZMmalnY+5USqV1RpMfB+B9rbKKAuposyrkCMIbSZbiLb8+fP58jR46wadMmAgICCAm5twK+tysx7u3tfVNn9tq1awkODi5ZOQ+4aZlUnU7H4cOHsbKyuunYUkpWr15dZlPG7Tz++ON88MEHtGvXjkceeQSAtLQ0/vjjD06fPo0QgqKiIoQQzJs3r8zjSCnx9vYuqaxb1uO+U9lya2trMjIyyn2ccPvS6b1792bv3r1s2rSJmTNn8sILL/Dwww/f+Qm4Q6y3xlvelYWzs3PJ+iUACQkJODvfXMHf2dmZhISEO+5zO7m5uar44b3Ky4KQn7RZ2SZmMH0duN59Uc3qRPVZVBPF7dfLly8vqWwaFRVFly5deOedd3B0dLzpQ+FWd1tifOrUqRw4cIANGzaUbLtTE8qgQYP4+uuvS34uvkIaPHgwX3/9dUkSOnHiBAC9e/dmyZIlAJw5c4ZTp06V3LdLly7Ex8ezZMmSkiuqVatWMX36dOLi4oiNjSU+Ph43Nzf27dtHjx49WL16NTqdjuTkZPbs2QNoVXWvXr1akiwKCgoICwu74+O+tWy4l5dXSR/RnR4naM1kubm5pKamsmfPHoKCgoiLi6Np06bMmjWLxx9/vMzRSAC9evVi+fLlFBUVcfXqVfbu3Uvnzndex+vzzz+/aVDErYMjRo0axaJFi5BScvjwYezs7G66ugJwcnLC1taWw4cPI6Vk0aJFjB49+o7nBbhw4QI+Pj7l7qeUUlQIRxbAFz6w/XVw8odZu2t8ogCVLKpUcZ9F8
Vfp0VDXrl3Dz8+PL7/8sqTp4aWXXsLX1xcfHx+6d+9Ohw5l14v5+uuv+emnn/Dz8+PXX3/lyy+/vGMs1tbWbNy4kfnz5+Pu7k63bt147733StbwvtVXX31FcHAwfn5+tG/fvmTN7DfeeIOCggL8/Pzw9vbmjTfeALTRN1lZWXh5efHmm28SEBBw0/EmTpxIjx49StbtXrp06d9KkY8bN46lS5cybtw4XFxcaN++PQ899BCdOnXCzs4OCwsLVq1axcsvv0yHDh3w9/fn4MGDd3zc/fr1Izw8vKSDu3fv3pw4caIk2ZX1OEFrTuvXrx9du3bljTfeoHnz5uzZs4cOHTrQsWNHli9fXtKMdjtjx47Fz8+PDh060L9/fz7++GOaNWt2x3jLM2zYMNzd3WndujWzZs3i22+/Lfld6Saxb7/9lscff5zWrVvj4eHB0KFDAe2K0sXFhUOHDjF8+HAGD/5rhM7u3bsZPnz4fcVXp6THw4+DYMtL0MwPHtsJMzZAw1bGjqxylFWOtiZ/1bQS5aXXlq4rhg8fLnfu3Fnh/YvLjaekpEh3d3eZlJRUabHMnTtX7tix4477lF43vS7Izc2VXbp0kQUFBbf9fXV+PxnFuc1Sfugq5QcuUp5epa2XXQNhrDW4FeVW6enpeHp6Ym1tzYABAyp8vxEjRuDv70+vXr1444037vs/8tL+/e9/37EJri66ePEiH374IWZmdbNbs8J0RfDH+1ontp2z1uTkM65a13i6V6pEuaIod029n9DKdqx+DCJ3gv80GP4pmNfsAQGqRLmiKEplunwGVs6Ea7Ew/DMIfLRWXk2UppKFoijK3QhfD+ueBIv6MH0tuN155GFtoZKFoihKRRTkwK534fD/wDkQJi7S+inqCJUsFEVRypN0ElbPgpTzEDQLBr8PZpbl368WUaOhqlBsbOzfJjm9/fbbJaUyPvnkE9q1a4e/vz9BQUEsWrQI0IritW3bFn9/f7y8vFiwYMFNxwgNDUUIUVJJ1BBxF0+wU5Q6RVcE+z6FHwZA3nWt2Wn4J3UuUYBKFtXG/Pnz2bFjB0ePHiU0NJRdu3bdVHl08eLFhIaGcuDAAV5++WXy8/NLfmfost4qWSh1UtZVWDIJdr0D7YbBnIM1av2JyqaSRTXxwQcf8N1332FrawuAra0tM2bM+Nt+WVlZ2NjYYGpqCmiTKleuXMnPP//Mjh07yM3NLdn33XffpW3btvTs2ZMpU6aUXMFERUUxZMgQAgIC6NWrV8lCQ2WVM3/llVfYt28f/v7+Ri1spyhVJnoPfNcdYvZqo50mLoJ6DsaOyqjqZJ/FR0c/4lxa5a7E1s6hHS93fvme7pudnU1mZibu7u5l7jNt2jQsLS2JiIjgiy++KEkWBw8exM3NDQ8PD/r27cumTZsYN24cx44dY/Xq1Zw8eZKCggI6depUUnJj9uzZzJ8/nzZt2nDkyBGefPJJ/vjjD+D25cw//PBDPvnkEzZu3HhPj09RagydDvZ/Crs/gEZt4OF10NTb2FFVC3UyWRjLrVVLi1VkYuTixYsJDAzk6tWrdO/enSFDhtCqVSuWLl3K5MmTAa0S6aJFixg3bhwHDhxg9OjRWFlZYWVlxciR2mIrWVlZHDx4kAkTJpQcOy8vr+R2RcqZK0qtdCMV1s7WJtn5ToARX4Cl4ZbErWnqZLK41yuA+9WoUSOuXbt207a0tDQCAgKoX78+0dHRd7y6AHB0dKRTp04cOXIEFxcXVq9ezfr163n//feRUpKamnpTVdVb6XQ67O3ty1xX43blzBWlVpMSzqyGba9BTlqdmWR3t1SfRRWqX78+Tk5OJU0+aWlpbN26lZ49e/Lqq6/y1FNPcf36dUC7AigeDVVadnY2J06cwMPDg127duHn50d8fDyxsbHExcUxbtw41q5dS48ePfj999/Jzc0lKyurpAnJ1tYWNzc3Vq5cCWgJ4eTJk3eM+9ay3opSa2SnwfKHtLIdDZrC4zsh6DGVK
G5DJYsqtmjRIt599138/f3p378/b731Fh4eHsyZM4d+/foRFBSEj48PvXr1wsTkr5dn2rRp+Pv7ExAQwMyZMwkICLhjWe+goCBGjRqFn58fQ4cOxdfXFzs7O0Br0lq4cCEdOnTA29u7ZN3ssvj5+WFqakqHDh1UB7dSe8Tsg2+7woWtMPBdrQigU9nLANR1qpBgLZaVlUX9+vXJzs6md+/eLFiwoNx1txWlImr0+0lKOPQ/2PEmOLjD+IUqSeipQoJ11OzZswkPDyc3N5cZM2aoRKEoBbmw4Wk4vRK8RsKY78CygbGjqhFUsqjF1EQ6RSkl8zIsnw4JR6H/G9Drn6pv4i7UqWQhpSxz+KqiKBVTI5uuz22G3+dCfjZM+AW8xxg7ohqnznRwW1lZkZqaWjP/0BWlmigenm1lZWXsUComIwFWPAzLpkD9ZjBrl0oU96jOXFm4uLiQkJDA1atXjR2KotRoVlZWuLi4GDuMOysqgANfwp8fa01N/V6HHs+CmYWxIzOoIp0kO7+QBlbmlX7sOpMszM3NcXNzM3YYiqIY2qVQbXGiK2HQfjQMeg/sWxo7KoPKyS9iVUg8C/fH0NW9ER+O86v0c9SZZKEoSi2Xnw1/fggHv4H6TWDSYvAaYeyoDEank+w8m8zSoxc5EJVKfqEOPxc7+rdrYpDzqWShKErNd34rbHkJ0i9Cx+kw6F2wbmjsqAxCSsmus1eYt+0855MzaW5nxdTOLRnh50Sgq+Eq46pkoShKzaTTwdkNcOR7uHgQHNvBjI21ek3s0wkZvL85nMPRabg1tuHzSR0Y6dccM1PDj1VSyUJRlJqlMF+rDLvrP3D1HNi3gsH/haDHa20HdmJ6Dp9sO8/aE4k42FjwzmhvpnRuiXkVJIliKlkoilIz5GXBnx/B8V8gNwMausL4H6H9GDAxNXZ0BpGZW8B3e6JYuD8GCczp68Gcvh7YGmC0U3kMliyEEFbAXsBSf55VUsq3hBBuwDKgERACTJdS5gshLIFFQACQCkySUsbqj/Uq8BhQBMyVUm4zVNyKolRDUX/AhmchIx68x4LfRPAYUGuvJAqKdCw7epEvdkaQeiOfsR2deXFwW5ztrY0WkyGvLPKA/lLKLCGEObBfCLEFeAH4XEq5TAgxHy0JfKf/fk1K2VoIMRn4CJgkhGgPTAa8gebATiGEp5SyyICxK4pSHeSkw/bX4MRv2sp1j26Dll2MHZXBXM7IZdPpJH47HEdMyg26ujvw87D2+LrYGTs0wyULqU2VztL/aK7/kkB/YKp++y/A22jJYrT+NsAq4Buh1eYYDSyTUuYBMUKISKAzcMhQsSuKUg2c2wQbX4AbV6Hn89DnFTCvITPH71JIXBo/HYhly5nLFOkkfi52/PBwIA94Nak2JYoM2mchhDBFa2pqDfwPiALSpZSF+l0SAGf9bWcgHkBKWSiEyEBrqnIGDpc6bOn7KIpS2+RmaEnizCpo6gNTl0HzjsaOyiB2n7vC/3ZHEhx3DVsrMx7r6cbEwBa0blL9lnM1aLLQNxX5CyHsgbVAO0OdSwgxG5gN0LJl7Z6tqSi1VuJxWPWoNl+i77+1K4pa2C9x7UY+72wMZ+2JRFo1qsfrw72Y2qUl9Syq75ijKolMSpkuhNgNdAPshRBm+qsLFyBRv1si0AJIEEKYAXZoHd3F24uVvk/pcywAFoC2+JGhHouiKAZQmA/7P4e986B+U5i5CVp1M3ZUBrH1TBKvrztDenYBcwe04al+HliaVf/RXAYbpCuEcNRfUSCEsAYGAmeB3cB4/W4zgOI1PTfof0b/+z/0/R4bgMlCCEv9SKo2wFFDxa0oShW7fBoW9IE9H2gVYZ/YVysTRWZuAf9ccZInfjuOk501G57uyQsDPWtEogDDXlk4Ab/o+y1MgBVSyo1CiHBgmRDiPeAEsFC//0LgV30HdhraCCiklGFCiBVAOFAIPKVGQilKLZB7XVva9PgisHGEKcuh7RBjR2UQx2LTe
H55KJfSc5jbvzXPDGhTpRPqKkOdWYNbUZRqJPYArHtCW2+i82zo/S+waWTsqCrd9dwCPtt+gUWHYnFpWI/PJ3UgoJXh6jfdL7UGt6Io1UNhHuz+QFtrwsFNmzfRorOxo6p06dn5LDsWz4/7Y0jJymNql5a8MtSL+pY19yO35kauKErNkngc1s3R6jl1elir52RZ/YaI3ispJcFx11h0KI5tYZfJL9TR2c2BHx4OpEMLe2OHd99UslAUxbDys2Hfp9pop/pNYdoqaDPQ2FFVqvBL13l3YziHolOxtTJjYqAL07q0wsvJ1tihVRqVLBRFMZxrcfDbOEiNAL/JMPTDWrXOREpWHp9uP8+yY/HYW5vz5oj2TO7colrPl7hXte8RKYpSPVwKhWVTIT8LHl4P7n2NHVGlyS/U8fPBGL7eFUlOQRGPdHfj2QFtsKtX9dVgq4pKFoqiVL4L22HlTLC21xYkcqr8NaGNQUrJljOX+WjrOeJSs+nfrgmvDffCw7H29L2URSULRVEq15nVsPpxra7TtFXQoKmxI6oUZxIz+Pfa05xKyMCzaX1+fiSIvm0Ns951daSShaIolefMalg9C1p2g6krasVoJyklvxyM5YPN52hoY84nEzowxr9qljKtTlSyUBTl/kkJh7/T1p5o0RWmLq8ViSIju4AXV51kR3gyA9o14ZMJHWhoU/sKG1aEShaKotyfa3Gw9VU4vwnajYAHfwCLesaO6r7Fpd7g0Z+PcTEtmzdGtOfRHq7VZm0JY1DJQlGUe6MrgmP/BzvfBgQMfAe6PQMmNb95Zve5Kzy3PBQh4LfHutDFvfaVIrlbKlkoinL3kk7B5hch/gi0HggjvwA7F2NHdd+klPzfvhj+u+UsXk62fDctgJaNav5VUmVQyUJRlIorzIc/P9JmY1s3hDHzocNkqAXNM0U6yX9+D2PRoTiG+jTj04kdauXkunulnglFUSomOQx+fw4SjoL/NBj8fq2ZjX05I5eXVp1kX0QKs3u788qQdpiY1PwEWJlUslAU5c4yEmHnW3BmDVjYwISfwXussaOqFFJKVgYn8PbvYRTqJP990JcpndWyzLejkoWiKLenK4LgH2HHWyB10HUO9Pon1Ku+6zHcjay8Qt5Yd4a1JxLp5t6Ij8b5qf6JO1DJQlGUv7t8Bn5/FhKDtZpOI7+Ehq7GjqrSnL+cyZOLQ4hJucHzD3jydP/WmKpmpztSyUJRlL+kxWhXE4e/BSs7GLsA/CbWig7sYiuD43lj/RnqW5rz2+Nd6O7R2Ngh1QgqWSiKAlfPwx/vwtnftZ/9H4JB79aaJieAvMIiPt56noX7Y+jm3ogvJ/vTxNbK2GHVGCpZKEpdlnkZ9n0GwQvB3Ebrk+g0Axq2MnZklSoiOZO5y0I5m3Sdh7u14o0R7TGvY7Wd7pdKFopSFxXkwN55cOhb0BWA/1QY8BbY1K4mGSklvx25yHsbw7GxNGPhjEAGeNWOKrhVTSULRalrYvfDhrmQFgW+E6Dfv8HB3dhRVbqwSxn8e+0ZTsan09vTkU8m+NGkgWp2ulcqWShKXZF1Fba/DqeWgX2rWrd6XbHikuLvbTqLfT1zPnzQl4mBLdQku/ukkoWi1Ha6Igj5GXb9B/KztX6JXi/Wisqwt8rJL+Lfa0+z9kQiD3hpJcXt69XNkuKVTSULRanNLoXCphcgMQRce8HwT8GxrbGjMogziRm8sCKUiCtZ/HOgJ0/1a62uJiqRShaKUhsVFWilww9/C/UaaWtM+E6oVfMliqVk5fHVrgiWHLmIg40FvzzSmd6ejsYOq9ZRyUJRapvcDFg2DWL3QeCj2igna3tjR1XpLqXn8NvhOH49FEd2QRGTg1rw0uC2qtnJQCqULIQQllLKvPK2KYpiZMlhsPIRbaTTmPngP8XYEVWqnPwiNpxMZOfZK/xx7go6KRni3YwXB7fFw7HmL+NanVX0yuIQ0KkC2xRFMYaiQjj4Fez+QLuKmL4W3HobO6pKk
5iew+qQBH4+GEvajXya2VrxeE83HuraihYOta+jvjq6Y7IQQjQDnAFrIURHoLjB0xZQr5CiVAcpkbDuCUg4Bu1Hw/DPwabmLwOq00kORKWwIjiBjacuISX0b9eEf/R2p7ObQ51eD9sYyruyGAzMBFyAz0ptvw7820AxKYpSEQW5cOhr2PspmFnCuIXgM67Gd2Jfzy1g65nLfLs7ktjUbGwsTJndy53JnVvi1tjG2OHVWXdMFlLKX4BfhBDjpJSrqygmRVHuREo4s1qbN5F+EbxGwtB5YOtk7MjumZSSkwkZLDt6kTUnEskv1NGuWQO+nOzPEJ9mWJqZGjvEOq+ifRYHhBALgeZSyqFCiPZANynlQgPGpijKrdIvwvqnIeZPcOoAI78Cj37GjuqeFRTpWHYsnl8PxXIhOQsrcxPGdXJhfIALnVraq6amaqSiyeIn/ddr+p8vAMsBlSwUpSqkx8O+T+HEb2BqAcM+0YbFmtTM/7iTr+ey+XQSvx6KIzrlBh1c7HhvjA+j/Jtja2Vu7PCU26hosmgspVwhhHgVQEpZKIQoMmBciqKANvN632dwYSsgoNPD0PN5sG9h7MjuWpFOsvfCVRYfucgf55LRSfBubsvCGYH0b9dEXUVUcxVNFjeEEI0ACSCE6ApkGCwqRanrLp2APR/BhS1g3RC6PgmdZ9fIJHElM5dlR+NZfiyexPQcGte35Ik+HjzYyYXWTdTciJqiosniBWAD4CGEOAA4AuMNFpWi1FXXk7QV60IXg7UD9H1VSxRWtsaO7K4lZeQwf08US4/Gk1+ko1ebxrw23IsHvJpiYaYWHqppyk0WQghToI/+qy3aXIvzUsoCA8emKHVHQQ4c+gb2fa4tRtTjWa0ybA1MEpfSc/huTxTLj8Wjk5LxAS7M7u2Ou5phXaOVmyyklEVCiClSys+BsCqISVHqjqJCOLUcdr8P1xO1YbAD36mRixElX8/lq10RrAiOB2B8QAue7OuhZljXEnczdPYbtBFQN4o3SimPGyQqRantpISwNbDnQ0i5oA2DHfs9uPUydmR3LSO7gB/2RfPjgRgKinRMDGzBnL4euDRUSaI2qWiy8Nd/f6fUNgn0L+sOQogWwCKgqX7fBVLKL4UQDmhJxxWIBSZKKa8JbSjEl8AwIBuYWZyMhBAzgNf1h35PP1lQUWqmzGT4/Vmt87pJe5j0G7QbUeNmXucX6vj5YAxf74okM6+Q4X5OvDSoLa5qlnWtVKFkIaW8l1k/hcA/pZTHhRANgBAhxA608iG7pJQfCiFeAV4BXgaGAm30X12A74Au+uTyFhCIlnRChBAbpJTX7iEmRTGus7/Dhme0PopB70PXOTVurkR+oY7FR+L4/s9oLl/PpX+7Jrw0uC1eTjWvf0WpuIqWKH/hNpszgBApZejt7iOlTAKS9LczhRBn0YoSjgb66nf7BdiDlixGA4uklBI4LISwF0I46ffdIaVM08eyAxgCLK1I7IpSLeRnw/bXIPhHrcnpwf8DR09jR3XXLiRn8tyyUMKTrtPV3YH/jvOlr6ejmiNRB1S0GSpQ//W7/ucRwCngCSHESinlx3e6sxDCFegIHAGa6hMJwGW0ZirQEkl8qbsl6LeVtf3Wc8wGZgO0bNmygg9LUapAahQsnw5XwqD7M9D/TTCreQv0rAiO57W1p7GxNOP76QEMat9UJYlqJqcwh5zCHBysHCr92BVNFi5AJyllFoAQ4i1gE9AbCAHKTBZCiPrAauA5KeX10n9cUkophJD3GPtNpJQLgAUAgYGBlXJMRblvkTthxQwwNYeHVkPrB4wd0V0r0kk+3nqO7/dG06tNY76Y5E+j+pbGDkspJb8onzURa1hwagFBzYL4qPdHlX6OiiaLJkDpVfEK0K4QcoQQZa6WJ4QwR0sUi6WUa/Sbk4UQTlLKJH0z0xX99kSg9PRUF/22RP5qtirevqeCcSuK8ZxZA2tmg2M7mLoM7FyMHdFdu5FXyHPLQ9kRnsz0r
q14a2R7zEzVhLrqIiMvg9URq1lydgnJ2cl0atKJCZ4TDHKuiiaLxcARIcR6/c8jgSVCCBsg/HZ30I9uWgiclVKWXgtjAzAD+FD/fX2p7U8LIZahdXBn6BPKNuADIURD/X6DgFcrGLeiVD0p4cCXsPMtaNkNpi4HKztjR3XXjkSn8sqa08Sl3uA/o7yZ0d3V2CHVeVJK4jPjCUkO4eClg/xx8Q/ydfl0btaZd7q/Q7fm3QzWNFjR0VDvCiG2AD30m56QUgbrb08r4249gOnAaSFEqH7bv9GSxAohxGNAHDBR/7vNaMNmI9GGzj6iP3eaEOJd4Jh+v3eKO7sVpdrJz4aNz2kT7bwfhDHfgbmVsaO6K9du5PPVHxH8fDAWZ3trfnu8C909Ghs7rDpLSklwcjBLzy3lcNJhMvMzAXCwcmBsm7FM8JxAW4e2Bo+jolcWAFbAdSnlT0IIRyGEm5QypqydpZT7+WsZ1lsNuM3+EniqjGP9CPx4F7EqStW7FgvLH4LkMK2mU+9/gUnNabLJLSjipwOxfLsnkht5hUwOaskbI7yoZ3E3HxNKZbmSfYXtsdtZE7mGiGsR2FnaMajVILwcvAhqFoSrnSsmour+vio6dLZ4nkNbtHUtzIHf+OtKQ1HqtuQw+G08FGTDlOXgOcjYEVWYTidZeyKRT7ef51JGLgPaNeHloe3wbNrA2KHVOXlFeey+uJt1Ues4dOkQOqmjfaP2vN3tbYa7D8fKzHhXqRX9l2Es2tDX4wBSykv6iXaKopxaqc3ItmwAMzdBMx9jR1Rh+yKu8sHmc5xNuo6fix2fTvSnm0cjY4dVp1zJvsL+xP3sS9jH4aTDZBVk0dymOY/5PMYIjxG421WPOmEVTRb5pYe56ju2FaVuK8iFba9qE+1adoPxP4Jtc2NHVa4inWTz6SR+2BfNqYQMWjhY89WUjozwdcLERM2bMKSCogIuXLvAmZQzRKZHEpwcTGR6JABNrJswsNVAhrgOoWvzrlXaxFQRFSlRLoCNQojvAXshxCzgUeAHQwenKNVW/DFY/xSknIfuc2HAm9pcimpMp5NsOp3EV7siiLiSResm9XlzRHumdW2JpVnNKjlS3RXqCrl4/SJn086SkJlAYlYisddjCUsJI1+XD4C1mTX+jv6M8hhF9+bd8WzoWa0nOVakRLkUQkxAWwDpOlq/xZtSyh2GDk5Rqh0p4cj32hVFg+bw0Bpo/bfxGtWKlJItZy7z+Y4LRFzJok2T+nwztSPDfNSVxP0q0BUQeS2S6Ixo0nLTiM2I5cK1C5xLO0duUW7Jfo2tG+NS34XJ7Sbj5+iHX2M/mtk0q9bJ4VYVbYY6DqRLKV8yZDCKUq2lRGp9E3H7wXMIPPhDtV6cSErJH+eu8MXOCE4nZtC6SX2+ntKRYb5OmKokcdeklFy6cYnTKac5ffU0YalhhKWE3ZQUGpg3oE3DNozzHIeXgxdejbxwqe9CPfOaX669osmiCzBNCBHHzetZ+BkkKkWpTooKtFXsdv8XzCxh5JfQaUa1Lil+I6+Q19edYe2JRFo61OPj8X6M6+SiksRd0kkdJ66cYGfcTjZFb+Janlbs2sLEgnYO7RjvOR4/Rz88G3rSyKoRdpZ2Nepq4W5UNFkMNmgUilJdxe6HTS/C1bPaKnbDPoUGTcu/nxFFXslkzm/HibqaxQsDPZnT1wNzVaLjrmTmZ7IhagO/hv9KYlYiFiYW9HLpRTenbvg4+uBp74l5Ne+jqmwVncEdZ+hAFKVaSY+HXf+B0yvBviVMXgLthhs7qjvS6STLg+N5f9NZLM1M+PWxLvRorWZe343LNy6z8PRC1kWuI7coFz9HP57p+Ay9XXrTwKJuzxZQUzMVpTQp4cRvsOVl0BVos7B7vQDm1saO7I7OX87kjfVnOBqTRld3Bz6b6E9z++odc3VyLfca35/6nlUXVlEkixjhPoLJbSfj3djb2KFVGypZKEqxnHRtOOy5jeDaC8Z8q11VVGOpW
Xl8sv08y47FY2tlzkfjfJkQ0EKNcqqg/KJ8fjv7Gz+c+oHswmxGeYziiQ5P4Fz/b0vm1HkqWSgKQEIwrHoUrifqlzt9strXdfrjXDL/WnWajJx8ZnZ3ZW7/NjS0qXmLKhlDQVEBG6M3suDUAhKyEujt0pvnOz1P64atjR1ataWShVK3pUTCwS+1pidbZ3hkC7TobOyo7ii/UMfHW8/xf/tj8HKyZdGjnWnfvPoO4a1OcgpzWBOxhp/O/ERydjJeDl7Mf2A+PZxVmbvyqGSh1E1FhbD3Y9j3GQgTCHxUm4VdzdediE/L5pmlJwiNT2dGt1a8OswLK3M1+7o8ablpLDm7hOXnl5Oel06nJp14u/vb9Gjeo9YOda1sKlkodU9iiDYc9tJx8JsEA9+t9sNhAbaeucxLq06ChG+ndWKYr5OxQ6r2wlLDWHJ2CVtjtpKvy6dvi77MaD+DwGaBxg6txlHJQqk7stNg1zsQ8jPUb6IV/vMZZ+yoypWenc9HW8+z9OhFOrjY8c3UTrRwqPkzgg0lpzCHjdEbWXVhFeGp4VibWTO2zVimtpuKu331qOBaE6lkodR+Oh2E/gY73oLcDK3zuu8r1bpUB2jlJbaeuczr686QnlPArF5uvDi4rSr6dxvpuensuriLI5eP8Gf8n2QXZtOmYRteDnqZ0a1H1/k5EpVBJQuldrt0QmtySgyGlt1h+CfQtPqPnb92I5+3NoSx4eQlvJvb8tvjXfByqt7JrSoV12naE7+H3Rd3E5IcQqEspLF1Yx5o9QBjW48loGmA6o+oRCpZKLVTfjZsfRmOLwKbJjB2AfhNrNb1nIqFXcrgsZ+DScnK44WBnjzZ1wMzVa4D0GZYLz23lK0xW7l04xIAre1bM917OkNdh9LOoZ1KEAaikoVS+ySHw8oZkBKhrTXR+8VqP8oJICe/iK//iOCHfdE0srFk3VM98HGu/nFXhbOpZ/kl/Be2xWxDh45ezr2Y6TOTzs0642HvYezw6gSVLJTaQ0pt1brtr2tLnE5fCx79jB1VhVxIzuSJX0OITrnBg52ceW2YF43qWxo7LKOLyYjh/SPvcyTpCPXM6jHVayoPeT2EU301EqyqqWSh1A4pEbDpnxDzJ7j3gzHfgW31/0CRUrL6eCJvrj9DPQszljzehe51vPhfga6AQ5cOsTFqIzvidlDPvB4vBLzAOM9x2FqofhtjUclCqdl0Oji6QKsQa2IOwz+FgEerfakO0Oo6Pbc8lH0RKQS2asj/pnWiqa2VscMyiss3LnPs8jGOXj7KgcQDXM25ir2lPZPaTeJx38dpbF23E2h1oJKFUnNdT4J1T0D0Hmg9EEZ9XSOuJgB2nU3m5dWnuZ5bwLujvZnWpVWdKv53+cZljicf5+jloxy7fIyLmRcBsLWwJaBpAGNaj6GXc686t2ZEdaaShVLz5GXCga/g4Ncgi2rEynXFLmfk8vHWc6w5kUi7Zg3qVF2nlJwU1kasZefFnYSnhgPaMqQBTQOY1HYSnZ0649nQExNR/a8K6yKVLJSaQ0o4tVybXJd1GbzHavWcHKr/rNzs/EK+/zOa7/dGodPBU/08mDugTa2fYJdflM+OuB38HvU7Ry4foVBXSAfHDjzX6Tm6Ne9G24ZtMTWp3c9BbaGShVIz5GbA789C2Fpw6QyTfq321WEBinSStScSmbftHMnX8xju68TLQ9rRslHtLteRkJnAygsrWRe5jrTcNJxsnHjI6yHGthmLu131T+7K36lkoVR/V8/D0imQHgf934Cez0MN+G/0YFQKH245x6mEDDq42PG/qZ0IdHUwdlgGcy33GvsT97MxeiOHLh1CCEFfl75MajuJrs27qualGk4lC6V6u7Ad1swCUwuY8Tu06m7siMoVEneNL3ZeYF9ECk52Vnw+qQOjOzjXug7s7IJsgpODOZx0mKNJR7lw7QISSXOb5sz2m814z/E0s2lm7DCVSqKShVI9FeZrw2EPfQNNfWDyYmjoauyo7
igpI4f3Np1l06kkGtYz5/XhXjzUtVWtWW9CJ3WcuHKCI0lHOJJ0hFNXT1EoC7EwsaBjk4486f8k3Zp3w6+xnyq5UQupZKFUP9lpsHw6xO2HoMe1ZU7Nq+/8Ayklvx6O44PNZ9FJeP4BT2b1dqOeRc1/e2XkZbA3YS+nrp5id/xukrOTEQjaN2rPDO8ZdHHqQscmHbEyq76vj1I5av5fs1K7JIbAypmQmQwP/qAV/6vG4lJv8NraM+yPTKGPpyPvjfGpsWtNZORlcCblDHHX47iYeZGDlw4SmxGLRGJlakVXp668EPACPZx7YGepalbVNSpZKNVDcV2nra9A/abaWtguAcaOqky5BUXM/zOKb/dEYWFqUuMm1t0ouEFUehSnrp4iIj2CU1dPEZUehUQCYGFiQZBTEEPdhtKjeQ+8G3mrIa51nEoWivHpdLDzTW2SXesHtCuKetV31FBMyg2e+DWE88mZjPBz4vXh7WlmV32bYZKykjiWfIzjyceJTI8kPjOetNy0kt/bWdrh09iHQa6DCGwaiJudGw5WDmr0knITlSwU4yrMg9+fg5NLIGgWDP242tZ1klKyMiSB/2wIw8LMhJ8eCaJf2ybGDutvinRFHL9ynB1xO9iXsI+ErARAK6XR1qEtfVv0xaW+C252bvg09qFpvaaqQ1opl0oWivHkXINl0yDuAPT9N/T5V7Ut2XE26TpvrQ/jaGwanV0d+GKyP83trY0dVgkpJadTTrMxWqvUmpKTgoWJBd2duzPNaxpBzYJo07CNulpQ7plKFopxXDmrjXhKj4MH/w/8Jhg7or+RUhIan86K4ARWBMdja2XGfx/0ZVJgi2rTN5GQmcDqiNWsj1zP1ZyrWJhY0KdFHwa1GkQvl17YmNsYO0SlllDJQql6MXthyWSwqKctUOTa09gR3SThWjbrTiSy5ngi0Sk3sDQzYXJQC14a3Bb7ehbGDg8pJcHJwSwKX8TehL3opI6+LfoysNVA+rXoRwOLBsYOUamFVLJQqlbIL9oiRY1aa4mimpQUv5FXyMZTl1gdksjRWK3zt4ubA//o485QXydsrYxfKjsqPYrNMZvZFL2JxKxEGlo25FGfR5noOVGtHKcYnMGShRDiR2AEcEVK6aPf5gAsB1yBWGCilPKa0HrXvgSGAdnATCnlcf19ZgCv6w/7npTyF0PFrBjYwW9g+2vgMQDGLwTrhkYNp0gnCY5NY+2JRH4/eYkb+UW4O9rwz4GejOnoXC3mS2QXZLMtdhurI1Zz8upJTIQJQc2CmNNhDoNcB2FtVn36TZTazZBXFj8D3wCLSm17BdglpfxQCPGK/ueXgaFAG/1XF+A7oIs+ubwFBAISCBFCbJBSXjNg3IohHPgSdrwJ7UfDuIVgpEVtpJQciUljW9hlNp5K4mpmHtbmpozs4MSkoJZ0amlv9JFBUkpOpZxibcRatsRsIbswGzc7N/4Z8E9GeIxQq8YpRmGwZCGl3CuEcL1l82igr/72L8AetGQxGlgkpZTAYSGEvRDCSb/vDillGoAQYgcwBFhqqLgVAzjwlZYovB/U5lCYVl3rZ2GRjrNJmRyNTSM4No1jsddIycrDwsyEvp6OjPJvTt+2TahvafwW2eyCbLbEbGHxucVEXIvA2syawa6DGddmHB0cOxg9iSl1W1W/Q5pKKZP0ty8DTfW3nYH4Uvsl6LeVtf1vhBCzgdkALVu2rMSQlfty4CvY8Ya2UFEVJQopJSFx11h7IpENJy+RmVsIQAsHa3q3aUyP1o0Z5uuEtUX1mJEclhLG6ojVbIreRHZhNp4NPXmr21sMcR1CfYv6xg5PUQAjdnBLKaUQQlbi8RYACwACAwMr7bjKPSrMhz/ehYNfVdkVhZSSjaeS+GLnBaKu3sDK3IQh3s0Y4NWUQNeGONlVr/b9Y5eP8cXxLzh19RRWplYMdh3MeM/x6ipCqZaqOlkkCyGcpJRJ+mamK/rtiUCLUvu56Lcl8lezVfH2PVUQp3I/E
kNg/TNwJQwCH4Wh8wyeKGJTbvDG+jPsi0jBy8mWeeP9GOrrVC2al2514doFvj7xNXvi9+Bk48QrnV9hpMdIbC3qxlrcSs1U1e+kDcAM4EP99/Wltj8thFiG1sGdoU8o24APhBDFw2YGAa9WccxKReVnw54P4ND/tGKAU5ZB26EGPWV2fiEfbTnHb0cuYm1uyjv6gn6m1WTSXLGUnBS2xGxhY/RGwlPDsTG34dlOz/KQ10OqvLdSIxhy6OxStKuCxkKIBLRRTR8CK4QQjwFxQHH96c1ow2Yj0YbOPgIgpUwTQrwLHNPv905xZ7dSzUT9oc2fSIuGgEdg4H/AyrBlrE9cvMara05zPjmTh7q04pn+rWliW30+eIuXGd0Us4nDlw5TJIvwcvDipcCXGOkxkoZWxh06rCh3Q2gDkGqXwMBAGRwcbOww6oYbKbD9dTi5FBw8YMTn4N7HoKe8mpnHq2tOsfPsFRrZWPDZJH/6eDoa9JwVFZUexc64nQQnB3Mk6QgSiZONE8PdhzPCfQQe9h7GDlFRyiSECJFSBt7ud9WvQVepGQpy4fQKbUhsXib0fEErBGhu2E7k3eev8OKKk2TlFfLS4LbM6O5q9H6JzPxMVpxfwYaoDURnRAPgauvKLL9Z9HHpg09jH1XAT6nxVLJQ7o6uCA5/B/s+0arGNu8EY76FJl4GPW1BkY5Pt1/g+71RtG3agCWzutK2mXFrIEVnRLM4fDG/R/9OTmEOgU0DmdR2EoNcB6mJc0qto5KFUnHJYbDhGW20U+sHoNvT4N7X4GXFkzJymLv0BMdirzE5qAVvjmxvtPWtpZQcuHSAb058Q1hqGBYmFgx1G8oUryl4N/I2SkyKUhVUslDKV5gHez+B/Z+Blb1WrsNnnMGTxJXruawIjmf+n9EU6SRfTvZntP9t52QaXG5hLjvidrA2ci3HLh/Dub4zLwa+yHD34eoqQqkTVLJQ7uziEe1qIuU8+E2GwR+ATSODnU5KyamEDFaFaGtI5BXq6NvWkXdG+dCyUdUW9ivUFRKcHMz22O1sjdlKZkEmzW2a83LQy0xqOwlzI9W3UhRjUMlCub2CXNj1H61/ws4Fpq2GNg8Y5FS5BUUcjk5lz/mr/HHuChfTsrEwNWFMx+Y80ccDd8eqK3lx+cZljl0+xr7EfRy8dJCMvAysTK0Y0GoAY1uPJahZkOqsVuoklSyUv8tIhKWT4PJpbV3sB94Cy8rtTI5NucGh6FR2nU3mQGQqOQVFWJmb0MOjMXP6ejDMxwm7elXzn3vc9ThWnF/BvsR9xGTEAOBg5UBv5970b9mfHs49VClwpc5TyUK5WUYC/DQUctJh6grwHFxph5ZSsiM8mQV7owmO06rMuzS0ZkKgC/3aNaGbeyOszA1f3E8ndYSnhrMvYR97E/ZyJvUMZiZmdGnWhXFtxtG5WWc8G3pialI9Cg0qSnWgkoXyl8xkWDRaSxQPrwfnTpV26INRKXyxM4KjMWm0dKjHa8O86NPWkTZN6hu8aF5OYQ4R1yKIux7H4aTD7E/cT1puGgKBr6Mvz3Z6ljGtx6iOakW5A5UsFM3VC1rTU2YyPLS60hLFmcQMPtp6jn0RKTS1teQ/o7yZ1qUlZqaGa/eXUhKWGsaWmC0cTjpMVHoURbIIAFsLW3o496C3S296NO+hSm4oSgWpZKHA+S2wehaYWWpXFC2C7v+QlzP5fMcFtoZdxs7anNeHe/FQ11YGa2aSUnLiygl2xO1gd/xuErMSMTMxo3OzzvRt0Zf2jdrjUt+F1vatVfOSotwDlSzqstzrsOsdOPYDOHWASYvBvkX597uDmJQbfLHzAhtOXsLGwoxnB7ThsV5u2FoZprP6Wu41tsRsYX3UesJTw7EwsaCLUxf+4fcP+rXoh72VvUHOqyh1jUoWdZGUcHoVbH8Nsq5Alyfggbfvua5T8cp0vx6OY+OpJMxNBf/o7cE/ervT0MaicmNHm/+wP3E/W2O3si12G4W6Qto5t
OONrm8w3H04NuY2lX5ORanrVLKoa/KyYO0/4NxGaN4RpiwF54B7OlSRTrL1zGUW7I3iZEIGNhamPNLdldl93GnSoHJLhWfkZXAo6RB/xv/J/sT9pOel08CiAePbjGe853jaOrSt1PMpinIzlSzqktQoWPEwXAmHQe9B1yfhHtrvr2TmsjI4gaVHL5JwLQe3xja8N8aHBzs5V2rNJikloVdDWR+5no3RG8kryqOBeQP6tujLgFYD6OPSBzMT9SesKFVBvdPqguJKsX+8p3ViT1upFQK8m0PoJPsiU1h65CI7zyZTqJN0dXfgtWFeDPZuhkklrUyXX5RPSHII+xK1ORBx1+OwNrNmhPsIxrQeg09jH5UgFMUI1Luutks6BRuf0yrFeg6B4Z+BXcWL8UkpWReayJc7I4hNzcbBxoLHeroxKahFpZThyC/KJ/lGMkcvH2Vvwl4OJx0muzAbCxMLApoG8JjPYwx2HUw986qtC6Uoys1UsqitctJh278hdAnYNL7rSrGnEzLYGpbExlNJxKVm4+Nsy1dTOjLYuymWZhVvusopzCE6I5rIa5FEpkcSlR5FdmE2qTmp5Bflk5abRm5RLgDNbJoxwn0EvV16E9QsSCUIRalGVLKojWL2aZ3YmZeh+9PaKnb1HMq9W36hjm1hl/m//TGcjE/H1ETQ1d2BZwe0YYy/8x2bmqSUpOamEpUexcXMi0SnR7M/cT9x1+OQaEv3mpuY42rniq2FLW0atsHS1BJ7S3ta27fGz9GP1vatDT6bW1GUe6OSRW1SVAh7P4a988DBHR7fUaGRTjEpN1gRHM/K4ARSsvJwbVSPd0Z7M6pDc+zrlT30VUrJubRzbInZwtbYrSTdSCr5nYWJBUFOQQxzG0brhq3xsPegZYOWqr9BUWoo9c6tLVKjtHUn4g5Ah6kwbB5Y3r5PIe1GPiFx14hPy+ZAZAq7zl3BRED/dk2Y1rUVvds4YnqHq4iYjBi2xmxlc8xmYq/HYibM6Na8G9PbT8fD3gN3O3caWzdWiUFRahH1bq7pdEVw8CvY/QGYWcHY76HD5Jt2yS/UEZ2Sxc7wZNaHXiLiSlbJ75ztrXm6X2umd2tFU9vbz42QUnLh2gX2J+5nW+w2zqadRSAIaBrA9PbTGdhqoKqxpCi1nEoWNdn1S7BuDkTvAa9RMPRjsHVCSkl0yg12hCdzIDKFozFp5BXqAOjq7sDYTm3p7OpAS4d6ODawLLOf4Gr2VdZHrWdj1EaiMqIAaN+oPS8GvsgQ1yE0tWlaVY9UURQjU8miJioqhBOLtLpOhXkw8kvoNIPcQh0rDsXy84FYolNuAODZtD5TOrekY0t7/FvY06rRnUthpOWmcejSITbHbOZA4gGKZBEdm3Tkja5v0L9lf1XGW1HqKJUsahJdEZxZA39+CKmR0KoHjPySeBNnFv4ezqqQBLLyCunU0p7/jPJmYPumNLcvv95TQmYC22K3sSNuB2GpYQA4WjvyiM8jjGk9hla2rQz9yBRFqeZUsqgJigrh7AbY9xkkn4Ym7WHSb8Q59uN/e6JYc3wPQsAIv+ZMDmpBZzeHcoegZuZnsjdhL4vPLuZ0ymkA/Br78bT/03Rv3h3vxt5qrWlFUUqoZFGdFRXA6ZWw9xNIi9KGw45bSFTTQfxvTzTrQ/diZiJ4qGsr/tHHHSe7sq8iMvMzCb0SysmrJzl19RQhySHk6/Jp2aAlLwa+SP8W/Wlhe3/lyRVFqb1UsqiOdDo4swr2/BfSoqGZL0z8lYiGvfnmzxh+X7IPCzMTrcJrb3ea3DKKKSUnhZNXThKWGkZkeiSXsi4RkR6BTuoQCNzs3JjcbjL9W/bH39FfLQakKEq5VLKoTvIyIWwtHPoWrp6Fpr4weSmh9brxy6E41oUewNrclFm93ZnVy53G9S0pKCog9Eoox68c52zqWcJTw7mYeREAU2FKK9tWNLNpxoCWA+jUtBO+jX1VGQ1FUe6aShbGdiMVzv0OUbshYjsUZ
EOT9mSP+p7VeV1Ytj2BsEsHqWdhyuM93Xi8dyuuF15i/+Ut7EvYx6FLh8gsyATAycaJdg7tGO85noCmAXg29MTKrHLXlVAUpW5SyaKq6XSQdAIubIfInVo1WCQ0aI70nch5p5EsjGnMxrWXySkIp22z+jw12Brnpqnsu/Q1ozYcJ7swGwAHKwceaPUAvVx60alJJxpZNzLuY1MUpdZSyaKqZF3RVqc78j1cPQcIcAmEvq+Q2WogqxIbsuxYAucPZlLP8iLdfa5j3ziKk2n7WHQxGS6Cc31nRnqMxL+JP14OXrjZuakRS4qiVAmVLAxJp9OGvB7+DuKPABKa+sDo/0GbwURmW/P9n1Gs33mJAs7TskUE/oGxxOee5mhuHhaXLOjh3IOn/J/Cv4k/rWxbqeSgKIpRqGRhCMWT5/Z9ol1FOLhDv3+T5z6IoznNORqTxqGjpziRHI5FvUSc28WSWhROGpIGpq2Y4DmeXs698G/ij435nWdcK4qiVAWVLCpTUQGcWgH7PtXmRTh6kTdmASuKGrH2/D4iwt9DZ3EJYXoDYaWjnn5itHW9Fjzh/gTD3Yer2dKKolRLKlncLykh8bg2kunEb+RkJnLGqR37g2aw/XoGCSfmgUkeUgoa2Ljg3ag7no7NaVLPATc7N3wa+6iOaUVRqj2VLO5FbgZE/QHnt1AYtZu4/DSCrazY08iZw41cKeQGpOxGl+eIo3kXBrbqzWOBA2lWv/zV6hRFUaojlSwqqjAP4g6SeeIXwmJ2cdoMwurVJ6RJfdLR5jLoCiwpzOxIMws/HmzfnckBXmWuEaEoilKTqGRxJ/nZyDNriIzYyI4rx9hlaUaEhTmyiT0AMr8RhRktkNlt8G3sRxeXdjzQvikdW6qFgBRFqV1qTLIQQgwBvgRMgf+TUn5oqHPJoiKC97/HzrAlHDQXxFqYIxrUo0FuYwqv+pKX646HbTu6u7Wgm0cjOrs60NCm7LWqFUVRaroakSyEEKbA/4CBQAJwTAixQUoZXtnnCovYzcd//pPj5gWYW1tgkd2c3LRAnM270MPNje7dG9PV3YFG9S0r+9SKoijVVo1IFkBnIFJKGQ0ghFgGjAYqNVn8cWQlr4e9jTSBTul+NHZ6nl7+znTzaKT6HhRFqdNqSrJwBuJL/ZwAdCm9gxBiNjAboGXLlvd0EhcnL1xDrXmi84f07vjAPYaqKIpS+9SUZFEuKeUCYAFAYGCgvJdjeLb0Yck/gis1LkVRlNqgphQaSgRKL+Pmot+mKIqiVIGakiyOAW2EEG5CCAtgMrDByDEpiqLUGTWiGUpKWSiEeBrYhjZ09kcpZZiRw1IURakzakSyAJBSbgY2GzsORVGUuqimNEMpiqIoRqSShaIoilIulSwURVGUcqlkoSiKopRLSHlP89eqNSHEVSDuPg7RGEippHAqk4rr7qi47o6K6+7UxrhaSSkdb/eLWpks7pcQIlhKGWjsOG6l4ro7Kq67o+K6O3UtLtUMpSiKopRLJQtFURSlXCpZ3N4CYwdQBhXX3VFx3R0V192pU3GpPgtFURSlXOrKQlEURSmXShaKoihKuVSyKEUIMUQIcV4IESmEeKUKztdCCLFbCBEuhAgTQjyr3/62ECJRCBGq/xpW6j6v6uM7L4QYbKjYhRCxQojT+vMH67c5CCF2CCEi9N8b6rcLIcRX+nOfEkJ0KnWcGfr9I4QQM+4zpralnpNQIcR1IcRzxni+hBA/CiGuCCHOlNpWac+PECJA//xH6u8r7iOueUKIc/pzrxVC2Ou3uwohcko9b/PLO39Zj/Ee46q0101oyxcc0W9fLrSlDO41ruWlYooVQoQa4fkq67PBeH9jUkr1pfXbmAJRgDtgAZwE2hv4nE5AJ/3tBsAFoD3wNvDibfZvr4/LEnDTx2tqiNiBWKDxLds+Bl7R334F+Eh/exiwBRBAV+CIfrsDEK3/3lB/u2Elvl6XgVbGeL6A3kAn4Iwhnh/gqH5fob/v0PuIa
xBgpr/9Uam4XEvvd8txbnv+sh7jPcZVaa8bsAKYrL89H5hzr3Hd8vtPgTeN8HyV9dlgtL8xdWXxl85ApJQyWkqZDywDRhvyhFLKJCnlcf3tTOAs2nrjZRkNLJNS5kkpY4BIfdxVFfto4Bf97V+AMaW2L5Kaw4C9EMIJGAzskFKmSSmvATuAIZUUywAgSkp5p5n6Bnu+pJR7gbTbnO++nx/972yllIel9q5eVOpYdx2XlHK7lLJQ/+NhtJUmy1TO+ct6jHcd1x3c1eum/4+4P7CqMuPSH3cisPROxzDQ81XWZ4PR/sZUsviLMxBf6ucE7vzBXamEEK5AR+CIftPT+svJH0tdupYVoyFil8B2IUSIEGK2fltTKWWS/vZloKkR4io2mZvfxMZ+vqDynh9n/e3Kjg/gUbT/Iou5CSFOCCH+FEL0KhVvWecv6zHeq8p43RoB6aUSYmU9X72AZCllRKltVf583fLZYLS/MZUsqgEhRH1gNfCclPI68B3gAfgDSWiXwlWtp5SyEzAUeEoI0bv0L/X/jRhl3LW+PXoUsFK/qTo8Xzcx5vNTFiHEa0AhsFi/KQloKaXsCLwALBFC2Fb0eJXwGKvd63aLKdz8D0mVP1+3+Wy4r+PdD5Us/pIItCj1s4t+m0EJIczR/hgWSynXAEgpk6WURVJKHfAD2uX3nWKs9NillIn671eAtfoYkvWXr8WX3leqOi69ocBxKWWyPkajP196lfX8JHJzU9F9xyeEmAmMAKbpP2TQN/Ok6m+HoPUHeJZz/rIe412rxNctFa3ZxeyW7fdMf6wHgeWl4q3S5+t2nw13OJ7h/8Yq0tlSF77QlpiNRutQK+488zbwOQVaW+EXt2x3KnX7ebT2WwBvbu74i0br9KvU2AEboEGp2wfR+hrmcXPn2sf628O5uXPtqH67AxCD1rHWUH/boRKet2XAI8Z+vrilw7Mynx/+3vk47D7iGgKEA4637OcImOpvu6N9WNzx/GU9xnuMq9JeN7SrzNId3E/ea1ylnrM/jfV8UfZng9H+xgz2QVgTv9BGFFxA+4/htSo4X0+0y8hTQKj+axjwK3Bav33DLW+q1/TxnafU6IXKjF3/Rjip/worPh5a2/AuIALYWeqPTgD/05/7NBBY6liPonVQRlLqA/4+YrNB+0/SrtS2Kn++0JonkoACtPbexyrz+QECgTP6+3yDvtrCPcYVidZuXfw3Nl+/7zj96xsKHAdGlnf+sh7jPcZVaa+b/m/2qP6xrgQs7zUu/fafgSdu2bcqn6+yPhuM9jemyn0oiqIo5VJ9FoqiKEq5VLJQFEVRyqWShaIoilIulSwURVGUcqlkoSiKopRLJQtFqWRCq4Rbz9hxKEplUkNnFaWSCSFi0ca5pxg7FkWpLOrKQlHugxDCRgixSQhxUghxRgjxFtAc2C2E2K3fZ5AQ4pAQ4rgQYqW+3k/xmiEf69cUOCqEaK3fPkF/rJNCiL3Ge3SK8heVLBTl/gwBLkkpO0gpfYAvgEtAPyllPyFEY+B14AGpFWYMRitCVyxDSumLNoP2C/22N4HBUsoOaAUTFcXoVLJQlPtzGhgohPhICNFLSplxy++7oi1ac0BoK67NQFuwqdjSUt+76W8fAH4WQsxCq4mkKEZnVv4uiqKURUp5Qb+E5TDgPSHErlt2EWiLz0wp6xC33pZSPiGE6IJWHC5ECBEg9dVOFcVY1JWFotwHIURzIFtK+RtaRdBOQCbaUpigrUzXo1R/hI0QwrPUISaV+n5Iv4+HlPKIlPJN4Co3l5hWFKNQVxaKcn98gXlCCB1a5dI5aM1JW4UQl/T9FjOBpUIIS/19XkernArQUAhxCshDW2wH/fHaoF2V7EKr/qsoRqWGziqKkaghtkpNopqhFEVRlHKpKwtFURSlXOrKQlEURSmXShaKoihKuVSyUBRFUcqlkoWiKIpSLpUsFEVRlHL9P1MwrRB4knmhAAAAAElFTkSuQmCC", 
"text/plain": [ - "" + "
" ] }, - "metadata": {}, + "metadata": { + "needs_background": "light" + }, "output_type": "display_data" } ], @@ -498,944 +631,878 @@ " ThompsonSamplingAgent(),\n", " EpsilonGreedyAgent(),\n", " UCBAgent(),\n", - " YourAgent()\n", + "# YourAgent()\n", "]\n", "\n", - "plot_regret(DriftingBandit(), drifting_agents, n_steps=20000, n_trials=10)" + "regret = get_regret(DriftingBandit(), drifting_agents, n_steps=20000, n_trials=10)\n", + "plot_regret(drifting_agents, regret)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Part 2. Contextual bandit\n", + "## Part 2. Exploration in MDP\n", "\n", - "Now we will solve much more complex problem - reward will depend on bandit's state.\n", - "\n", - "**Real-word analogy:**\n", - "\n", - "> Contextual advertising. We have a lot of banners and a lot of different users. Users can have different features: age, gender, search requests. We want to show banner with highest click probability.\n", - "\n", - "If we want use strategies from above, we need some how store reward distributions conditioned both on actions and bandit's state. \n", - "One way to do this - use bayesian neural networks. Instead of giving pointwise estimates of target, they maintain probability distributions\n", + "The following problem, called \"deep see\", illustrates importance of exploration in context of mdp's." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", "\n", - "\n", - "Picture from https://arxiv.org/pdf/1505.05424.pdf\n", + "The deep sea problem is implemented as an $N×N$ grid with a one-hot encoding for state.\n", + "The agent begins each episode in the top left corner of the grid and descends one row\n", + "per timestep. Each episode terminates after N steps, when the agent reaches the bottom\n", + "row. In each state there is a random but fixed mapping between actions $A = {0,1}$ and\n", + "the transitions ‘left’ and ‘right’. 
At each timestep there is a small cost $r = −0.01/N$ of\n", + "moving right, and $r = 0$ for moving left. However, should the agent transition right at every\n", + "timestep of the episode it will be rewarded with an additional reward of $+1$.\n", "\n", + "**Question:** Why is the deep see a challengin exploration problem?\n", "\n", - "More material:\n", - " * A post on the matter - [url](http://twiecki.github.io/blog/2016/07/05/bayesian-deep-learning/)\n", - " * Theano+PyMC3 for more serious stuff - [url](http://pymc-devs.github.io/pymc3/notebooks/bayesian_neural_network_advi.html)\n", - " * Same stuff in tensorflow - [url](http://edwardlib.org/tutorials/bayesian-neural-network)\n", - " \n", - "Let's load our dataset:" + "See full paper [here](https://openreview.net/forum?id=rygf-kSYwH)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 304, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "State size: 60, actions: 10\n" + "\u001b[1m\u001b[37mLoaded bsuite_id: deep_sea/0.\u001b[0m\n" ] + }, + { + "data": { + "text/plain": [ + "{'size': 10, 'mapping_seed': 42}" + ] + }, + "execution_count": 304, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "all_states = np.load(\"all_states.npy\")\n", - "action_rewards = np.load(\"action_rewards.npy\")\n", + "import gymnasium as gym\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import torch\n", + "from torch import nn\n", + "from time import sleep\n", + "from tqdm import tqdm\n", + "from IPython.display import clear_output\n", "\n", - "state_size = all_states.shape[1]\n", - "n_actions = action_rewards.shape[1]\n", + "from q_learning_agent import QLearningAgent\n", + "from replay_buffer import ReplayBuffer\n", "\n", - "print(\"State size: %i, actions: %i\" % (state_size, n_actions))" + "env = gym.make(\"bsuite/deep_sea-v0\", size=10, seed=42)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 
2.1 Epsilon-greedy q-learning " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 150, "metadata": {}, "outputs": [], "source": [ - "import theano\n", - "import theano.tensor as T\n", - "import lasagne\n", - "from lasagne import init\n", - "from lasagne.layers import *\n", - "import bayes\n", - "\n", - "as_bayesian = bayes.bbpwrap(bayes.NormalApproximation(std=0.1))\n", - "BayesDenseLayer = as_bayesian(DenseLayer)" + "def get_state_number(s):\n", + " return np.argmax(s.flatten())\n", + "\n", + "def test_agent(agent, greedy=True, delay=.5):\n", + " v = get_all_states_value(agent)\n", + " s, _ = env.reset()\n", + " done = False\n", + " while not done:\n", + " fig, ax = plt.subplots(ncols=2)\n", + " ax[0].imshow(s)\n", + " ax[0].set_title('State')\n", + " im = ax[1].imshow(v)\n", + " plt.colorbar(im)\n", + " ax[1].set_title('Value function')\n", + " clear_output(True)\n", + " plt.show()\n", + " s = get_state_number(s)\n", + " if greedy:\n", + " a = agent.get_best_action(s)\n", + " else:\n", + " a = agent.get_action(s)\n", + "\n", + " s, r, terminated, truncated, _ = env.step(a)\n", + " done = terminated or truncated\n", + " sleep(delay)\n", + "\n", + "def get_all_states_value(agent):\n", + " s_shape = env.observation_space.shape\n", + " s_shape_flatten = np.prod(s_shape)\n", + " v = np.zeros(s_shape_flatten)\n", + " for i in range(s_shape_flatten):\n", + " v[i] = agent.get_value(i)\n", + " v = v.reshape(s_shape)\n", + " return v\n", + "\n", + "def to_one_hot(x, ndims):\n", + " \"\"\" helper: take an integer vector and convert it to 1-hot matrix. 
\"\"\"\n", + " x = x.long().view(-1, 1)\n", + " x = torch.zeros(\n", + " x.size()[0], ndims).scatter_(1, x, 1)\n", + " return x" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 151, "metadata": {}, + "outputs": [], "source": [ - "## 2.1 Bulding a BNN agent\n", - "\n", - "Let's implement epsilon-greedy BNN agent" + "agent = QLearningAgent(\n", + " epsilon=1, \n", + " alpha=0.5, \n", + " discount=1, \n", + " get_legal_actions=lambda s: range(env.action_space.n)\n", + ")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 152, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], "source": [ - "class BNNAgent:\n", - " \"\"\"a bandit with bayesian neural net\"\"\"\n", - "\n", - " def __init__(self, state_size, n_actions):\n", - " input_states = T.matrix(\"states\")\n", - " target_actions = T.ivector(\"actions taken\")\n", - " target_rewards = T.vector(\"rewards\")\n", - "\n", - " self.total_samples_seen = theano.shared(\n", - " np.int32(0), \"number of training samples seen so far\")\n", - " batch_size = target_actions.shape[0] # por que?\n", - "\n", - " # Network\n", - " inp = InputLayer((None, state_size), name='input')\n", - " # YOUR NETWORK HERE\n", - " out = \n", - "\n", - " # Prediction\n", - " prediction_all_actions = get_output(out, inputs=input_states)\n", - " self.predict_sample_rewards = theano.function(\n", - " [input_states], prediction_all_actions)\n", - "\n", - " # Training\n", - "\n", - " # select prediction for target action\n", - " prediction_target_actions = prediction_all_actions[T.arange(\n", - " batch_size), target_actions]\n", - "\n", - " # loss = negative log-likelihood (mse) + KL\n", - " negative_llh = T.sum((prediction_target_actions - target_rewards)**2)\n", - "\n", - " kl = bayes.get_var_cost(out) / (self.total_samples_seen+batch_size)\n", - "\n", - " loss = (negative_llh + kl)/batch_size\n", - "\n", - " self.weights = get_all_params(out, trainable=True)\n", - " self.out = out\n", - "\n", - " # gradient descent\n", - " updates = lasagne.updates.adam(loss, self.weights)\n", - " # update counts\n", - " updates[self.total_samples_seen] = self.total_samples_seen + \\\n", - " batch_size.astype('int32')\n", - "\n", - " self.train_step = theano.function([input_states, target_actions, target_rewards],\n", - " [negative_llh, kl],\n", - " updates=updates,\n", - " allow_input_downcast=True)\n", - "\n", - " def sample_prediction(self, states, n_samples=1):\n", - " \"\"\"Samples n_samples predictions for rewards,\n", - "\n", 
- " :returns: tensor [n_samples, state_i, action_i]\n", - " \"\"\"\n", - " assert states.ndim == 2, \"states must be 2-dimensional\"\n", - "\n", - " return np.stack([self.predict_sample_rewards(states) for _ in range(n_samples)])\n", - "\n", - " epsilon = 0.25\n", - "\n", - " def get_action(self, states):\n", - " \"\"\"\n", - " Picks action by \n", - " - with p=1-epsilon, taking argmax of average rewards\n", - " - with p=epsilon, taking random action\n", - " This is exactly e-greedy policy.\n", - " \"\"\"\n", - "\n", - " reward_samples = self.sample_prediction(states, n_samples=100)\n", - " # ^-- samples for rewards, shape = [n_samples,n_states,n_actions]\n", - "\n", - " best_actions = reward_samples.mean(axis=0).argmax(axis=-1)\n", - " # ^-- we take mean over samples to compute expectation, then pick best action with argmax\n", - "\n", - " \n", - " chosen_actions = \n", - "\n", - " return chosen_actions\n", - "\n", - " def train(self, states, actions, rewards, n_iters=10):\n", - " \"\"\"\n", - " trains to predict rewards for chosen actions in given states\n", - " \"\"\"\n", - " loss_sum = kl_sum = 0\n", - " for _ in range(n_iters):\n", - " loss, kl = self.train_step(states, actions, rewards)\n", - " loss_sum += loss\n", - " kl_sum += kl\n", - "\n", - " return loss_sum / n_iters, kl_sum / n_iters\n", - "\n", - " @property\n", - " def name(self):\n", - " return self.__class__.__name__" + "test_agent(agent, greedy=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 2.2 Training the agent" + "Let's try to solve this by q-learning with high epsilon!" 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ - "N_ITERS = 100" + "for i in range(5000):\n", + " s, _ = env.reset()\n", + " done = False\n", + " while not done:\n", + " i_s = get_state_number(s)\n", + " a = agent.get_action(i_s)\n", + " s_next, r, terminated, truncated, _ = env.step(a)\n", + " done = terminated or truncated\n", + " i_s_next = get_state_number(s_next)\n", + " agent.update(i_s, a, r, i_s_next, terminated)\n", + " s = s_next" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], "source": [ - "def get_new_samples(states, action_rewards, batch_size=10):\n", - " \"\"\"samples random minibatch, emulating new users\"\"\"\n", - " batch_ix = np.random.randint(0, len(states), batch_size)\n", - " return states[batch_ix], action_rewards[batch_ix]" + "test_agent(agent)" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "from IPython.display import clear_output\n", - "\n", - "from pandas import DataFrame\n", - "moving_average = lambda x, **kw: DataFrame(\n", - " {'x': np.asarray(x)}).x.ewm(**kw).mean().values\n", - "\n", - "def train_contextual_agent(agent, batch_size=10, n_iters=100):\n", - " rewards_history = []\n", - "\n", - " for i in range(n_iters):\n", - " b_states, b_action_rewards = get_new_samples(\n", - " all_states, action_rewards, batch_size)\n", - " b_actions = agent.get_action(b_states)\n", - " b_rewards = b_action_rewards[\n", - " np.arange(batch_size), b_actions\n", - " ]\n", - "\n", - " mse, kl = agent.train(b_states, b_actions, b_rewards, n_iters=100)\n", - "\n", - " rewards_history.append(b_rewards.mean())\n", - "\n", - " if i % 10 == 0:\n", - " clear_output(True)\n", - " print(\"iteration #%i\\tmean reward=%.3f\\tmse=%.3f\\tkl=%.3f\" %\n", - " (i, np.mean(rewards_history[-10:]), mse, kl))\n", - " plt.plot(rewards_history)\n", - " plt.plot(moving_average(np.array(rewards_history), alpha=0.1))\n", - " plt.title(\"Reward per epesode\")\n", - " plt.xlabel(\"Episode\")\n", - " plt.ylabel(\"Reward\")\n", - " plt.show()\n", - "\n", - " samples = agent.sample_prediction(\n", - " b_states[:1], n_samples=100).T[:, 0, :]\n", - " for i in range(len(samples)):\n", - " plt.hist(samples[i], alpha=0.25, label=str(i))\n", - " plt.legend(loc='best')\n", - " print('Q(s,a) std:', ';'.join(\n", - " list(map('{:.3f}'.format, np.std(samples, axis=1)))))\n", - " 
print('correct', b_action_rewards[0].argmax())\n", - " plt.title(\"p(Q(s, a))\")\n", - " plt.show()\n", - "\n", - " return moving_average(np.array(rewards_history), alpha=0.1)" + "But if we do bigger env:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 305, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "iteration #90\tmean reward=0.560\tmse=0.457\tkl=0.044\n" + "\u001b[1m\u001b[37mLoaded bsuite_id: deep_sea/1.\u001b[0m\n" ] }, { "data": { - "image/png": "\n", "text/plain": [ - "" + "{'size': 12, 'mapping_seed': 42}" ] }, + "execution_count": 305, "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Q(s,a) std: 0.178;0.011;0.000;0.000;0.195;0.000;0.000;0.124;0.023;0.000\n", - "correct 4\n" - ] - }, + "output_type": "execute_result" + } + ], + "source": [ + "env = gym.make(\"bsuite/deep_sea-v0\", size=12, seed=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 317, + "metadata": {}, + "outputs": [], + "source": [ + "agent = QLearningAgent(\n", + " epsilon=1,\n", + " alpha=0.5,\n", + " discount=1,\n", + " get_legal_actions=lambda s: range(env.action_space.n)\n", + ")\n", + "\n", + "for i in range(5000):\n", + " s, _ = env.reset()\n", + " done = False\n", + " while not done:\n", + " i_s = get_state_number(s)\n", + " a = agent.get_action(i_s)\n", + " s_next, r, terminated, truncated, _ = env.step(a)\n", + " done = terminated or truncated\n", + " i_s_next = get_state_number(s_next)\n", + " agent.update(i_s, a, r, i_s_next, terminated)\n", + " s = s_next" + ] + }, + { + "cell_type": "code", + "execution_count": 318, + "metadata": {}, + "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ - "" + "
" ] }, - "metadata": {}, + "metadata": { + "needs_background": "light" + }, "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:37: FutureWarning: pd.ewm_mean is deprecated for ndarrays and will be removed in a future version\n" - ] } ], "source": [ - "bnn_agent = BNNAgent(state_size=state_size, n_actions=n_actions)\n", - "greedy_agent_rewards = train_contextual_agent(\n", - " bnn_agent, batch_size=10, n_iters=N_ITERS)" + "test_agent(agent)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## HW 2.1 Better exploration\n", + "## 2.2 Reward shaping" + ] + }, + { + "cell_type": "code", + "execution_count": 308, + "metadata": {}, + "outputs": [], + "source": [ + "class BaseIntrinsicRewardModule(nn.Module):\n", + " def __init__(self):\n", + " super().__init__()\n", + "\n", + " def get_intrinsic_reward(self, state, action, next_state):\n", + " return 0.0\n", "\n", - "Use strategies from first part to gain more reward in contextual setting" + " def get_loss(self, state_batch, action_batch, next_state_batch):\n", + " pass" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 309, "metadata": {}, "outputs": [], "source": [ - "class ThompsonBNNAgent(BNNAgent):\n", - " def get_action(self, states):\n", - " \"\"\"\n", - " picks action based by taking _one_ sample from BNN and taking action with highest sampled reward (yes, that simple)\n", - " This is exactly thompson sampling.\n", - " \"\"\"\n", + "def train_with_reward(env, agent, reward_module, n_episodes=100, update_reward_period=100, batch_size=100, n_iter=10):\n", + " buffer = ReplayBuffer(size=int(1e6))\n", + " \n", + " if list(reward_module.parameters()):\n", + " optimizer = torch.optim.Adam(reward_module.parameters())\n", + " else:\n", + " optimizer = None\n", + "\n", + " losses = []\n", + " s, _ = env.reset()\n", + "\n", + " for i in range(n_episodes):\n", + " done = 
False\n", + " \n", + " while not done:\n", + " i_s = get_state_number(s)\n", + " a = agent.get_action(i_s)\n", + " s_next, r, terminated, truncated, _ = env.step(a)\n", + " done = terminated or truncated\n", + " i_s_next = get_state_number(s_next)\n", + " \n", + " state_t = torch.tensor(s).float().view(1, -1)\n", + " action_t = torch.tensor(a).float().view(1, -1)\n", + " next_state_t = torch.tensor(s_next).float().view(1, -1)\n", + "\n", + " r_intr = reward_module.get_intrinsic_reward(state_t, action_t, next_state_t)\n", + " r += r_intr\n", + "\n", + " agent.update(i_s, a, r, i_s_next, terminated)\n", + " buffer.add(s, a, r, s_next, terminated)\n", + "\n", + " s = s_next\n", + "\n", + " if (i + 1) % update_reward_period == 0 and optimizer is not None:\n", + " \n", + " for _ in range(n_iter):\n", + " optimizer.zero_grad()\n", + " state_batch, action_batch, _, next_state_batch, _ = buffer.sample(batch_size)\n", + " \n", + " state_tensor = torch.tensor(state_batch).float().flatten(1, 2)\n", + " action_tensor = torch.tensor(action_batch).float().view(-1, 1)\n", + " next_state_tensor = torch.tensor(next_state_batch).float().flatten(1, 2)\n", + " \n", + " loss = reward_module.get_loss(state_tensor, action_tensor, next_state_tensor)\n", + " loss.backward()\n", + " optimizer.step()\n", + " losses.append(loss.item())\n", + " \n", + " fig, ax = plt.subplots(ncols=2)\n", + " ax[0].set_title('Value function after iter: %d' % i)\n", + " im = ax[0].imshow(get_all_states_value(agent))\n", + " ax[1].plot(losses)\n", + " ax[1].set_title('Random network distillation loss')\n", + " clear_output(True)\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 310, + "metadata": {}, + "outputs": [], + "source": [ + "class GoRightReward(BaseIntrinsicRewardModule):\n", + " def __init__(self):\n", + " super().__init__()\n", "\n", - " " + " def get_intrinsic_reward(self, state, action, next_state):\n", + " # " ] }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 311, "metadata": {}, + "outputs": [], + "source": [ + "agent = QLearningAgent(\n", + " epsilon=.1, \n", + " alpha=0.5, \n", + " discount=.9, \n", + " get_legal_actions=lambda s: range(env.action_space.n)\n", + ")\n", + "\n", + "go_right = GoRightReward()\n", + "\n", + "train_with_reward(env, agent, go_right, n_episodes=500)" + ] + }, + { + "cell_type": "code", + "execution_count": 312, + "metadata": { + "scrolled": true + }, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "iteration #90\tmean reward=0.360\tmse=0.590\tkl=0.038\n" - ] - }, { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ - "" + "
" ] }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Q(s,a) std: 0.000;0.028;0.277;0.000;0.044;0.059;0.063;0.093;0.000;0.018\n", - "correct 2\n" - ] - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAFo9JREFUeJzt3XuQ1eWd5/H3NzRIRCIIzcVutBs1Cl4GEYKWLiE6GmRSeIEYjBlRIeymMjuw2dTq7FZtCmsqManNGk2y2aDxMnFKkmGthSWGDUEtjRnFBjQhYS1RVJogtC0gV/vis3/00bQE6KbP7/Tp/vX7VdV1frfzPN9zWj/8+umnnxMpJSRJ+fWxchcgSSotg16Scs6gl6ScM+glKecMeknKOYNeknLOoJeknDPoJSnnDHpJyrmKchcAMHz48FRTU1PuMiSpV1m3bt3bKaXKjq7rEUFfU1NDXV1ducuQpF4lIt7ozHUO3UhSzhn0kpRzBr0k5VyPGKOXpHJpbm6mvr6eQ4cOlbuUoxo4cCDV1dX079+/S8836CX1afX19QwePJiamhoiotzl/IWUEo2NjdTX11NbW9ulNjocuomIByJiZ0RsbHfslIhYHRGvFB6HFo5HRNwbEZsj4ncRMbFLVUlSNzl06BDDhg3rkSEPEBEMGzasqJ84OjNG/xAw/bBjdwBrUkpnAWsK+wBXA2cVvhYAP+pyZZLUTXpqyH+g2Po6DPqU0tPAO4cdvgZ4uLD9MHBtu+P/lNo8BwyJiNFFVShJKkpXx+hHppS2F7bfAkYWtquAre2uqy8c244k9QL/+mpjpu1dcsawDq9ZtWoVCxcupLW1lfnz53PHHXd0+JzjUfQvY1NKKSKO+xPGI2IBbcM7nHbaacWW0af964ofHPdzLpn5dyWoRNLxam1t5atf/SqrV6+murqayZMnM3PmTMaPH59ZH12dR7/jgyGZwuPOwvFtwJh211UXjv2FlNKSlNKklNKkysoOl2qQpFxau3YtZ555JmPHjmXAgAHMmTOH5cuXZ9pHV4N+BTC3sD0XWN7u+M2F2TcXA3vaDfFIkg6zbds2xoz58/1xdXU127Yd8f64yzocuomIR4FpwPCIqAe+AdwF/Dwi5gFvADcULn8cmAFsBg4At2ZarSTpuHUY9CmlG49y6oojXJuArxZblI7Pxn4nA3Be654yVyLpeFVVVbF165/nsNTX11NVVZVpH651I0llNHnyZF555RW2bNlCU1MTS5cuZebMmZn24RIIktROZ6ZDZqmiooIf/OAHfPazn6W1tZXbbruNc889N9s+Mm1NknTcZsyYwYwZM0rWvkM3kpRzBr0k5ZxBL0k5Z9BLUs4Z9JKUcwa9JOWc0ytzoOWdto8LaD74Nv1HjezgaknHtOWZbNur/TcdXnLbbbexcuVKRowYwcaNGzu8/nh5Ry9JZXbLLbewatWqkrVv0EtSmU2dOpVTTjmlZO0b9JKUcwa9JOWcQS9JOWfQ50BF6wAqWgcwoOUEKg74LZX0UU6vlKT2OjEdMms33ngjTz31FG+//TbV1dUsXryYefPmZda+QS9JZfboo4+WtH1/zpeknDPoJSnnDHpJyjmDXpJyzqCXpJwz6CUp55xemQP7328CYM/7h+j33j5aI33k/Cc+PqgcZUm90gtvvZBpe5NHTT7m+a1bt3LzzTezY8cOIoIFCxawcOHCTGsw6CWpjCoqKvj
ud7/LxIkT2bt3LxdddBFXXnkl48ePz6wPh25yoKW5iZbmJpreO1TuUiQdp9GjRzNx4kQABg8ezLhx49i2bVumfRj0ktRDvP7662zYsIEpU6Zk2q5BL0k9wL59+5g1axbf+973+MQnPpFp2wa9JJVZc3Mzs2bN4qabbuL666/PvH2DXpLKKKXEvHnzGDduHF/72tdK0oezbiSpnY6mQ2bt2Wef5ac//Snnn38+EyZMAOCb3/wmM2bMyKyPooI+Iv4DMB9IwO+BW4HRwFJgGLAO+NuUUlORdUpSLl122WWklDq+sAhdHrqJiCrg74FJKaXzgH7AHODbwN0ppTOBXUB2q+dLko5bsWP0FcDHI6ICOBHYDlwOLCucfxi4tsg+JElF6HLQp5S2Af8NeJO2gN9D21DN7pRSS+GyeqCq2CIlSV1XzNDNUOAaoBY4FRgETD+O5y+IiLqIqGtoaOhqGZKkDhQzdPPXwJaUUkNKqRl4DLgUGFIYygGoBo74t7wppSUppUkppUmVlZVFlCFJOpZigv5N4OKIODEiArgC+CPwJDC7cM1cYHlxJUqSitHl6ZUppecjYhmwHmgBNgBLgF8ASyPiHwvHfpJFoZLUHfY/vzbT9gZN+dQxzx86dIipU6fy3nvv0dLSwuzZs1m8eHGmNRQ1jz6l9A3gG4cdfg049iuTJAFwwgkn8MQTT3DSSSfR3NzMZZddxtVXX83FF1+cWR8ugSBJZRQRnHTSSUDbmjfNzc20jYZnx6CXpDJrbW1lwoQJjBgxgiuvvNJliiUpb/r168eLL75IfX09a9euZePGjZm2b9BLUg8xZMgQPvOZz7Bq1apM2zXoJamMGhoa2L17NwAHDx5k9erVnHPOOZn24TLFktROR9Mhs7Z9+3bmzp1La2sr77//PjfccAOf+9znMu3DoJekMrrgggvYsGFDSftw6EaScs6gl6ScM+glKecMeknKOYNeknLOoJeknHN6pSS1s+3lXZm2V3X20E5d19rayqRJk6iqqmLlypWZ1uAdvST1APfccw/jxo0rSdsGvSSVWX19Pb/4xS+YP39+Sdo36CWpzBYtWsR3vvMdPvax0kSyQS9JZbRy5UpGjBjBRRddVLI+DHpJKqNnn32WFStWUFNTw5w5c3jiiSf40pe+lGkfBr0kldG3vvUt6uvref3111m6dCmXX345jzzySKZ9OL1Sktrp7HTI3sSgl6QeYtq0aUybNi3zdh26kaScM+glKecMeknKOYNeknLOoJeknDPoJSnnnF4pSe1s/cPvMm1vzLkXdHhNTU0NgwcPpl+/flRUVFBXV5dpDQa9JPUATz75JMOHDy9J2w7dSFLOGfSSVGYRwVVXXcVFF13EkiVLMm/foRtJKrPf/OY3VFVVsXPnTq688krOOeccpk6dmln7Rd3RR8SQiFgWEf8vIjZFxCURcUpErI6IVwqP+VshSJIyVFVVBcCIESO47rrrWLt2babtFzt0cw+wKqV0DvBXwCbgDmBNSuksYE1hX5J0BPv372fv3r0fbv/qV7/ivPPOy7SPLg/dRMTJwFTgFoCUUhPQFBHXANMKlz0MPAXcXkyRktRdOjMdMks7duzguuuuA6ClpYUvfvGLTJ8+PdM+ihmjrwUagAcj4q+AdcBCYGRKaXvhmreAkcWVKEn5NXbsWF566aWS9lHM0E0FMBH4UUrpQmA/hw3TpJQSkI705IhYEBF1EVHX0NBQRBmSpGMpJujrgfqU0vOF/WW0Bf+OiBgNUHjceaQnp5SWpJQmpZQmVVZWFlGGJOlYuhz0KaW3gK0RcXbh0BXAH4EVwNzCsbnA8qIqlCQVpdh59P8e+OeIGAC8BtxK2z8eP4+IecAbwA1F9iFJKkJRQZ9SehGYdIRTVxTTriQpOy6BIEk55xIIktTOoVd3Z9rewDOGdHjN7t27mT9/Phs3biQieOCBB7jkkksyq8Ggl6QyW7hwIdOnT2fZsmU0NTVx4MCBTNs36CWpjPb
s2cPTTz/NQw89BMCAAQMYMGBApn04Ri9JZbRlyxYqKyu59dZbufDCC5k/fz779+/PtA+DXpLKqKWlhfXr1/OVr3yFDRs2MGjQIO66665M+zDoJamMqqurqa6uZsqUKQDMnj2b9evXZ9qHQS9JZTRq1CjGjBnDyy+/DMCaNWsYP358pn34y1hJaqcz0yGz9v3vf5+bbrqJpqYmxo4dy4MPPphp+wa9JJXZhAkTqKurK1n7Dt1IUs4Z9JKUcwa9JOWcQS9JOWfQS1LOGfSSlHNOr5SkdrZs2ZJpe7W1tcc8//LLL/OFL3zhw/3XXnuNO++8k0WLFmVWg0EvSWV09tln8+KLLwLQ2tpKVVUV1113XaZ9OHQjST3EmjVrOOOMMzj99NMzbdegl6QeYunSpdx4442Zt2vQS1IP0NTUxIoVK/j85z+fedsGvST1AL/85S+ZOHEiI0eOzLxtg16SeoBHH320JMM24KwbSfqIjqZDlsL+/ftZvXo1P/7xj0vSvkEvSWU2aNAgGhsbS9a+QzeSlHMGvSTlnEEvSTln0EtSzhn0kpRzBr0k5ZzTKyWpnV27nsu0vaFDL+7wmrvvvpv777+fiOD888/nwQcfZODAgZnVUPQdfUT0i4gNEbGysF8bEc9HxOaI+FlEDCi+TEnKp23btnHvvfdSV1fHxo0baW1tZenSpZn2kcXQzUJgU7v9bwN3p5TOBHYB8zLoQ5Jyq6WlhYMHD9LS0sKBAwc49dRTM22/qKCPiGrgb4D7C/sBXA4sK1zyMHBtMX1IUp5VVVXx9a9/ndNOO43Ro0dz8sknc9VVV2XaR7F39N8D/hPwfmF/GLA7pdRS2K8HqorsQ5Jya9euXSxfvpwtW7bwpz/9if379/PII49k2keXgz4iPgfsTCmt6+LzF0REXUTUNTQ0dLUMSerVfv3rX1NbW0tlZSX9+/fn+uuv57e//W2mfRRzR38pMDMiXgeW0jZkcw8wJCI+mM1TDWw70pNTSktSSpNSSpMqKyuLKEOSeq/TTjuN5557jgMHDpBSYs2aNYwbNy7TPro8vTKl9A/APwBExDTg6ymlmyLiX4DZtIX/XGB5BnVKUrfozHTILE2ZMoXZs2czceJEKioquPDCC1mwYEGmfZRiHv3twNKI+EdgA/CTEvQhSbmxePFiFi9eXLL2Mwn6lNJTwFOF7deAT2XRriSpeC6BIEk5Z9BLUs4Z9JKUcwa9JOWcQS9JOecyxZLUzrO79mba3qVDB3d4zT333MN9991HSokvf/nLLFq0KNMavKOXpDLauHEj9913H2vXruWll15i5cqVbN68OdM+DHpJKqNNmzYxZcoUTjzxRCoqKvj0pz/NY489lmkfBr0kldF5553HM888Q2NjIwcOHODxxx9n69atmfbhGL0kldG4ceO4/fbbueqqqxg0aBATJkygX79+mfbhHb0kldm8efNYt24dTz/9NEOHDuWTn/xkpu17Ry9JZbZz505GjBjBm2++yWOPPcZzz2X7AeUGvSS105npkFmbNWsWjY2N9O/fnx/+8IcMGTIk0/YNekkqs2eeeaak7TtGL0k5Z9BLUs4Z9JL6vJRSuUs4pmLrM+gl9WkDBw6ksbGxx4Z9SonGxkYGDhzY5Tb8ZaykPq26upr6+noaGhrKXcpRDRw4kOrq6i4/36CX1Kf179+f2tracpdRUg7dSFLOGfSSlHMGvSTlnEEvSTln0EtSzhn0kpRzBr0k5ZxBL0k5Z9BLUs4Z9JKUcy6BIPVRL7z1QqbtTR41OdP2lB3v6CUp5wx6Scq5Lgd9RIyJiCcj4o8R8YeIWFg4fkpErI6IVwqPQ7MrV5J0vIoZo28B/mNKaX1EDAbWRcRq4BZgTUrproi4A7gDuL34UiVlPa6uvqHLd/Qppe0ppfWF7b3AJqAKuAZ4uHDZw8C1xRYpSeq6TMboI6IGuBB4HhiZUtpeOPUWMPIoz1kQEXURUdeTP9lFknq7ooM+Ik4C/hewKKX
0bvtzqe1DGI/4QYwppSUppUkppUmVlZXFliFJOoqi5tFHRH/aQv6fU0qPFQ7viIjRKaXtETEa2FlskVJv5Zi6eoJiZt0E8BNgU0rpv7c7tQKYW9ieCyzvenmSpGIVc0d/KfC3wO8j4sXCsf8M3AX8PCLmAW8ANxRXoiSpGF0O+pTSb4A4yukrutquJClb/mWsJOWcQS9JOWfQS1LOGfSSlHMGvSTlnEEvSTln0EtSzhn0kpRzBr0k5ZxBL0k5Z9BLUs4Z9JKUcwa9JOWcQS9JOWfQS1LOFfVRgpL0gZ78sYmTR00udwll5R29JOWcQS9JOWfQS1LOGfSSlHMGvSTlnLNu+oB3D+7/i2Nbtmz5cLu2trY7y+nxevLsEakrvKOXpJwz6CUp5wx6Sco5g16Scs6gl6ScM+glKeecXtnHHNy7G4CG11+jsmYs8NGplodz6qXyIMsps71xgTTv6CUp57yjz5n3Duyjubn5w/2PDx7SLf1u/cPvPtwec+4F3dKnpM7xjl6Scq4kd/QRMR24B+gH3J9SuqsU/ejI+jf3p1/zuwCkAwcBaOnX9sgJp/zF9W9v3ffh9vAxJx2z7UOv7ubtrXs/cmz4mMHwVsufD5zbcY3dtcxA47bGY54fVjWsW+pQfmT93253jPlnfkcfEf2AHwJXA+OBGyNifNb9SJI6pxR39J8CNqeUXgOIiKXANcAfS9CXDtOv+V02DRoFA06ktaUfZ+/b0XaieXDb+UOF8fvXt9LwTisA+3a/B8Dgs2oAWH+w7e78vF2HOPR+2yydV9as/nMng0/9cLPhwDv8qfGEj9TQctiCaYde3f2R8wPP6J7fG0hqU4ox+ipga7v9+sIxSVIZlG3WTUQsABYUdvdFxMtdbGo48HY2VfVaff096OuvH3wP+urrP70zF5Ui6LcBY9rtVxeOfURKaQmwpNjOIqIupTSp2HZ6s77+HvT11w++B3399XekFEM3LwBnRURtRAwA5gArStCPJKkTMr+jTym1RMTfAf+XtumVD6SU/pB1P5KkzinJGH1K6XHg8VK0fQRFD//kQF9/D/r66wffg77++o8pUkrlrkGSVEIugSBJOdfrgj4iTomI1RHxSuFx6FGua42IFwtfvf6XwRExPSJejojNEXHHEc6fEBE/K5x/PiJqur/K0urEe3BLRDS0+77PL0edpRIRD0TEzojYeJTzERH3Ft6f30XExO6usZQ68fqnRcSedt///9rdNfZUvS7ogTuANSmls4A1hf0jOZhSmlD4mtl95WWvk8tKzAN2pZTOBO4Gvt29VZbWcSyt8bN23/f7u7XI0nsImH6M81cDZxW+FgA/6oaautNDHPv1AzzT7vt/ZzfU1Cv0xqC/Bni4sP0wcG0Za+kuHy4rkVJqAj5YVqK99u/LMuCKiIhurLHUOvMe5FpK6WngnWNccg3wT6nNc8CQiBjdPdWVXidev46iNwb9yJTS9sL2W8DIo1w3MCLqIuK5iOjt/xh0ZlmJD69JKbUAe4A8Lc3Y2aU1ZhWGLZZFxJgjnM8zlx+BSyLipYj4ZUR0Yh3VvqFHfvBIRPwaGHWEU/+l/U5KKUXE0aYNnZ5S2hYRY4EnIuL3KaVXs65VPcr/AR5NKb0XEf+Wtp9wLi9zTeo+62n7/35fRMwA/jdtw1h9Xo8M+pTSXx/tXETsiIjRKaXthR9Ldx6ljW2Fx9ci4ingQqC3Bn1nlpX44Jr6iKgATgaOvRh779Lhe5BSav967we+0w119SSdWn4kr1JK77bbfjwi/kdEDE8p9cU1cD6iNw7drADmFrbnAssPvyAihkbECYXt4cCl9O5lkjuzrET792U28ETK1x9JdPgeHDYePRPY1I319QQrgJsLs28uBva0G+bMvYgY9cHvpSLiU7TlW55udrqsR97Rd+Au4OcRMQ94A7gBICImAf8upTQfGAf8OCLep+2bfVdKqdcG/dGWlYiIO4G6lNIK4CfATyNiM22/sJpTvoqz18n
34O8jYibQQtt7cEvZCi6BiHgUmAYMj4h64BtAf4CU0v+k7a/RZwCbgQPAreWptDQ68fpnA1+JiBbgIDAnZzc7XeZfxkpSzvXGoRtJ0nEw6CUp5wx6Sco5g16Scs6gl6ScM+glKecMeknKOYNeknLu/wNbUSqUIPBwvQAAAABJRU5ErkJggg==\n", - "text/plain": [ - "" - ] + "metadata": { + "needs_background": "light" }, - "metadata": {}, "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:32: FutureWarning: pd.ewm_mean is deprecated for ndarrays and will be removed in a future version\n" - ] } ], "source": [ - "thompson_agent_rewards = train_contextual_agent(ThompsonBNNAgent(state_size=state_size, n_actions=n_actions),\n", - " batch_size=10, n_iters=N_ITERS)" + "test_agent(agent, greedy=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2.3 Curiosity-driven Exploration" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 313, "metadata": {}, "outputs": [], "source": [ - "class BayesUCBBNNAgent(BNNAgent):\n", - " q = 90\n", - "\n", - " def get_action(self, states):\n", - " \"\"\"\n", - " Compute q-th percentile of rewards P(r|s,a) for all actions\n", - " Take actions that have highest percentiles.\n", - "\n", - " This implements bayesian UCB strategy\n", - " \"\"\"\n", - "\n", - " " + "class MLP(nn.Module):\n", + " def __init__(self, input_size, hidden_size, output_size):\n", + " super().__init__()\n", + " self.layers = nn.Sequential(\n", + " nn.Linear(input_size, hidden_size),\n", + " nn.ReLU(),\n", + " nn.Linear(hidden_size, output_size)\n", + " )\n", + " \n", + " def init_weights(tensor):\n", + " if isinstance(tensor, nn.Linear):\n", + " nn.init.xavier_uniform_(tensor.weight)\n", + " \n", + " self.layers.apply(init_weights)\n", + " \n", + " \n", + " def forward(self, x):\n", + " return self.layers(x)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.3.1 Uncertainty with forward dynamics" + ] + }, + { + "cell_type": "code", + 
"execution_count": 319, + "metadata": {}, + "outputs": [], + "source": [ + "class ForwardDynamics(BaseIntrinsicRewardModule):\n", + " def __init__(self, states_size, actions_size, hidden_size, alpha=.1):\n", + " super().__init__()\n", + " self.module = MLP(\n", + " actions_size + states_size,\n", + " hidden_size,\n", + " states_size\n", + " )\n", + " self.alpha = alpha\n", + " self.mean_reward = 0\n", + " \n", + " def forward(self, s, a):\n", + " sa = torch.cat([s, a], dim=-1)\n", + " return s + self.module(sa)\n", + "\n", + " def get_intrinsic_reward(self, state, action, next_state):\n", + " with torch.no_grad():\n", + " r = # \n", + " r_centered = r - self.mean_reward\n", + " self.mean_reward = self.alpha * (r) + (1 - self.alpha) * self.mean_reward\n", + " return r_centered\n", + "\n", + " def get_loss(self, state_batch, action_batch, next_state_batch):\n", + " # " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 320, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "iteration #90\tmean reward=0.630\tmse=0.354\tkl=0.047\n" - ] - }, { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ - "" + "
" ] }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Q(s,a) std: 0.067;0.027;0.093;0.069;0.014;0.148;0.173;0.026;0.043;0.101\n", - "correct 5\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] + "metadata": { + "needs_background": "light" }, - "metadata": {}, "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:32: FutureWarning: pd.ewm_mean is deprecated for ndarrays and will be removed in a future version\n" - ] } ], "source": [ - "ucb_agent_rewards = train_contextual_agent(BayesUCBBNNAgent(state_size=state_size, n_actions=n_actions),\n", - " batch_size=10, n_iters=N_ITERS)" + "agent = QLearningAgent(\n", + " epsilon=.1, \n", + " alpha=0.5, \n", + " discount=.9, \n", + " get_legal_actions=lambda s: range(env.action_space.n)\n", + ")\n", + "\n", + "forward_dynamics = ForwardDynamics(\n", + " np.prod(env.observation_space.shape), \n", + " 1, \n", + " 16\n", + ")\n", + "\n", + "train_with_reward(env, agent, forward_dynamics, n_episodes=2000, update_reward_period=100, batch_size=100, n_iter=25)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 321, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ - "" + "
" ] }, - "metadata": {}, + "metadata": { + "needs_background": "light" + }, "output_type": "display_data" } ], "source": [ - "plt.figure(figsize=(17, 8))\n", - "\n", - "plt.plot(greedy_agent_rewards)\n", - "plt.plot(thompson_agent_rewards)\n", - "plt.plot(ucb_agent_rewards)\n", - "\n", - "plt.legend([\n", - " \"Greedy BNN\",\n", - " \"Thompson sampling BNN\",\n", - " \"UCB BNN\"\n", - "])\n", - "\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Part 3. Exploration in MDP\n", - "\n", - "The following problem, called \"river swim\", illustrates importance of exploration in context of mdp's." + "test_agent(agent)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "Picture from https://arxiv.org/abs/1306.0940" + "### 2.3.2 Uncertainty with reverse dynamics" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Rewards and transition probabilities are unknown to an agent. Optimal policy is to swim against current, while easiest way to gain reward is to go left." 
+ "[The paper](https://arxiv.org/pdf/1705.05363.pdf)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 322, "metadata": {}, "outputs": [], "source": [ - "class RiverSwimEnv:\n", - " LEFT_REWARD = 5.0 / 1000\n", - " RIGHT_REWARD = 1.0\n", - "\n", - " def __init__(self, intermediate_states_count=4, max_steps=16):\n", - " self._max_steps = max_steps\n", - " self._current_state = None\n", - " self._steps = None\n", - " self._interm_states = intermediate_states_count\n", - " self.reset()\n", - "\n", - " def reset(self):\n", - " self._steps = 0\n", - " self._current_state = 1\n", - " return self._current_state, 0.0, False\n", - "\n", - " @property\n", - " def n_actions(self):\n", - " return 2\n", - "\n", - " @property\n", - " def n_states(self):\n", - " return 2 + self._interm_states\n", - "\n", - " def _get_transition_probs(self, action):\n", - " if action == 0:\n", - " if self._current_state == 0:\n", - " return [0, 1.0, 0]\n", - " else:\n", - " return [1.0, 0, 0]\n", - "\n", - " elif action == 1:\n", - " if self._current_state == 0:\n", - " return [0, .4, .6]\n", - " if self._current_state == self.n_states - 1:\n", - " return [.4, .6, 0]\n", - " else:\n", - " return [.05, .6, .35]\n", - " else:\n", - " raise RuntumeError(\n", - " \"Unknown action {}. 
Max action is {}\".format(action, self.n_actions))\n", - "\n", - " def step(self, action):\n", - " \"\"\"\n", - " :param action:\n", - " :type action: int\n", - " :return: observation, reward, is_done\n", - " :rtype: (int, float, bool)\n", - " \"\"\"\n", - " reward = 0.0\n", - "\n", - " if self._steps >= self._max_steps:\n", - " return self._current_state, reward, True\n", - "\n", - " transition = np.random.choice(\n", - " range(3), p=self._get_transition_probs(action))\n", - " if transition == 0:\n", - " self._current_state -= 1\n", - " elif transition == 1:\n", - " pass\n", - " else:\n", - " self._current_state += 1\n", - "\n", - " if self._current_state == 0:\n", - " reward = self.LEFT_REWARD\n", - " elif self._current_state == self.n_states - 1:\n", - " reward = self.RIGHT_REWARD\n", - "\n", - " self._steps += 1\n", - " return self._current_state, reward, False" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's implement q-learning agent with epsilon-greedy exploration strategy and see how it performs." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class QLearningAgent:\n", - " def __init__(self, n_states, n_actions, lr=0.2, gamma=0.95, epsilon=0.1):\n", - " self._gamma = gamma\n", - " self._epsilon = epsilon\n", - " self._q_matrix = np.zeros((n_states, n_actions))\n", - " self._lr = lr\n", - "\n", - " def get_action(self, state):\n", - " if np.random.random() < self._epsilon:\n", - " return np.random.randint(0, self._q_matrix.shape[1])\n", - " else:\n", - " return np.argmax(self._q_matrix[state])\n", - "\n", - " def get_q_matrix(self):\n", - " \"\"\" Used for policy visualization\n", - " \"\"\"\n", - "\n", - " return self._q_matrix\n", - "\n", - " def start_episode(self):\n", - " \"\"\" Used in PSRL agent\n", - " \"\"\"\n", - " pass\n", - "\n", - " def update(self, state, action, reward, next_state):\n", - " \n", - " # Finish implementation of q-learnig agent" + "class InverseDynamics(BaseIntrinsicRewardModule):\n", + " def __init__(self, states_size, n_actions, hidden_size, alpha=0.1):\n", + " super().__init__()\n", + " self.module = MLP(\n", + " 2 * states_size,\n", + " hidden_size,\n", + " n_actions\n", + " )\n", + " self.alpha = alpha\n", + " self.mean_reward = 0\n", + " self.n_actions = n_actions\n", + " \n", + " def forward(self, s, s_next):\n", + " # \n", + " \n", + " \n", + " def get_intrinsic_reward(self, state, action, next_state):\n", + " with torch.no_grad():\n", + " r = # \n", + " \n", + " r_centered = r - self.mean_reward\n", + " self.mean_reward = self.alpha * (r) + (1 - self.alpha) * self.mean_reward\n", + " return r_centered\n", + "\n", + " def get_loss(self, state_batch, action_batch, next_state_batch): \n", + " a_pred_proba = self.forward(state_batch, next_state_batch)\n", + " a_one_hot = to_one_hot(action_batch, self.n_actions)\n", + " return -(torch.log(a_pred_proba) * a_one_hot).sum(dim=-1).mean()" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - 
"outputs": [], + "execution_count": 323, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], "source": [ - "def train_mdp_agent(agent, env, n_episodes):\n", - " episode_rewards = []\n", - "\n", - " for ep in range(n_episodes):\n", - " state, ep_reward, is_done = env.reset()\n", - " agent.start_episode()\n", - " while not is_done:\n", - " action = agent.get_action(state)\n", - "\n", - " next_state, reward, is_done = env.step(action)\n", - " agent.update(state, action, reward, next_state)\n", - "\n", - " state = next_state\n", - " ep_reward += reward\n", - "\n", - " episode_rewards.append(ep_reward)\n", - " return episode_rewards" + "agent = QLearningAgent(\n", + " epsilon=.1, \n", + " alpha=0.5, \n", + " discount=.9, \n", + " get_legal_actions=lambda s: range(env.action_space.n)\n", + ")\n", + "\n", + "inverse_dynamics = InverseDynamics(\n", + " np.prod(env.observation_space.shape), \n", + " env.action_space.n, \n", + " 16\n", + ")\n", + "\n", + "train_with_reward(env, agent, inverse_dynamics, n_episodes=3000, \n", + " update_reward_period=100, batch_size=100, n_iter=25)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 324, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:6: FutureWarning: pd.ewm_mean is deprecated for ndarrays and will be removed in a future version\n", - " \n" - ] - }, { "data": { - "image/png": "\n", + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXsAAADnCAYAAAD/7faHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAcZElEQVR4nO3dfZRdVZnn8e+vigSSEBIgvAcBMeJEmwZXBN9GsXkxIAtc3WonjgoOPTTO0CONjoPdvXAW3fQo0+3bwKhpoYGWF2kUO+OkiTQt2ijQBIhgiNFAA0mAQAwk4TWpqmf+OKfirUrVvbvqnnvPuff+Pmudxb3n7jpnnyL3Obv22fvZigjMzKy79ZVdATMzaz0HezOzHuBgb2bWAxzszcx6gIO9mVkP2K3sCpiZdYP3vmdG/HrzYFLZ+x58dXlELGxxlUZwsDczK8CmzYPcs3xuUtkpBz0yp8XV2YWDvZlZIYLBGCq7EuNysDczK0AAQ1R3kqqDvZlZQYZwy97MrKsFwQ5345iZdbcABt2NY2bW/dxnb2bW5QIYrHAWYQd7M7OCVLfH3sHezKwQQbjP3sys20XAjurGegd7M7NiiEFUdiXG5WBvZlaAAIbcsjcz635u2ZuZdblsUpWDvZlZVwtgR1R3PSgHezOzAgRisMKL/znYm5kVZCiq241T3dtQF5H0Tkk/lbRF0mZJP5H0FklnS7pzAsc5XFJI8k3arGKG++xTtjI4aLSYpL2A7wOfAG4CpgL/Hni1zHqZWdHEYIX77Ktbs+7xeoCIuCEiBiPi5Yj4AbAD+DrwNkkvSHoeQNL7JD0gaaukdZL+R82xfpz/9/n8Z96W/8x/lLRa0nOSlks6rG1XZ2bA8EpVfUlbGRzsW++XwKCkaySdKmlvgIhYDZwH3BURe0bE7Lz8i8DHgNnA+4BPSHp//tm78v/Ozn/mLklnAn8C/C6wH/AvwA2tvywzqxUhtkd/0lYGB/sWi4itwDvJbvx/AzwraamkA8Ypf0dEPBQRQxHxIFngfnedU5wH/M+IWB0RA8BfAse4dW/WfkMoaSuDg30b5IH47IiYC7wJOBj48lhlJR0v6YeSnpW0hSyYz6lz+MOAr0h6Pu8K2gwIOKTIazCz+rIHtH1JWxkc7NssIn4BXE0W9MfKpHE9sBQ4NCJmkfXrDzcFxiq/DvjDiJhds02LiJ8WX3szG1/2gDZlK4ODfYtJeoOkT0mam78/FFgM3A1sBOZKmlrzIzOBzRHxiqTjgA/XfPYs2foIr63Z93Xgs5LemB9/lqQPtu6KzGwsVX9A66GXrbcNOB64UNJs4HmyoZj/DXgFWAU8LWkoIuYA/xn4a0mXAz8iG645GyAiXpJ0KfATSVOAhRFxi6Q9gRvzfvotwG3A37fvEs0MYLDCk6oUFV4z0cysUxz5WzPiL2+Zn1R20bwV90XEghZXaQS37M3MCjD8gLaqHOzNzAoQqNLdOA72ZmYFKevhawoHezOzAkRQ6dw4bQ32U7V77MGMdp7SesgrvMj2eLW6f0e3mKQA5kXE2oKPewDZ6K5jgSUR8akij9/g3C8AR0fEo+0652QFYkdJqRBStDXY78EMjteJ7Tyl9ZB74vayq9AUSbcC/xoRF4/afybwDWBunhKj3c4FNgF7RQuH70m6A/hWRHxzeF9E7Nmq87VClR/QVrdmZr3nGuAjkkb/dfJR4LqSAj1kKTkebmWg7waBGIq0rQxNBXtJCyWtkbRW0kVFVcqsR30P2JdsvQMA8iyppwPXSjpO0l15HqSnJF0+avY1NT93h6Q/qHk/YqGcfGb3bfliOmskfWic41wNnAV8Jk+rfZKkqyX9RU2ZEyStr3n/mKRPS3owX7Dn25L2qPn8TEkr8zTej+Rx5NL8ui/Pz3N5XjYkvS5/PUvStXneqMcl/Zmkvtrrk/RXearvf5N0avqvvhhF5cZJja2Sfi//HTUcsz/pYC+pH7gCOBWYDyyWlDajwMx2EREvk82Y/lj
N7g8Bv4iInwGDwB+TJcZ7G3Ai2YzrCZE0g2yW9fXA/sAi4P+M9f2NiLOB64DL8rTa/5R4mg8BC4EjgKOBs/NzHwdcSzaDfDZZ2u7HIuJPydJzn5+f5/wxjvm/gVlk6ULeTfZ7+njN58cDa8h+P5cBV47xV1LLBDAUfUlbPamxVdJM4JPAPSn1a6ZlfxywNiIejYjtwI3AmU0cz8yyrpwP1LSEP5bvIyLui4i7I2IgIh4j68evl/56PKeTBdi/zY/1APAdoMicSl+NiCcjYjPwf4Fj8v3nAFdFxG15Gu8NeXLAuvIAuAj4bERsy6//r8m6uIY9HhF/ExGDZL+zg4AxU4m3RtqShAnLEqbG1j8HvkCWdqWhZoL9IWQZF4etZ4y0upLOlbRC0oodXonPrK6IuJPsYej7JR1J9sW/HkDS6yV9X9LTkraSrV1QL/31eA4Djh9Oi52nxv4PwIGFXETm6ZrXLwHDD1oPBR6ZxPHmAFOAx2v2Pc7ImLPznBHxUv6ybQ94A9gR/UkbMGc4LubbuTWHahhbJb2ZLDPu/0utX8tH40TEEmAJwF7axw94zBq7lqxFfxSwPCI25vu/BjwALI6IbZIuAD4wzjFeBKbXvK8N5OuAH0XEyZOsX71jN7IOOHKcz+rFh01kS3keBjyc73sNsGEC526pCDXsoqmxabK5cfLnFF8k7xpL1UzLfgPZXXrYXCr0izfrYNcCJwH/ibwLJzcT2Aq8IOkNZIvYj2cl8LuSpucPOM+p+ez7wOslfVTSlHx7i6R/l1i/lcBpkvaRdCBwQeLPAVwJfFzSiZL6JB2SXwtkKb9fO9YP5V0zNwGXSpqZZ3i9EPjWBM7dcgXls28UW2eSrYdxh6THgLcCSxs9pG0m2N8LzJN0RD4iYBHZohtm1oS8P/qnwAxGfqc+Tba+wTayJS6/XecwXwK2kwXQa8gesg4ffxtwCtl39kmy7o8vALsnVvHvgJ8BjwE/aFCPESLiX8keqn6JLB33j8ha6wBfIXte8Zykr47x439E9lfFo8CdZN1bV6Weu9WyfPaFLEtYN7ZGxJaImBMRh0fE4WRrY5wRESvqHbSpFMeSTiNbXq+f7KHLpfXK76V9wpOqrFXuidvZGpt7dgatlevgN+4d59x4QlLZvzj6e3VTHI8VWyVdAqyIiKWjyt4BfLpRsG+qzz4ilgHLmjmGmVk3yIZeFtPWGCu2jp5ZXbP/hJRjOhGamVkBnBvHzKxHOMWxmVmXy1IcV/eRkYO9WZOm9k+Laf17NS7YV91AYI1tfXXjpojYr16ZspKcpXCwN2vStP69ePuBixuWi2mpIxutipb/8rLH632eZb2sbjdOdWtmViJndLWJytIl9CVtZXDL3myUmqyDJ5PlJblX0tKIeLj+T1pvc8verNM4o6tNSkEzaFvCLXuzXY2VdfD42gJ5lsJzAfbon9m+mllleTSOWReqzeY6a+oBzuZqAJXuxnGwN9uVM7rahA2vQVtVDvZmu9qZdZAsyC8iyzZpNq4ABtyyN+scETEg6XxgOb/JOriq5GpZB3A3jlmHmUhG19ixg4H1jXt5ht55TNK5p2zcmlTOKibcjWNm1vWGFy+pKgd7M7OCuGVvZtblily8pBUmHewlHUq2MPIBZNe5JCK+UlTFzMw6SSAGhrrzAe0A8KmIuF/STOA+Sbc5f4iZ9aqu7LOPiKeAp/LX2yStJptm7mBvZr0nurQbp5akw4FjgXvG+Ow3OUSYXsTpzMwqp2v77IdJ2hP4DnBBROwyQLg2h8he2sc5RMysa3VtsJc0hSzQXxcR3y2mSmZmnScQg934gFaSgCuB1RHxxeKqZNZZ1N9H/56N16Dte+BXScfbdsqbksrN/PmzSeWsfar8gLaZ29A7gI8CvyNpZb6dVlC9zMw6SuQPaFO2MjQzGudOqPBtzGySPIfEJiu6tc/erEt5DolNghOhmXUUzyGxyXLL3qxD1ZtDYlYrAgaHHOzNOk69OSQjJgt
qRgm1syrq1tE4Zl2r0RySiFgSEQsiYsHUvj3aX0GrnCDrxknZGpG0UNIaSWslXTTG5+dJeigfBXmnpPmNjulgbzaK55DY5KQNu2z0EFdSP3AFcCowH1g8RjC/PiJ+KyKOAS4DGv47dbA325XnkNikRKRtDRwHrI2IRyNiO3AjcObI84zoVpxB9odFXe6zNxtlwnNI+vrRzD0bFhva9Oukw+1112NJ5Ta+77VJ5QD2//HG5LI2eRMYjTNH0oqa90vyPGKQjfxaV/PZeuD40QeQ9F+AC4GpwO80OqGDvZlZAbLROMmdJZsiYkFz54srgCskfRj4M+CseuXdjWNmVpCCunE2AIfWvJ+b7xvPjcD7Gx3Uwd7MrCAFjca5F5gn6QhJU4FFwNLaApLm1bx9H9Awy567cczMChCkDatseJyIAUnnA8uBfuCqiFgl6RJgRUQsBc6XdBKwA3iOBl044GBvZlaYolZniohlwLJR+y6uef3JiR7Twd7MrAgB4XQJZmbdz4nQzMx6QMJIm9I0PRpHUr+kByR9v4gKmZl1oiJz47RCES37TwKrgcaLcJqZdasAurUbR9JcsjGel5JN2zXrPUNDxEsvNyzW97rDkw4XTz6TVG7/u9LSLwCsO/PApHKH/sPTyce0XVW5G6fZlv2Xgc8AM8crMCLvN9ObPJ2ZWVWp0qNxJt1nL+l04JmIuK9eudq831PYfbKnM2s7P4+yCYvErQTNtOzfAZyRp37dA9hL0rci4iPFVM2sdH4eZemi2kMvJ92yj4jPRsTciDicLHfDPzvQW7eoeR71zbLrYh2kwi17J0IzG9uXyZ5HDY31oaRzJa2QtGJ7vNLWilmVKXFrv0KCfUTcERGnF3Ess7KlPI8asQatvAat5YYStxJ4Bq3Zrvw8yiau4uPs3Y1jNoqfR9lkFbR4SUu4ZW9mVpQunlRl1tUi4g7gjrqFJNgt4au0cVPSOQfe8JqkclMeS19E/KC70iY0/vqtBySV2/duL2A+pgp34zjYm5kVRG7Zm5l1uRBUOF2Cg72ZWVHcsjcz6wEO9mZmPcDB3sysy1V8UpWDvZlZQTwax8ysFzjYm5l1P7fszbpZn9D0hMyXSuvPnbIubabtS0fPTSoHMH1N2rq2+25OW03u5SP3TSo37ZH0dXK7gvvszcy6XIkLk6Rw1kuzMUiaLelmSb+QtFrS28quk3WAbl2pyl8I62JfAW6NiDcAv022Fq1ZXRpK28rQbMveXwjrOpJmAe8CrgSIiO0R8XyplbLOUFDLXtJCSWskrZV00RifXyjpYUkPSrpd0mGNjjnpYO8vhHWxI4Bngb+V9ICkb0qaUVtgxBq0gy+XU0urFEX6Vvc4Uj9wBXAqMB9YLGn+qGIPAAsi4mjgZuCyRvVrpmXf8AuRV3znl2IHrzZxOrO22Q14M/C1iDgWeBEY0boasQZt/7Qy6mhVFErb6jsOWBsRj0bEduBG4MwRp4n4YUS8lL+9G2g4NKuZYN/wC5FXaueXYgppw7rMSrYeWB8R9+Tvbyb7t25WXzHdOIcA62rer8/3jecc4B8bHbSZYO8vhHWliHgaWCfpqHzXicDDJVbJOsQEunHmDPd45Nu5kzqf9BFgAfC/GpWd9Dj7iHha0jpJR0XEGvyFsO7yR8B1kqYCjwIfL7k+VnUxoZE2myJiwTifbQAOrXk/N983gqSTgD8F3h0RDfvIm51U5S+EdaWIWEnWYmpMfcTuU4s7+dQpScWm//LZ5ENuffNBSeVmPpR2zGmPP59U7rm37J9Ubu9702b4Vl4xY+jvBeZJOoIsyC8CPlxbQNKxwDeAhRGR9MtrKthP6AthZtbtCgj2ETEg6XxgOdAPXBURqyRdAqyIiKVk3TZ7An+vLA3HExFxRr3jOl2CmVlBikqEFhHLgGWj9l1c8/qkiR7T6RLMzHqAW/ZmZkWpcCI0B3szsyJMbDRO2znYm5kVxS17M7PuJrxSlZlZb3CwNzPrcgkZLcvkYG/
WrD4R0xsn+YvENWj7tg8klZtIXNnz0W1px0ycCTywd1qmz9SZsdvnzk4qN3X980nlSuMHtGZm3a/KLXtPqjIbg6Q/lrRK0s8l3SBpj7LrZB2gW9egNetGkg4B/ivZSkBvIstPsqjcWlnlpQb6koK9u3HMxrYbME3SDmA68GTJ9bEO4G4csw4SERuAvwKeAJ4CtkTED2rLjFiDduClsQ5jvajCLXsHe7NRJO1NtubnEcDBwIx8RaCdRqxBu9v0MqppFaShtK0MDvZmuzoJ+LeIeDYidgDfBd5ecp2s6ireZ+9gb7arJ4C3SpqubGWIE4HVJdfJKk4T2MrQVLD38DTrRhFxD3AzcD/wENn3ZEmplbLOUOGW/aRH49QMT5sfES9LuolseNrVBdXNrDQR8Tngc0ll+8TgtMbrxsZuiW2rF9KKxczGs3aHaSCto3go4ToApmzcmlRuyzH7JZWbtTJt7duhPdNm7gL0vfByctmiVHk0TrNDLz08zcxsWIWD/aS7cVKGp8HIIWo7eHXyNTUzq7Lo0tE4KcPTYOQQtSmk/9lpZtZxKtxn38wDWg9PMzOroUjbytBMsPfwNDOzWhVu2U/6AW1E3CNpeHjaAPAAHp5mZj2sa0fjTGR4mplZVwu8eImZWbfzguNmZr3Cwd7MrPspqhvtHezNmhR9YnB6469S3/a0Dt1XDkxLmbzby4NJ5QCGpqYNvNNAYrDab2ZSsVk/25RUbnDvGUnl+p97MakcwKuv2Tup3O5PPJd8zLpKHGmTwsHezKwgVe6zd4pj61mSrpL0jKSf1+zbR9Jtkn6V/zeteWhGcekSJC2UtEbSWkkXjfH5uyTdL2lA0gdS6uZgb73samDhqH0XAbdHxDzg9vy9WZoCJlVJ6geuAE4F5gOLJc0fVewJ4Gzg+tSqOdhbz4qIHwObR+0+E7gmf30N8P521sk6WGKqhISunuOAtRHxaERsB24k+3f5m1NFPBYRDzKBkf0O9mYjHRART+WvnwYOGKvQiGyuO9IfGlqXS2/Zzxn+95Nv59Yc5RBgXc379fm+pvgBrdk4IiKksdthEbGEPD3IzL3mVvixnLXLBCdVbYqIBa2rza4c7M1G2ijpoIh4StJBwDNlV8g6h4YKue9vAA6teT8339cUd+OYjbQUOCt/fRbwDyXWxTpJahdO4/vBvcA8SUdImkq23OvSZqvnYG89S9INwF3AUZLWSzoH+DxwsqRfka3Z8Pky62idpYihlxExAJwPLCdLG39TRKySdImkMwAkvUXSeuCDwDckrWpUN3fjWM+KiMXjfHTihI7TDztm9jcsp8HGZbIDphV7de+0xcEB+lJnxiYuJjc0RUnldszcJ6nctCe2pB3v4FlJ5SB9ZuyyH303qVz/QQmFCnp6ExHLgGWj9l1c8/pesu6dZA72ZmYFqfIMWgd7M7MiBFDhRGgN++w9pdzMLE1R6RJaIeUB7dV4SrmZWV3D4+w7dsFxTyk3M0sQkb6VYLJ99klTyiGbVg6cC7AHaXm6zcw6UZUf0DY9zj4i6k4TiIglEbEgIhZMSR3XZWbWiYqZVNUSkw32G/Op5HhKuZlZpqP77MfhKeVmZrUCGIy0rQQN++zzKeUnkKXkXA98jmwK+U359PLHgQ+1spJmVfbi8xs2/eSWzzw+xkdzgLRFWKutW64DxriWpJmxmcMaFahyn33DYF/UlHKzbhUR+421X9KKdqexbYVuuQ5ow7VUeFKVZ9B2meVPrpzwz7z34GMKr4dZL+rolr2ZmSUocaRNCgd7s9ZZUnYFCtIt1wEtvBYBKunhawoHe7MWyZcu7Hjdch3Q+muR++zNzLqcu3HMzHpBeXlvUnhZQrOCSVooaY2ktZI6OiOspMckPSRppaQVZddnIspIz96NM2jNbAyS+oErgFOB+cBiSfPLrVXT3hMRx3TgWPuraXd69gpnvXSwNyvWccDaiHg0IrYDN5KlBLc2a3t69shG46RsZXCwNyvWIcC6mvf
r832dKoAfSLovT1fe6ZLTs09KhbNe+gGtmdXzzojYIGl/4DZJv8hbzB0vIkIqtge9ykMv3bI3K9YG4NCa93PzfR0pIjbk/30GuIWsm6qTtTY9u/vszXrGvcA8SUdImgosIksJ3nEkzZA0c/g1cArw8/o/VXmtS88ewFDiVoK2duNs47lN/xQ3VzEVbNecfwLpWmvOvbYrrp2EFLStFhEDks4HlgP9wFURsarkak3WAcAtkiCLFddHxK3lVildu9Ozi6h0N05bg31VU8H28vl7+dpbJSKWAcvKrkezIuJR4LfLrsdklZKefaikZnsCP6A1MyvCcDdORTnYm5kVxN04jZWdVa+Xz9/L125WLAf7+spOodrL5+/lazcrlhOhmZl1vwAGI21roFEyPUm7S/p2/vk9kg5vdEwHezOzgigiaat7jLRkeucAz0XE64AvAV9oVLe2BvtW3K0mcO5DJf1Q0sOSVkn65BhlTpC0JU/nulLSxQWev26qWGW+ml/7g5LeXOC5j6q5ppWStkq6YFSZQq+9mfSyks7Ky/xK0lljlTGrpGJm0KYk06tN6HYzcKLyCRHjaVuffc3d6mSy5FD3SloaEQ/XFNt5t5K0iOxu9fsFVWEA+FRE3J/PCrxP0m2jzg/wLxFxekHnHO09ETHeBKJTgXn5djzwtfy/TYuINcAxsPP/wwayqe+jFXntVwOXA9fW7BtOL/v5/GZ/EfDfa39I0j5kk18WkP1hfF/+7+S5gupl1hoBDCX32c8Z1ehbUvP8aqxkeqNjwc4y+US+LcC+1Jmg2M6WfUvuVqki4qmIuD9/vQ1YTbWyEZ4JXBuZu4HZwzk8CnYi8EhEjDWTuTBNpJd9L3BbRGzOA/xt7JqT3KyCElv1Wct+U0QsqNlaPlChncE+JfXriLsVMHy3KlTePXQscM8YH79N0s8k/aOkNxZ42kapYtuVGncRcMM4n7Xq2oelpJftthTB1kuK6cZJSaa3s4yk3YBZwK/rHbQSQy/bSdKewHeACyJi66iP7wcOi4gXJJ0GfI+sW6UIpaeKzRNznQF8doyPW3ntu2hFelmzUgUwWMgU2p3J9MiC+iLgw6PKDCd0uwv4APDPEfXvIu1s2bfkbjURkqaQBfrrIuK7oz+PiK0R8UL+ehkwRdKcIs6dkCq2HalxTwXuj4iNY9SvZddeIyW9bFelCLZeEhBDaVu9o2S9GsPJ9FYDN0XEKkmXSDojL3YlsK+ktcCFJCyv2M5gn5L6tTb9aNLdKlXe938lsDoivjhOmQOHnxFIOo7s99P0zUZpqWKXAh/LR+W8FdhS0+VRlMWM04XTqmsfJSW97HLgFEl756N1Tsn3mVVfQfnsI2JZRLw+Io6MiEvzfRdHxNL89SsR8cGIeF1EHJcnraurbd0446V+lXQJsCK/iCuBv8vvVpvJbghFeQfwUeAhSSvzfX8CvCav39fJbjCfkDQAvAwsKuhmM2aqWEnn1Zx7GXAasBZ4Cfh4AefdKb/JnAz8Yc2+2vMXeu2aQHpZSQuA8yLiDyJis6Q/J2scAFwSEaMf9JpVz8RG47SdCmo4m5n1tFlTD4i3H5DWPr11/Vfva3dq7557QGtm1jIVbjw72JuZFSECBgfLrsW4HOzNzIrilr2ZWQ9wsDcz63ZR6dE4DvZmZkUIiAYTpsrkYG9mVpRi0iW0hIO9mVkRImDIwd7MrPv5Aa2ZWfcLt+zNzLpdWpKzsjjYm5kVoeKJ0BzszcwKEEA4XYKZWZeLaLgwSZkc7M3MChLuxjEz6wEVbtl78RIzswJIuhVIXbd5U0QsbGV9RnOwNzPrAe1ccNzMzEriYG9m1gMc7M3MeoCDvZlZD3CwNzPrAf8fbJZMDI5qc2MAAAAASUVORK5CYII=", "text/plain": [ - "" + "
" ] }, - "metadata": {}, + "metadata": { + "needs_background": "light" + }, "output_type": "display_data" } ], "source": [ - "env = RiverSwimEnv()\n", - "agent = QLearningAgent(env.n_states, env.n_actions)\n", - "rews = train_mdp_agent(agent, env, 1000)\n", - "plt.figure(figsize=(15, 8))\n", - "\n", - "plt.plot(moving_average(np.array(rews), alpha=.1))\n", - "plt.xlabel(\"Episode count\")\n", - "plt.ylabel(\"Reward\")\n", - "plt.show()" + "test_agent(agent)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's visualize our policy:" + "## 2.3.3 Intrinsic Curiosity Module algorithm" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 336, "metadata": {}, "outputs": [], "source": [ - "def plot_policy(agent):\n", - " fig = plt.figure(figsize=(15, 8))\n", - " ax = fig.add_subplot(111)\n", - " ax.matshow(agent.get_q_matrix().T)\n", - " ax.set_yticklabels(['', 'left', 'right'])\n", - " plt.xlabel(\"State\")\n", - " plt.ylabel(\"Action\")\n", - " plt.title(\"Values of state-action pairs\")\n", - " plt.show()" + "class Embedder(nn.Module):\n", + " def __init__(self, states_size, embedding_size, hidden_size):\n", + " super().__init__()\n", + " self.module = MLP(\n", + " states_size,\n", + " hidden_size,\n", + " embedding_size\n", + " )\n", + " \n", + " def forward(self, s):\n", + " return self.module(s)\n", + " \n", + "class ICMModule(BaseIntrinsicRewardModule):\n", + " def __init__(self, states_size, n_actions, hidden_size, embedding_size):\n", + " super().__init__()\n", + " # \n", + " \n", + " def get_intrinsic_reward(self, state, action, next_state):\n", + " with torch.no_grad(): \n", + " # \n", + "\n", + " def get_loss(self, state_batch, action_batch, next_state_batch):\n", + " # " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 337, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ - "" + "
" ] }, - "metadata": {}, + "metadata": { + "needs_background": "light" + }, "output_type": "display_data" } ], "source": [ - "plot_policy(agent)" + "agent = QLearningAgent(\n", + " epsilon=.1, \n", + " alpha=0.5, \n", + " discount=1, \n", + " get_legal_actions=lambda s: range(env.action_space.n)\n", + ")\n", + "\n", + "icm = ICMModule(\n", + " states_size=np.prod(env.observation_space.shape), \n", + " n_actions=env.action_space.n, \n", + " hidden_size=16, embedding_size=10\n", + ")\n", + "\n", + "train_with_reward(env, agent, icm, n_episodes=3000, update_reward_period=100, batch_size=100, n_iter=200)" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 338, "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], "source": [ - "As your see, agent uses suboptimal policy of going left and does not explore the right state." + "test_agent(agent)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Bonus 3.1 Posterior sampling RL (3 points)" + "## HW 2.1: Random network distillation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now we will implement Thompson Sampling for MDP!\n", - "\n", - "General algorithm:\n", - "\n", - ">**for** episode $k = 1,2,...$ **do**\n", - ">> sample $M_k \\sim f(\\bullet\\ |\\ H_k)$\n", - "\n", - ">> compute policy $\\mu_k$ for $M_k$\n", - "\n", - ">> **for** time $t = 1, 2,...$ **do**\n", - "\n", - ">>> take action $a_t$ from $\\mu_k$ \n", - "\n", - ">>> observe $r_t$ and $s_{t+1}$\n", - ">>> update $H_k$\n", - "\n", - ">> **end for**\n", - "\n", - ">**end for**\n", - "\n", - "In our case we will model $M_k$ with two matricies: transition and reward. Transition matrix is sampled from dirichlet distribution. 
Reward matrix is sampled from normal-gamma distribution.\n", - "\n", - "Distributions are updated with bayes rule - see continious distribution section at https://en.wikipedia.org/wiki/Conjugate_prior\n", - "\n", - "Article on PSRL - https://arxiv.org/abs/1306.0940" + "Implement algorithm from [this](https://arxiv.org/abs/1810.12894) paper" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 328, "metadata": {}, "outputs": [], "source": [ - "def sample_normal_gamma(mu, lmbd, alpha, beta):\n", - " \"\"\" https://en.wikipedia.org/wiki/Normal-gamma_distribution\n", - " \"\"\"\n", - " tau = np.random.gamma(alpha, beta)\n", - " mu = np.random.normal(mu, 1.0 / np.sqrt(lmbd * tau))\n", - " return mu, tau\n", - "\n", - "\n", - "class PsrlAgent:\n", - " def __init__(self, n_states, n_actions, horizon=10):\n", - " self._n_states = n_states\n", - " self._n_actions = n_actions\n", - " self._horizon = horizon\n", - "\n", - " # params for transition sampling - Dirichlet distribution\n", - " self._transition_counts = np.zeros(\n", - " (n_states, n_states, n_actions)) + 1.0\n", - "\n", - " # params for reward sampling - Normal-gamma distribution\n", - " self._mu_matrix = np.zeros((n_states, n_actions)) + 1.0\n", - " self._state_action_counts = np.zeros(\n", - " (n_states, n_actions)) + 1.0 # lambda\n", - "\n", - " self._alpha_matrix = np.zeros((n_states, n_actions)) + 1.0\n", - " self._beta_matrix = np.zeros((n_states, n_actions)) + 1.0\n", - "\n", - " def _value_iteration(self, transitions, rewards):\n", - " # YOU CODE HERE\n", - " state_values = \n", - " return state_values\n", - "\n", - " def start_episode(self):\n", - " # sample new mdp\n", - " self._sampled_transitions = np.apply_along_axis(\n", - " np.random.dirichlet, 1, self._transition_counts)\n", - "\n", - " sampled_reward_mus, sampled_reward_stds = sample_normal_gamma(\n", - " self._mu_matrix,\n", - " self._state_action_counts,\n", - " self._alpha_matrix,\n", - " self._beta_matrix\n", - " )\n", - 
"\n", - " self._sampled_rewards = sampled_reward_mus\n", - " self._current_value_function = self._value_iteration(\n", - " self._sampled_transitions, self._sampled_rewards)\n", - "\n", - " def get_action(self, state):\n", - " return np.argmax(self._sampled_rewards[state] +\n", - " self._current_value_function.dot(self._sampled_transitions[state]))\n", - "\n", - " def update(self, state, action, reward, next_state):\n", - " \n", - " # update rules - https://en.wikipedia.org/wiki/Conjugate_prior\n", - "\n", - " def get_q_matrix(self):\n", - " return self._sampled_rewards + self._current_value_function.dot(self._sampled_transitions)" + "class RandomNetworkDistilationModule(BaseIntrinsicRewardModule):\n", + " # " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 332, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:7: FutureWarning: pd.ewm_mean is deprecated for ndarrays and will be removed in a future version\n", - " import sys\n" - ] - }, { "data": { - "image/png": "\n", + "image/png": "", "text/plain": [ - "" + "
" ] }, - "metadata": {}, + "metadata": { + "needs_background": "light" + }, "output_type": "display_data" } ], "source": [ - "from pandas import DataFrame\n", - "moving_average = lambda x, **kw: DataFrame(\n", - " {'x': np.asarray(x)}).x.ewm(**kw).mean().values\n", - "\n", - "horizon = 20\n", - "env = RiverSwimEnv(max_steps=horizon)\n", - "agent = PsrlAgent(env.n_states, env.n_actions, horizon=horizon)\n", - "rews = train_mdp_agent(agent, env, 1000)\n", - "\n", - "plt.figure(figsize=(15, 8))\n", - "plt.plot(moving_average(np.array(rews), alpha=0.1))\n", - "\n", - "plt.xlabel(\"Episode count\")\n", - "plt.ylabel(\"Reward\")\n", - "plt.show()" + "agent = QLearningAgent(\n", + " epsilon=.1, \n", + " alpha=0.5, \n", + " discount=.9, \n", + " get_legal_actions=lambda s: range(env.action_space.n)\n", + ")\n", + "\n", + "rnd = RandomNetworkDistilationModule(\n", + " np.prod(env.observation_space.shape), \n", + " np.prod(env.observation_space.shape), \n", + " 16\n", + ")\n", + "\n", + "train_with_reward(env, agent, rnd, n_episodes=2000, update_reward_period=100, batch_size=100, n_iter=25)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 333, "metadata": {}, "outputs": [ { "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA4MAAAFRCAYAAADO/nj3AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAGV9JREFUeJzt3XmUpXdd5/HPN2uTdCcBwxJIJCpBBhgWDQHFYMISIQZxhl3WYTQDDMPiQXEfYHBw5igjjjNolNFEVllkl0VI2JGQXSBwEJMTIAIBQpKGsITv/HGfhktTXV2d9O1b1b/X65w6Xffep577vVVPOv2uZ7nV3QEAAGAs+yx7AAAAAPY8MQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQjAblFVR1dVV9V+y55lm6p6UlV9vqquqaofWvY8e0JVHV9Vn1j2HNubfgY/uuw5APgeMQhAkqSq3lpVz13h/gdV1b+up8hbi6raP8kLkpzU3Zu7+0u78LVnVdUv78Lyz66ql1yfOW+oKcBvs+12d7+3u398GbOsZvoZfHrZcwDwPWIQgG1OT/Loqqrt7n9Mkpd297eXMNMNcfMkm5J8dNmDsLqq2nfZMwCMSAwCsM3rkvxQkuO33VFVN05ySpIzpts/X1XnVdVVVXVZVT17Ryurqkuq6r5zt79v71lV3aOqPlBVV1bVBVV1wtxjj6+qT1fV1VX1L1X1qB08x4FV9cdV9bnp44+n+26bZNuhkldW1btW+NpNVfWSqvrSNMPZVXXzqvr96Xvwp9OhjX86Lf/C6TVfVVXnVNXx0/33T/JbSR4+LX/BdP+hVfXiqrq8qj5bVc9bLXp2tP7psX2r6req6p+n78k5VXVUVb1nWuSC6bkfXlUnVNVn5r7230x7Oq+sqo9W1S/MPfbXVfV/qurN03r/sap+bAfzbTsM+NTpe315VT1z7vHjquqD0/NcXlV/WlUHzD3+3T2Y0/O+qKreUlVbk5xYVSdX1cemOT47v24AFkMMApAk6e6vJ/nbJI+du/thSS7u7gum21unxw9L8vNJnlRVv7irz1VVt0ry5iTPS3KTJM9M8pqqumlVHZzkT5I8oLu3JPnpJOfvYFW/neQeSe6S5M5JjkvyO939ySR3mJY5rLvvvcLXPi7JoUmOyiyCn5jk693920nem+Qp06GNT5mWP3t6npskeVmSV1XVpu5+a5L/nuSV0/J3npb/6yTfTnKbJHdNclKS1Q49XXH902O/muSRSU5OckiSJyT5Wnffa3r8ztNzv3J+hdOhsm9M8vYkN0vyX5K8tKrmDyN9RJLnJLlxkk8l+f1VZkySE5McM72eZ80F/3VJnpHk8CQ/leQ+SZ68ynp+aXquLUnel+TFSf7T9DO/Y5IfCHgAdi8xCMC805M8ZC5CHjvdlyTp7rO6+6Lu/k53X5jk5Ul+9no8z6OTvKW73zKt6x1JPpJZ7CTJd5Lcsapu1N2Xd/eODvV8VJLndvcXuvuLmUXNY9Y4w7cyi8DbdPd13X1Od1+1o4W7+yXd/aXu/nZ3/1GSA5OseG5eVd18ei1P7+6t3f2FJP8rs/C6Puv/5cwi9xM9c8Eaz4G8R5LNSf6gu7/Z3e9K8qbMwnKbv+vuD0+HAb80syBdzXOm13RRkr/atq7p+/ehaf5Lkvx5Vt82Xt/d759+/tdm9vO4fVUd0t1f6e5z1/D6ALgBxCAA39Xd70tyRZJfnA4XPC6zvVRJkqq6e1WdWVVfrKqvZrY37fDr8VS3TvLQ6ZDCK6vqyiQ/k+SI7t6a5OHTui+fDmG83Q7Wc8skl87dvnS6by3+JsnbkrxiOuzxf0570lZUVc+sqo9X1VeneQ/Njl/7rZPsP82/7fX9eWZ75zIdrnnN9HH8GtZ/VJJ/XuPrmnfLJJd193fm7rs0ya3mbv/r3OdfyyweV3PZduu65TT/bavqTTW72NBVme0tXW3buGy72w/OLKA
vrap3V9VP7WQOAG4gMQjA9s7IbI/go5O8rbs/P/fYy5K8IclR3X1okj9Lsv0FZ7bZmuSgudu3mPv8siR/092HzX0c3N1/kCTd/bbuvl+SI5JcnOQvdvAcn8ssvLb54em+nerub3X3c7r79pkdinpKvneIbM8vOwXbr2d22OyNu/uwJF/N91779y0/vb5vJDl87vUd0t13mJ77DtNhnZu7+71rWP9lSVY8l28nPpfkqKqa///9Dyf57PVY1zZHbbeubd/vF2X2szqmuw/J7DzKHW0byXbfs+4+u7sflFkwvy6zQ5YBWCAxCMD2zkhy3yS/krlDRCdbkny5u6+tquMyO+9rR85P8oiq2r+qjk3ykLnHXpLkgVX1c9PFUTZNFz45crqIy4Omcwe/keSazA4bXcnLk/zOdK7h4Ul+b1r3TlXViVX1b6eLulyV2WGK257n80nm3xNvS2bn/30xyX5V9XuZnbuXueWP3hZd3X15Zufp/VFVHVJV+1TVj1XVjg6b3Nn6/zLJf6uqY2rmTvW9903cftZ5/5jZ3r5fn34OJyR5YJJXrPKt2ZnfraqDquoOSf5Dkm3nKW7J7Pt4zbQn90lrXWFVHVBVj6qqQ7v7W9N6dvQzB2A3EYMAfJ/pfK8PJDk4s72A856c5LlVdXVm4bXa3pvfzWxv1lcyO5fvu4ebdvdlSR6U2d6jL2a25+vXMvv/0j6ZXTDlc0m+nNl5ZzsKi+dldq7hhUkuSnLudN9a3CLJqzMLj48neXdmh44myQszO3fyK1X1J5kdTvrWJJ/M7NDIa/P9hzm+avrzS1W17Vy3xyY5IMnHpu/BqzPb07mSna3/BZl9r98+zfviJDeaHnt2ktOnw1EfNr/S7v5mZvH3gMwO//2/SR7b3Rev8n3ZmXdndqGZdyb5w+5++3T/MzP75cDVme3JfeXKX75Dj0lyyXSI6RMzOx8UgAWq7u2PbAEA+H5VdXSSf0my/wZ8z0kAVmDPIAAAwIDEIAAAwIDE4F6uqu5fVZ+oqk9V1W8sex7Wr6r6f1X1har6p2XPwvpWVUdNby/xsektEp627JlYvO6+pLtrVw8RnS4O9OGqumDaXp6zqBnZO0wXlTqvqt607FlY36rqkqq6qKrOr6qPLHuejcg5g3ux6Qp5n0xyvySfSXJ2kkd298eWOhjrUlXdK7OrNp7R3Xdc9jysX1V1RGbvB3huVW1Jck6SX/R3CyupqkpycHdfM72P4/uSPK27P7Tk0VinqupXkxyb5JDuPmXZ87B+VdUlSY7t7iuWPctGZc/g3u24JJ/q7k9PV5R7RWZX74Mf0N3vyezKjbCq7r68u8+dPr86sytx3mr1r2JUPXPNdHP/6cNvollRVR2Z5OczezsVYMHE4N7tVvn+S5N/Jv7BBuxG0xUm75rZ+9nBiqbD/s5P8oUk7+hu2ws78sdJfj3eZ5K16SRvr6pzqurUZQ+zEYlBAK6Xqtqc5DVJnt7dVy17Htav7r6uu++S5Mgkx1WVQ9H5AVV1SpIvdPc5y56FDeNnuvsnMnsv1f88nfLCLhCDe7fPJjlq7vaR030AN8h07tdrkry0u1+77HnYGLr7yiRnJrn/smdhXbpnkl+YzgN7RZJ7V9VLljsS61l3f3b68wtJ/i6zU6TYBWJw73Z2kmOq6keq6oAkj0jyhiXPBGxw0wVBXpzk4939gmXPw/pWVTetqsOmz2+U2UXNLl7uVKxH3f2b3X1kdx+d2b9Z3tXdj17yWKxTVXXwdBGzVNXBSU5K4orou0gM7sWmy38/JcnbMrvAw99290eXOxXrVVW9PMkHk/x4VX2mqv7jsmdi3bpnksdk9lv786ePk5c9FOvWEUnOrKoLM/sl5Tu621sGADfUzZO8r6ouSPLhJG/u7rcueaYNx1tLAAAADMieQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJwUFU1anLnoGNwbbCrrC9sFa2FXaF7YW1sq3cMGJwHP5DYa1
sK+wK2wtrZVthV9heWCvbyg0gBgEAAAa0173p/AF1YG/KwcseY935Vr6R/XPgssdYV2ofvwtZyTf72hxQm5Y9xrrTN/Lfz0q+9a2t2X9/f+fO++Yh/m5ZyXVf25p9D7KtbK/3XfYE69N1W7dm34NtL/O2bP76skdYl6698tpsOsy/W7b3pYu/dEV333Rny+23J4bZkzbl4Ny97rPsMdgA9tm8ZdkjsIFcd8cfXfYIbBCX3c8/YFm7bx72nWWPwAZxr5/+6LJHYAM54+5/delalvPrSwAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAGJQQAAgAEtJQar6po1LPPUqvp4Vb20qk6oqp/eE7MBAACMYD3vGXxykvt196OSnJBEDAIAAOwmS4/Bqvq1qjq7qi6squdM9/1Zkh9N8vdV9YwkT0zyjKo6v6qOX+a8AAAAe4P9lvnkVXVSkmOSHJekkryhqu7V3U+sqvsnObG7r6iqQ5Nc091/uMx5AQAA9hZLjcEkJ00f5023N2cWh+/ZlZVU1alJTk2STTlod84HAACwV1p2DFaS53f3n9+QlXT3aUlOS5JD6ia9OwYDAADYmy37nMG3JXlCVW1Okqq6VVXdbIXlrk6yZY9OBgAAsBdbagx299uTvCzJB6vqoiSvzsrR98Yk/84FZAAAAHaPpRwm2t2b5z5/YZIXrrDM0XOffzLJnfbIcAAAAANY9mGiAAAALIEYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGNB+O1ugqm6a5FeSHD2/fHc/YXFjAQAAsEg7jcEkr0/y3iT/kOS6xY4DAADAnrCWGDyou5+18EkAAADYY9ZyzuCbqurkhU8CAADAHrOWGHxaZkF4bVVdPX1ctejBAAAAWJydHiba3Vv2xCAAAADsOWs5ZzBV9QtJ7jXdPKu737S4kQAAAFi0nR4mWlV/kNmhoh+bPp5WVc9f9GAAAAAszlr2DJ6c5C7d/Z0kqarTk5yX5DcXORgAAACLs5YLyCTJYXOfH7qIQQAAANhz1rJn8PlJzquqM5NUZucO/sZCpwIAAGCh1nI10ZdX1VlJ7jbd9azu/teFTgUAAMBC7fAw0aq63fTnTyQ5Islnpo9bTvcBAACwQa22Z/BXk5ya5I9WeKyT3HshE91Atd++2fewmyx
7DDaA6445ctkjsIFcfvzByx6BDWLL3b+47BHYQP7yDn+z7BHYIO50wKZlj8AGcsYal9thDHb3qdOnD+jua+cfqypbIwAAwAa2lquJfmCN9wEAALBB7HDPYFXdIsmtktyoqu6a2ZVEk+SQJAftgdkAAABYkNXOGfy5JI9PcmRm5w1ui8GrkvzWYscCAABgkVY7Z/D0JKdX1YO7+zV7cCYAAAAWbC3nDP5kVR227UZV3biqnrfAmQAAAFiwtcTgA7r7ym03uvsrSU5e3EgAAAAs2lpicN+qOnDbjaq6UZIDV1keAACAdW61C8hs89Ik76yqv8rsIjKPT3L6IocCAABgsXYag939P6rqgiT3TdJJ3pbk1oseDAAAgMVZy2GiSfL5zELwoUnuneTjC5sIAACAhVvtTedvm+SR08cVSV6ZpLr7xD00GwAAAAuy2mGiFyd5b5JTuvtTSVJVz9gjUwEAALBQqx0m+u+TXJ7kzKr6i6q6T2YXkAEAAGCD22EMdvfruvsRSW6X5MwkT09ys6p6UVWdtKcGBAAAYPfb6QVkuntrd7+sux+Y5Mgk5yV51sInAwAAYGHWejXRJEl3f6W7T+vu+yxqIAAAABZvl2IQAACAvYMYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGJAYBAAAGNAei8GqektVHbaTZc6qqmNXuP8uVXXy4qYDAAAYyx6JwaqqJKd095XXcxV3SSIGAQAAdpOFxWBVHV1Vn6iqM5L8U5Lrqurw6bHfnR57X1W9vKqeOfelD62qD1fVJ6vq+Ko6IMlzkzy8qs6vqocvamYAAIBR7Lfg9R+T5HHd/aGquiRJqupuSR6c5M5J9k9ybpJz5mfq7uOmw0L/a3fft6p+L8mx3f2UBc8LAAAwhEXH4KXd/aHt7rtnktd397VJrq2qN273+GunP89JcvRanqSqTk1yapJs2mfz9Z8WAABgEIs+Z3Dr9fiab0x/Xpc1xmp3n9bdx3b3sQfss+l6PCUAAMBYlvHWEu9P8sCq2lRVm5OcsoavuTrJlsWOBQAAMI49HoPdfXaSNyS5MMnfJ7koyVd38mVnJrm9C8gAAADsHgs7Z7C7L0lyx7nbR889/Ifd/eyqOijJezJdQKa7T5hb/opM5wx295eT3G1RswIAAIxm0ReQ2ZHTqur2STYlOb27z13SHAAAAENaSgx29y8t43kBAACYWcYFZAAAAFgyMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADA
gMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADCg6u5lz7BbVdUXk1y67DnWocOTXLHsIdgQbCvsCtsLa2VbYVfYXlgr28rKbt3dN93ZQntdDLKyqvpIdx+77DlY/2wr7ArbC2tlW2FX2F5YK9vKDeMwUQAAgAGJQQAAgAGJwXGctuwB2DBsK+yKIbaXqvrtqvpoVV1YVedX1d2r6ulVddAavnZNyw1giG2F3cb2wlrZVm4A5wwCwCqq6qeSvCDJCd39jao6PMkBST6Q5NjuXvXCBVV1yVqWA4A9zZ5BAFjdEUmu6O5vJMkUdQ9JcsskZ1bVmUlSVS+qqo9MexCfM9331BWWO6mqPlhV51bVq6pq8zJeFADYMwgAq5hi7X1JDkryD0le2d3v3n6PX1XdpLu/XFX7Jnlnkqd294Xzy017FV+b5AHdvbWqnpXkwO5+7hJeGgCD22/ZAwDAetbd11TVTyY5PsmJSV5ZVb+xwqIPq6pTM/t/6xFJbp/kwu2Wucd0//urKpkdbvrBRc0OAKsRgwCwE919XZKzkpxVVRcledz841X1I0memeRu3f2VqvrrJJtWWFUleUd3P3KxEwPAzjlnEABWUVU/XlXHzN11lySXJrk6yZbpvkOSbE3y1aq6eZIHzC0/v9yHktyzqm4zrfvgqrrtIucHgB2xZxAAVrc5yf+uqsOSfDvJp5KcmuSRSd5aVZ/r7hOr6rwkFye5LMn7577+tO2We3ySl1fVgdPjv5Pkk3votQDAd7mADAAAwIAcJgoAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADAgMQgAADCg/w/tprD+xrdI1QAAAABJRU5ErkJggg==\n", + "image/png": "", "text/plain": [ - "" + "
" ] }, - "metadata": {}, + "metadata": { + "needs_background": "light" + }, "output_type": "display_data" } ], "source": [ - "plot_policy(agent)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Bonus 3.2 Bootstrapped DQN (10 points)\n", - "\n", - "Implement Bootstrapped DQN algorithm and compare it's performance with ordinary DQN on BeamRider Atari game. Links:\n", - "- https://arxiv.org/abs/1602.04621" + "test_agent(agent)" ] } ], "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "pygments_lexer": "ipython3" + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" } }, "nbformat": 4, diff --git a/week06_policy_based/README.md b/week06_policy_based/README.md index 5f5ea046f..52b00bad8 100644 --- a/week06_policy_based/README.md +++ b/week06_policy_based/README.md @@ -1,7 +1,7 @@ ## Materials * [Slides](https://yadi.sk/i/keSzKSgA2oYuwQ) * Video lecture by D. Silver - [video](https://www.youtube.com/watch?v=KHZVXao4qXs) -* Our [lecture](https://yadi.sk/i/yPIPkO_f3TPsNK), [seminar(pytorch)](https://yadi.sk/i/flW8ezGk3TPsQ5), [seminar(theano)](https://yadi.sk/i/8f9NX_E73GKBkT) +* Our [lecture](https://yadi.sk/i/yPIPkO_f3TPsNK), [seminar(pytorch)](https://yadi.sk/i/flW8ezGk3TPsQ5) * Alternative lecture by J. Schulman part 1 - [video](https://www.youtube.com/watch?v=BB-BhTn6DCM) * Alternative lecture by J. 
Schulman part 2 - [video](https://www.youtube.com/watch?v=Wnl-Qh2UHGg) * Andrej Karpathy's [post](http://karpathy.github.io/2016/05/31/rl/) on policy gradients diff --git a/week06_policy_based/a2c-optional.ipynb b/week06_policy_based/a2c-optional.ipynb index 4cb4186ad..2ffe4deaf 100644 --- a/week06_policy_based/a2c-optional.ipynb +++ b/week06_policy_based/a2c-optional.ipynb @@ -1,298 +1,409 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "if 'google.colab' in sys.modules:\n", - " import os\n", - "\n", - " os.system('apt-get install -y xvfb')\n", - " os.system('wget https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/xvfb -O ../xvfb')\n", - " os.system('apt-get install -y python-opengl ffmpeg')\n", - " os.system('pip install pyglet==1.2.4')\n", - "\n", - " os.system('python -m pip install -U pygame --user')\n", - "\n", - " print('setup complete')\n", - "\n", - "# XVFB will be launched if you run on a server\n", - "import os\n", - "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", - " !bash ../xvfb start\n", - " os.environ['DISPLAY'] = ':1'" - ] + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "o4vBVdNx2EPr" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting virtual X frame buffer: Xvfb../xvfb: line 24: start-stop-daemon: command not found\n", + ".\n" + ] + } + ], + "source": [ + "import sys, os\n", + "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", + " # Install xvfb and our launcher script for it\n", + " !apt-get install -y xvfb\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/xvfb -O ../xvfb\n", + "\n", + " # Download dependencies from Github\n", + " !wget 
https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week06_policy_based/atari_wrappers.py\n", + " !wget https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week06_policy_based/env_batch.py\n", + " !wget https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week06_policy_based/runners.py\n", + "\n", + " # Update the gym environment to be compatible with the Atari environment\n", + " !pip install -q gymnasium[atari,accept-rom-license]\n", + " !pip install -q tensorboardX\n", + "\n", + " !touch .setup_complete\n", + "\n", + "# This code creates a virtual display to draw game images on.\n", + "# It will have no effect if your machine has a monitor.\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " os.environ['DISPLAY'] = ':1'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "O_iJbFWQ2EPs" + }, + "source": [ + "# Implementing Advantage-Actor Critic (A2C)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "16ownLDJ2EPs" + }, + "source": [ + "In this notebook you will implement Advantage Actor Critic algorithm that trains on a batch of Atari 2600 environments running in parallel.\n", + "\n", + "Firstly, we will use environment wrappers implemented in file `atari_wrappers.py`. These wrappers preprocess observations (resize, grayscale, take max between frames, skip frames and stack them together) and rewards. Some of the wrappers help to reset the environment and pass `done` flag equal to `True` when agent dies.\n", + "File `env_batch.py` includes implementation of `ParallelEnvBatch` class that allows to run multiple environments in parallel. To create an environment we can use `nature_dqn_env` function. 
Note that if you are using\n", + "PyTorch and not using `tensorboardX` you will need to implement a wrapper that will log **raw** total rewards that the *unwrapped* environment returns and redefine the implementation of `nature_dqn_env` function here.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uScP-zu12EPt" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import gymnasium as gym\n", + "from atari_wrappers import nature_dqn_env\n", + "\n", + "\n", + "env_name = \"SpaceInvadersNoFrameskip-v4\"\n", + "nenvs = 8 # change this if you have more than 8 CPU ;)\n", + "summaries = \"Tensorboard\"\n", + "\n", + "env = nature_dqn_env(env_name, nenvs=nenvs, summaries=summaries)\n", + "obs, _ = env.reset()\n", + "assert obs.shape == (nenvs, 4, 84, 84)\n", + "assert obs.dtype == np.float32\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jiWeYgmd2EPt" + }, + "source": [ + "Next, we will need to implement a model that predicts logits and values. It is suggested that you use the same model as in [Nature DQN paper](https://www.nature.com/articles/nature14236) with a modification that instead of having a single output layer, it will have two output layers taking as input the output of the last hidden layer. **Note** that this model is different from the model you used in homework where you implemented DQN. You can use your favorite deep learning framework here. We suggest that you use orthogonal initialization with parameter $\\sqrt{2}$ for kernels and initialize biases with zeros." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "id": "FIkJ7z7TiWS4" + }, + "outputs": [], + "source": [ + "# import tensorflow as torch\n", + "# import torch as tf\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pA2VlyZ32EPt" + }, + "source": [ + "You will also need to define and use a policy that wraps the model. 
While the model computes logits for all actions, the policy will sample actions and also compute their log probabilities. `policy.act` should return a dictionary of all the arrays that are needed to interact with an environment and train the model.\n", + " Note that actions must be an `np.ndarray` while the other\n", + "tensors need to have the type determined by your deep learning framework." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "id": "dtHP-Fo72EPt" + }, + "outputs": [], + "source": [ + "class Policy:\n", + " def __init__(self, model):\n", + " self.model = model\n", + "\n", + " def act(self, inputs):\n", + " # Implement a policy by calling the model, sampling actions and computing their log probs.\n", + " # Should return a dict containing keys ['actions', 'logits', 'log_probs', 'values'].\n", + "\n", + " \n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2oPCQwsd2EPt" + }, + "source": [ + "Next, we will pass the environment and policy to a runner that collects partial trajectories from the environment.\n", + "The class that does this is already implemented for you." + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "id": "fj-fKr_A2EPt" + }, + "outputs": [], + "source": [ + "from runners import EnvRunner" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_9JehIbH2EPt" + }, + "source": [ + "This runner interacts with the environment for a given number of steps and returns a dictionary containing\n", + "keys\n", + "\n", + "* 'observations'\n", + "* 'rewards'\n", + "* 'resets'\n", + "* 'actions'\n", + "* all other keys that you defined in `Policy`\n", + "\n", + "under each of these keys there is a python `list` of interactions with the environment. This list has length $T$, which is the size of the partial trajectory. The partial trajectory for a given moment `t` is the part of the `ComputeValueTargets.__call__` input argument `trajectory` from moment `t` to the end (i.e. 
it's different at each iteration in the algorithm)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iY7FB6s72EPu" + }, + "source": [ + "To train the part of the model that predicts state values you will need to compute the value targets.\n", + "Any callable could be passed to `EnvRunner` to be applied to each partial trajectory after it is collected.\n", + "Thus, we can implement and use `ComputeValueTargets` callable.\n", + "The formula for the value targets is simple:\n", + "\n", + "$$\n", + "\\hat v(s_t) = \\left( \\sum_{t'=0}^{T - 1} \\gamma^{t'}r_{t+t'} \\right) + \\gamma^T \\hat{v}(s_{t+T}),\n", + "$$\n", + "\n", + "In implementation, however, do not forget to use\n", + "`trajectory['resets']` flags to check if you need to add the value targets at the next step when\n", + "computing value targets for the current step. You can access `trajectory['state']['latest_observation']`\n", + "to get last observations in partial trajectory — $s_{t+T}$." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "id": "4CbDi3GZ2EPu" + }, + "outputs": [], + "source": [ + "class ComputeValueTargets:\n", + " def __init__(self, policy, gamma=0.99):\n", + " self.policy = policy\n", + " self.gamma = gamma\n", + "\n", + " def __call__(self, trajectory):\n", + " \"\"\"Compute value targets for a given partial trajectory.\"\"\"\n", + "\n", + " # This method should modify trajectory inplace by adding\n", + " # an item with key 'value_targets' to it.\n", + "\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9_d9OYyz2EPu" + }, + "source": [ + "After computing value targets we will transform lists of interactions into tensors\n", + "with the first dimension `batch_size` which is equal to `env_steps * num_envs`, i.e. you essentially need\n", + "to flatten the first two dimensions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "id": "IEnqWlHh2EPu" + }, + "outputs": [], + "source": [ + "class MergeTimeBatch:\n", + " \"\"\" Merges first two axes typically representing time and env batch. \"\"\"\n", + " def __call__(self, trajectory):\n", + " # Modify trajectory inplace.\n", + "\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "id": "-2CwwzLl2EPu" + }, + "outputs": [], + "source": [ + "model = \n", + "policy = Policy(model)\n", + "runner = EnvRunner(\n", + " env=env,\n", + " policy=policy,\n", + " nsteps=5,\n", + " transforms=[\n", + " ComputeValueTargets(policy),\n", + " MergeTimeBatch(),\n", + " ],\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IuYy-8Ri2EPu" + }, + "source": [ + "Now is the time to implement the advantage actor critic algorithm itself. You can look into your lecture,\n", + "[Mnih et al. 2016](https://arxiv.org/abs/1602.01783) paper, and [lecture](https://www.youtube.com/watch?v=Tol_jw5hWnI&list=PLkFD6_40KJIxJMR-j5A1mkxK26gh_qg37&index=20) by Sergey Levine." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "id": "hxFLzyRX2EPu" + }, + "outputs": [], + "source": [ + "class A2C:\n", + " def __init__(self,\n", + " policy,\n", + " optimizer,\n", + " value_loss_coef=0.25,\n", + " entropy_coef=0.01,\n", + " max_grad_norm=0.5):\n", + " self.policy = policy\n", + " self.optimizer = optimizer\n", + " self.value_loss_coef = value_loss_coef\n", + " self.entropy_coef = entropy_coef\n", + " self.max_grad_norm = max_grad_norm\n", + "\n", + " def policy_loss(self, trajectory):\n", + " # You will need to compute advantages here.\n", + " \n", + "\n", + " def value_loss(self, trajectory):\n", + " \n", + "\n", + " def loss(self, trajectory):\n", + " \n", + "\n", + " def step(self, trajectory):\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JIMtFZuG2EPu" + }, + "source": [ + "Now you can train your model. With reasonable hyperparameters training on a single GTX1080 for 10 million steps across all batched environments (which translates to about 5 hours of wall clock time)\n", + "it should be possible to achieve *average raw reward over last 100 episodes* (the average is taken over 100 last\n", + "episodes in each environment in the batch) of about 600. You should plot this quantity with respect to\n", + "`runner.step_var` — the number of interactions with all environments. 
It is highly\n", + "encouraged to also provide plots of the following quantities (these are useful for debugging as well):\n", + "\n", + "* [Coefficient of Determination](https://en.wikipedia.org/wiki/Coefficient_of_determination) between\n", + "value targets and value predictions\n", + "* Entropy of the policy $\\pi$\n", + "* Value loss\n", + "* Policy loss\n", + "* Value targets\n", + "* Value predictions\n", + "* Gradient norm\n", + "* Advantages\n", + "* A2C loss\n", + "\n", + "For optimization we suggest you use RMSProp with learning rate starting from 7e-4 and linearly decayed to 0, smoothing constant (alpha in PyTorch and decay in TensorFlow) equal to 0.99 and epsilon equal to 1e-5." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#if you use TensorboardSummaries\n", + "%load_ext tensorboard\n", + "%tensorboard --logdir logs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "a2c = \n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZDbgUdMq2EPu" + }, + "source": [ + "### Target networks?\n", + "\n", + "You may recall a technique called \"target networks\" we used a few weeks ago when we trained a DQN agent to play Atari Breakout and wonder why we have not suggested using them here. The answer is that this is more historical than practical.\n", + "\n", + "While the \"chasing the target\" problem is still present in actor-critic value estimation and target networks do show up in follow-up papers, the original A3C/A2C papers do not mention them and do not explain this omission.\n", + "\n", + "The hypothesis why this may not be a big deal (compared to Q-learning) goes like this. An A3C/A2C agent selects actions based on policy, not an epsilon greedy exploration function, for which the argmax can change drastically due to tiny errors in function approximation. 
Therefore, errors in the value target caused by target chasing will cause less damage.\n", + "\n", + "Also, the actor-critic gradient relies on the advantage function $A(s_t, a_t) = Q(s_t, a_t) - V(s_t)$. Compare this to the $Q$-function $Q(s_t, a_t) = r(s_t, a_t) + \\gamma \\cdot \\mathbb{E}_{s_{t+1} \\mid s_t, a_t} V(s_{t+1})$ used in Q-learning and SARSA: we would expect that any bias in $V$-function approximation will be carried over from $V(s_{t+1})$ to $V(s_t)$ by gradient updates. However, in the formula for the advantage function the two approximations ($Q$-function and $V$-function) come with opposite signs, and thus the errors will cancel out.\n", + "\n", + "The last reason may be computational. The authors were concerned with beating existing algorithms in wall-clock learning time, and any overhead of parameter copying (target network update) counted against this goal." + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Implementing Advantage-Actor Critic (A2C)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this notebook you will implement Advantage Actor Critic algorithm that trains on a batch of Atari 2600 environments running in parallel. \n", - "\n", - "Firstly, we will use environment wrappers implemented in file `atari_wrappers.py`. These wrappers preprocess observations (resize, grayscal, take max between frames, skip frames and stack them together) and rewards. 
Some of the wrappers help to reset the environment and pass `done` flag equal to `True` when agent dies.\n", - "File `env_batch.py` includes implementation of `ParallelEnvBatch` class that allows to run multiple environments in parallel. To create an environment we can use `nature_dqn_env` function. Note that if you are using \n", - "PyTorch and not using `tensorboardX` you will need to implement a wrapper that will log **raw** total rewards that the *unwrapped* environment returns and redefine the implemention of `nature_dqn_env` function here. \n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "from atari_wrappers import nature_dqn_env, NumpySummaries\n", - "\n", - "\n", - "env = nature_dqn_env(\"SpaceInvadersNoFrameskip-v4\", nenvs=8, summaries='Numpy')\n", - "obs = env.reset()\n", - "assert obs.shape == (8, 84, 84, 4)\n", - "assert obs.dtype == np.uint8" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we will need to implement a model that predicts logits and values. It is suggested that you use the same model as in [Nature DQN paper](https://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf) with a modification that instead of having a single output layer, it will have two output layers taking as input the output of the last hidden layer. **Note** that this model is different from the model you used in homework where you implemented DQN. You can use your favorite deep learning framework here. We suggest that you use orthogonal initialization with parameter $\\sqrt{2}$ for kernels and initialize biases with zeros. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# import tensorflow as torch\n", - "# import torch as tf\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You will also need to define and use a policy that wraps the model. While the model computes logits for all actions, the policy will sample actions and also compute their log probabilities. `policy.act` should return a dictionary of all the arrays that are needed to interact with an environment and train the model.\n", - " Note that actions must be an `np.ndarray` while the other\n", - "tensors need to have the type determined by your deep learning framework. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class Policy:\n", - " def __init__(self, model):\n", - " self.model = model\n", - " \n", - " def act(self, inputs):\n", - " \n", - " # Should return a dict containing keys ['actions', 'logits', 'log_probs', 'values']." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next will pass the environment and policy to a runner that collects partial trajectories from the environment. \n", - "The class that does is is already implemented for you." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from runners import EnvRunner" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This runner interacts with the environment for a given number of steps and returns a dictionary containing\n", - "keys \n", - "\n", - "* 'observations' \n", - "* 'rewards' \n", - "* 'resets'\n", - "* 'actions'\n", - "* all other keys that you defined in `Policy`\n", - "\n", - "under each of these keys there is a python `list` of interactions with the environment of specified length $T$ — the size of partial trajectory. 
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To train the part of the model that predicts state values you will need to compute the value targets. \n", - "Any callable could be passed to `EnvRunner` to be applied to each partial trajectory after it is collected. \n", - "Thus, we can implement and use `ComputeValueTargets` callable. \n", - "The formula for the value targets is simple:\n", - "\n", - "$$\n", - "\\hat v(s_t) = \\left( \\sum_{t'=0}^{T - 1 - t} \\gamma^{t'}r_{t+t'} \\right) + \\gamma^T \\hat{v}(s_{t+T}),\n", - "$$\n", - "\n", - "In implementation, however, do not forget to use \n", - "`trajectory['resets']` flags to check if you need to add the value targets at the next step when \n", - "computing value targets for the current step. You can access `trajectory['state']['latest_observation']`\n", - "to get last observations in partial trajectory — $s_{t+T}$." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class ComputeValueTargets:\n", - " def __init__(self, policy, gamma=0.99):\n", - " self.policy = policy\n", - " \n", - " def __call__(self, trajectory):\n", - " # This method should modify trajectory inplace by adding\n", - " # an item with key 'value_targets' to it.\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After computing value targets we will transform lists of interactions into tensors\n", - "with the first dimension `batch_size` which is equal to `T * nenvs`, i.e. you essentially need\n", - "to flatten the first two dimensions. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class MergeTimeBatch:\n", - " \"\"\" Merges first two axes typically representing time and env batch. 
\"\"\"\n", - " def __call__(self, trajectory):\n", - " # Modify trajectory inplace.\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model = \n", - "policy = Policy(model)\n", - "runner = EnvRunner(\n", - " env, policy, nsteps=5,\n", - " transforms=[\n", - " ComputeValueTargets(),\n", - " MergeTimeBatch(),\n", - " ])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now is the time to implement the advantage actor critic algorithm itself. You can look into your lecture,\n", - "[Mnih et al. 2016](https://arxiv.org/abs/1602.01783) paper, and [lecture](https://www.youtube.com/watch?v=Tol_jw5hWnI&list=PLkFD6_40KJIxJMR-j5A1mkxK26gh_qg37&index=20) by Sergey Levine." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class A2C:\n", - " def __init__(self,\n", - " policy,\n", - " optimizer,\n", - " value_loss_coef=0.25,\n", - " entropy_coef=0.01,\n", - " max_grad_norm=0.5):\n", - " self.policy = policy\n", - " self.optimizer = optimizer\n", - " self.value_loss_coef = value_loss_coef\n", - " self.entropy_coef = entropy_coef\n", - " self.max_grad_norm = max_grad_norm\n", - " \n", - " def policy_loss(self, trajectory):\n", - " # You will need to compute advantages here.\n", - " \n", - " \n", - " def value_loss(self, trajectory):\n", - " \n", - " \n", - " def loss(self, trajectory):\n", - " \n", - " \n", - " def step(self, trajectory):\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now you can train your model. With reasonable hyperparameters training on a single GTX1080 for 10 million steps across all batched environments (which translates to about 5 hours of wall clock time)\n", - "it should be possible to achieve *average raw reward over last 100 episodes* (the average is taken over 100 last \n", - "episodes in each environment in the batch) of about 600. 
You should plot this quantity with respect to \n", - "`runner.step_var` — the number of interactions with all environments. It is highly \n", - "encouraged to also provide plots of the following quantities (these are useful for debugging as well):\n", - "\n", - "* [Coefficient of Determination](https://en.wikipedia.org/wiki/Coefficient_of_determination) between \n", - "value targets and value predictions\n", - "* Entropy of the policy $\\pi$\n", - "* Value loss\n", - "* Policy loss\n", - "* Value targets\n", - "* Value predictions\n", - "* Gradient norm\n", - "* Advantages\n", - "* A2C loss\n", - "\n", - "For optimization we suggest you use RMSProp with learning rate starting from 7e-4 and linearly decayed to 0, smoothing constant (alpha in PyTorch and decay in TensorFlow) equal to 0.99 and epsilon equal to 1e-5." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "a2c = \n", - "\n", - "" - ] - } - ], - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/week06_policy_based/atari_wrappers.py b/week06_policy_based/atari_wrappers.py index b1a9234de..ffd0ce25f 100644 --- a/week06_policy_based/atari_wrappers.py +++ b/week06_policy_based/atari_wrappers.py @@ -2,139 +2,149 @@ from collections import defaultdict, deque import cv2 -import gym -import gym.spaces as spaces -from gym.envs import atari +import gymnasium as gym import numpy as np +from gymnasium import ObservationWrapper, RewardWrapper, Wrapper +from gymnasium.spaces import Box +from gymnasium.wrappers import RecordVideo +from shimmy.atari_env import AtariEnv +from tensorboardX import SummaryWriter from env_batch import ParallelEnvBatch + cv2.ocl.setUseOpenCL(False) -class EpisodicLife(gym.Wrapper): - """ Sets done flag to true when agent dies. 
""" +class EpisodicLife(Wrapper): + """Sets done flag to true when agent dies.""" def __init__(self, env): - super(EpisodicLife, self).__init__(env) + super().__init__(env) self.lives = 0 self.real_done = True def step(self, action): - obs, rew, done, info = self.env.step(action) - self.real_done = done - info["real_done"] = done + obs, reward, terminated, truncated, info = self.env.step(action) + self.real_done = terminated or truncated + info["real_done"] = self.real_done lives = self.env.unwrapped.ale.lives() if 0 < lives < self.lives: - done = True + terminated = True self.lives = lives - return obs, rew, done, info + return obs, reward, terminated, truncated, info def reset(self, **kwargs): if self.real_done: - obs = self.env.reset(**kwargs) + obs, info = self.env.reset(**kwargs) else: - obs, _, _, _ = self.env.step(0) + obs, _, terminated, truncated, info = self.env.step(0) + if terminated or truncated: + obs, info = self.env.reset(**kwargs) self.lives = self.env.unwrapped.ale.lives() - return obs + return obs, info -class FireReset(gym.Wrapper): - """ Makes fire action when reseting environment. +class FireReset(Wrapper): + """Makes fire action when reseting environment. Some environments are fixed until the agent makes the fire action, this wrapper makes this action so that the epsiode starts automatically. 
""" def __init__(self, env): - super(FireReset, self).__init__(env) + super().__init__(env) action_meanings = env.unwrapped.get_action_meanings() if len(action_meanings) < 3: raise ValueError( "env.unwrapped.get_action_meanings() must be of length >= 3" - f"but is of length {len(action_meanings)}") + f"but is of length {len(action_meanings)}" + ) if env.unwrapped.get_action_meanings()[1] != "FIRE": raise ValueError( "env.unwrapped.get_action_meanings() must have 'FIRE' " - f"under index 1, but is {action_meanings}") + f"under index 1, but is {action_meanings}" + ) def step(self, action): return self.env.step(action) def reset(self, **kwargs): self.env.reset(**kwargs) - obs, _, done, _ = self.env.step(1) - if done: + obs, _, terminated, truncated, _ = self.env.step(1) + if terminated or truncated: self.env.reset(**kwargs) - obs, _, done, _ = self.env.step(2) - if done: + obs, _, terminated, truncated, _ = self.env.step(2) + if terminated or truncated: self.env.reset(**kwargs) - return obs + return obs, {} -class StartWithRandomActions(gym.Wrapper): - """ Makes random number of random actions at the beginning of each - episode. 
""" +class StartWithRandomActions(Wrapper): + """Makes random number of random actions at the beginning of each + episode.""" def __init__(self, env, max_random_actions=30): - super(StartWithRandomActions, self).__init__(env) + super().__init__(env) self.max_random_actions = max_random_actions self.real_done = True def step(self, action): - obs, rew, done, info = self.env.step(action) + obs, reward, terminated, truncated, info = self.env.step(action) self.real_done = info.get("real_done", True) - return obs, rew, done, info + return obs, reward, terminated, truncated, info def reset(self, **kwargs): - obs = self.env.reset() + obs, info = self.env.reset(**kwargs) if self.real_done: - num_random_actions = np.random.randint(self.max_random_actions + 1) + num_random_actions = self.unwrapped.np_random.integers( + low=1, high=self.max_random_actions + 1 + ) for _ in range(num_random_actions): - obs, _, _, _ = self.env.step(self.env.action_space.sample()) + obs, _, _, _, info = self.env.step(self.env.action_space.sample()) self.real_done = False - return obs + return obs, info -class ImagePreprocessing(gym.ObservationWrapper): - """ Preprocesses image-observations by possibly grayscaling and resizing. 
""" +class ImagePreprocessing(ObservationWrapper): + """Preprocesses image-observations by possibly grayscaling and resizing.""" - def __init__(self, env, width=84, height=84, grayscale=True): - super(ImagePreprocessing, self).__init__(env) - self.width = width + def __init__(self, env, height=84, width=84, grayscale=True): + super().__init__(env) self.height = height + self.width = width self.grayscale = grayscale ospace = self.env.observation_space low, high, dtype = ospace.low.min(), ospace.high.max(), ospace.dtype if self.grayscale: - self.observation_space = spaces.Box( + self.observation_space = Box( low=low, high=high, - shape=(width, height), + shape=(height, width), dtype=dtype, ) else: - obs_shape = (width, height) + self.observation_space.shape[2:] - self.observation_space = spaces.Box(low=low, high=high, - shape=obs_shape, dtype=dtype) + self.observation_space = Box( + low=low, + high=high, + shape=(height, width, *self.observation_space.shape[2:]), + dtype=dtype, + ) def observation(self, observation): - """ Performs image preprocessing. """ + """Performs image preprocessing.""" if self.grayscale: observation = cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY) - observation = cv2.resize(observation, (self.width, self.height), - cv2.INTER_AREA) + observation = cv2.resize(observation, (self.width, self.height), cv2.INTER_AREA) return observation -class MaxBetweenFrames(gym.ObservationWrapper): - """ Takes maximum between two subsequent frames. 
""" +class MaxBetweenFrames(ObservationWrapper): + """Takes maximum between two subsequent frames.""" def __init__(self, env): - if (isinstance(env.unwrapped, atari.AtariEnv) and - "NoFrameskip" not in env.spec.id): - raise ValueError( - "MaxBetweenFrames requires NoFrameskip in Atari env id") - super(MaxBetweenFrames, self).__init__(env) + if isinstance(env.unwrapped, AtariEnv) and "NoFrameskip" not in env.spec.id: + raise ValueError("MaxBetweenFrames requires NoFrameskip in atari env id") + super().__init__(env) self.last_obs = None def observation(self, observation): @@ -143,15 +153,15 @@ def observation(self, observation): return obs def reset(self, **kwargs): - self.last_obs = self.env.reset() - return self.last_obs + self.last_obs, info = self.env.reset(**kwargs) + return self.last_obs, info -class QueueFrames(gym.ObservationWrapper): - """ Queues specified number of frames together along new dimension. """ +class QueueFrames(ObservationWrapper): + """Queues specified number of frames together along new dimension.""" def __init__(self, env, nframes, concat=False): - super(QueueFrames, self).__init__(env) + super().__init__(env) self.obs_queue = deque([], maxlen=nframes) self.concat = concat ospace = self.observation_space @@ -159,110 +169,143 @@ def __init__(self, env, nframes, concat=False): oshape = ospace.shape[:-1] + (ospace.shape[-1] * nframes,) else: oshape = ospace.shape + (nframes,) - self.observation_space = spaces.Box( - ospace.low.min(), ospace.high.max(), oshape, ospace.dtype) + self.observation_space = Box( + ospace.low.min(), ospace.high.max(), oshape, ospace.dtype + ) def observation(self, observation): self.obs_queue.append(observation) - return (np.concatenate(self.obs_queue, -1) if self.concat - else np.dstack(self.obs_queue)) + return ( + np.concatenate(self.obs_queue, -1) + if self.concat + else np.dstack(self.obs_queue) + ) def reset(self, **kwargs): - obs = self.env.reset() + obs, info = self.env.reset(**kwargs) for _ in 
range(self.obs_queue.maxlen - 1): self.obs_queue.append(obs) - return self.observation(obs) + return self.observation(obs), info -class SkipFrames(gym.Wrapper): - """ Performs the same action for several steps and returns the final result. - """ +class SkipFrames(Wrapper): + """Performs the same action for several steps and returns the final result.""" def __init__(self, env, nskip=4): - super(SkipFrames, self).__init__(env) - if (isinstance(env.unwrapped, atari.AtariEnv) and - "NoFrameskip" not in env.spec.id): - raise ValueError("SkipFrames requires NoFrameskip in Atari env id") + super().__init__(env) + if isinstance(env.unwrapped, AtariEnv) and "NoFrameskip" not in env.spec.id: + raise ValueError("SkipFrames requires NoFrameskip in atari env id") self.nskip = nskip def step(self, action): total_reward = 0.0 for _ in range(self.nskip): - obs, rew, done, info = self.env.step(action) - total_reward += rew - if done: + obs, reward, terminated, truncated, info = self.env.step(action) + total_reward += reward + if terminated or truncated: break - return obs, total_reward, done, info + return obs, total_reward, terminated, truncated, info def reset(self, **kwargs): return self.env.reset(**kwargs) -class ClipReward(gym.RewardWrapper): - """ Modifes reward to be in {-1, 0, 1} by taking sign of it. 
""" +class ClipReward(RewardWrapper): + """Modifes reward to be in {-1, 0, 1} by taking sign of it.""" def reward(self, reward): return np.sign(reward) -class SummariesBase(gym.Wrapper): - """ Env summaries writer base.""" +class SwapImageAxes(ObservationWrapper): + """ + Image shape to num_channels x height x width and normalization + """ - def __init__(self, env, prefix=None, running_mean_size=100): + def __init__(self, env): + super().__init__(env) + old_shape = self.observation_space.shape + self.observation_space = Box( + low=0.0, + high=1.0, + shape=(old_shape[-1], old_shape[0], old_shape[1]), + dtype=np.float32, + ) + + def observation(self, observation): + return np.transpose(observation, (2, 0, 1)).astype(np.float32) / 255.0 + + +class SummariesBase(Wrapper): + """Env summaries writer base.""" + + def __init__(self, env, prefix=None, running_mean_size=100, step_var=None): super().__init__(env) self.episode_counter = 0 self.prefix = prefix or self.env.spec.id + self.step_var = step_var or 0 - nenvs = getattr(self.env.unwrapped, "nenvs", 1) - self.rewards = np.zeros(nenvs) - self.had_ended_episodes = np.zeros(nenvs, dtype=np.bool) - self.episode_lengths = np.zeros(nenvs) - self.reward_queues = [deque([], maxlen=running_mean_size) - for _ in range(nenvs)] + self.nenvs = getattr(self.env.unwrapped, "nenvs", 1) + self.rewards = np.zeros(self.nenvs) + self.had_ended_episodes = np.zeros(self.nenvs, dtype=bool) + self.episode_lengths = np.zeros(self.nenvs) + self.reward_queues = [ + deque([], maxlen=running_mean_size) for _ in range(self.nenvs) + ] def should_write_summaries(self): - """ Returns true if it's time to write summaries. """ + """Returns true if it's time to write summaries.""" return np.all(self.had_ended_episodes) def add_summaries(self): - """ Writes summaries. 
""" - self.add_summary_scalar( - f"{self.prefix}/total_reward", - np.mean([q[-1] for q in self.reward_queues])) - self.add_summary_scalar( - f"{self.prefix}/reward_mean_{self.reward_queues[0].maxlen}", - np.mean([np.mean(q) for q in self.reward_queues])) - self.add_summary_scalar( - f"{self.prefix}/episode_length", - np.mean(self.episode_lengths)) + """Writes summaries.""" + self.add_summary( + f"Episodes/total_reward", np.mean([q[-1] for q in self.reward_queues]) + ) + self.add_summary( + f"Episodes/reward_mean_{self.reward_queues[0].maxlen}", + np.mean([np.mean(q) for q in self.reward_queues]), + ) + self.add_summary(f"Episodes/episode_length", np.mean(self.episode_lengths)) if self.had_ended_episodes.size > 1: - self.add_summary_scalar( - f"{self.prefix}/min_reward", - min(q[-1] for q in self.reward_queues)) - self.add_summary_scalar( - f"{self.prefix}/max_reward", - max(q[-1] for q in self.reward_queues)) + self.add_summary( + f"Episodes/min_reward", + min(q[-1] for q in self.reward_queues), + ) + self.add_summary( + f"Episodes/max_reward", + max(q[-1] for q in self.reward_queues), + ) self.episode_lengths.fill(0) self.had_ended_episodes.fill(False) def step(self, action): - obs, rew, done, info = self.env.step(action) + obs, rew, terminated, truncated, info = self.env.step(action) self.rewards += rew self.episode_lengths[~self.had_ended_episodes] += 1 info_collection = [info] if isinstance(info, dict) else info - done_collection = [done] if isinstance(done, bool) else done - done_indices = [i for i, info in enumerate(info_collection) - if info.get("real_done", done_collection[i])] + terminated_collection = ( + [terminated] if isinstance(terminated, bool) else terminated + ) + truncated_collection = [truncated] if isinstance(truncated, bool) else truncated + done_indices = [ + i + for i, info in enumerate(info_collection) + if info.get( + "real_done", terminated_collection[i] or truncated_collection[i] + ) + ] for i in done_indices: if not 
self.had_ended_episodes[i]: self.had_ended_episodes[i] = True self.reward_queues[i].append(self.rewards[i]) self.rewards[i] = 0 + self.step_var += self.nenvs if self.should_write_summaries(): self.add_summaries() - return obs, rew, done, info + return obs, rew, terminated, truncated, info def reset(self, **kwargs): self.rewards.fill(0) @@ -271,30 +314,23 @@ def reset(self, **kwargs): return self.env.reset(**kwargs) -class TFSummaries(SummariesBase): - """ Writes env summaries using TensorFlow.""" +class TensorboardSummaries(SummariesBase): + """Writes env summaries using Tensorboard.""" def __init__(self, env, prefix=None, running_mean_size=100, step_var=None): + super().__init__(env, prefix, running_mean_size, step_var) + self.writer = SummaryWriter(f"logs/{self.prefix}") - super().__init__(env, prefix, running_mean_size) - - import tensorflow as tf - self.step_var = (step_var if step_var is not None - else tf.train.get_global_step()) - - def add_summary_scalar(self, name, value): - import tensorflow as tf - tf.contrib.summary.scalar(name, value, step = self.step_var) + def add_summary(self, name, value): + if isinstance(value, dict): + self.writer.add_scalars(name, value, self.step_var) + else: + self.writer.add_scalar(name, value, self.step_var) class NumpySummaries(SummariesBase): _summaries = defaultdict(list) - _summary_step = None - - @classmethod - def set_step(cls, step): - cls._summary_step = step @classmethod def get_values(cls, name): @@ -304,16 +340,44 @@ def get_values(cls, name): def clear(cls): cls._summaries = defaultdict(list) - def __init__(self, env, prefix = None, running_mean_size = 100): - super().__init__(env, prefix, running_mean_size) + def __init__(self, env, prefix=None, running_mean_size=100, step_var=None): + super().__init__(env, prefix, running_mean_size, step_var) + + def add_summary(self, name, value): + self._summaries[name].append((self.step_var, value)) + + +def get_summaries_class(summaries): + summaries_class_map = { + 
"Numpy": NumpySummaries, + "Tensorboard": TensorboardSummaries, + } + if summaries in summaries_class_map: + return summaries_class_map[summaries] + + raise NotImplementedError( + f"Unknown summaries: {summaries}. Supported summaries: {summaries_class_map.keys()}" + ) - def add_summary_scalar(self, name, value): - self._summaries[name].append((self._summary_step, value)) +# magic for parallel launching of environments +class _thunk: + def __init__(self, i, env_id, **kwargs): + self.env_id = env_id + self.i = i + self.kwargs = kwargs -def nature_dqn_env(env_id, nenvs=None, seed=None, - summaries='TensorFlow', clip_reward=True): - """ Wraps env as in Nature DQN paper. """ + def __call__(self): + return nature_dqn_env( + self.env_id, + summaries=False, + clip_reward=False, + **self.kwargs, + ) + + +def nature_dqn_env(env_id, nenvs=None, seed=None, summaries="Numpy", clip_reward=True): + """Wraps env as in Nature DQN paper.""" if "NoFrameskip" not in env_id: raise ValueError(f"env_id must have 'NoFrameskip' but is {env_id}") if nenvs is not None: @@ -322,25 +386,24 @@ def nature_dqn_env(env_id, nenvs=None, seed=None, if isinstance(seed, int): seed = [seed] * nenvs if len(seed) != nenvs: - raise ValueError(f"seed has length {len(seed)} but must have " - f"length equal to nenvs which is {nenvs}") - - env = ParallelEnvBatch([ - lambda i=i, env_seed=env_seed: nature_dqn_env( - env_id, seed=env_seed, summaries=False, clip_reward=False) - for i, env_seed in enumerate(seed) - ]) + raise ValueError( + f"seed has length {len(seed)} but must have " + f"length equal to nenvs which is {nenvs}" + ) + + thunks = [_thunk(i, env_id) for i in range(nenvs)] + env = ParallelEnvBatch(make_env=thunks, seeds=seed) + if summaries: - summaries_class = NumpySummaries if summaries == 'Numpy' else TFSummaries + summaries_class = get_summaries_class(summaries) env = summaries_class(env, prefix=env_id) if clip_reward: env = ClipReward(env) return env - env = gym.make(env_id) - env.seed(seed) + env 
= gym.make(env_id, render_mode="rgb_array") if summaries: - env = TFSummaries(env) + env = TensorboardSummaries(env) env = EpisodicLife(env) if "FIRE" in env.unwrapped.get_action_meanings(): env = FireReset(env) @@ -349,6 +412,7 @@ def nature_dqn_env(env_id, nenvs=None, seed=None, env = SkipFrames(env, 4) env = ImagePreprocessing(env, width=84, height=84, grayscale=True) env = QueueFrames(env, 4) + env = SwapImageAxes(env) if clip_reward: env = ClipReward(env) return env diff --git a/week06_policy_based/env_batch.py b/week06_policy_based/env_batch.py index 1e23913e9..b2bd163ac 100644 --- a/week06_policy_based/env_batch.py +++ b/week06_policy_based/env_batch.py @@ -1,8 +1,9 @@ # pylint: skip-file -from multiprocessing import Process, Pipe +from multiprocessing import Pipe, Process -from gym import Env, Wrapper, Space import numpy as np +from gymnasium import Env, Wrapper +from gymnasium.spaces import Space class SpaceBatch(Space): @@ -12,18 +13,26 @@ def __init__(self, spaces): first_dtype = spaces[0].dtype for space in spaces: if not isinstance(space, first_type): - raise TypeError("spaces have different types: {}, {}" - .format(first_type, type(space))) + raise TypeError( + "spaces have different types: {}, {}".format( + first_type, type(space) + ) + ) if first_shape != space.shape: - raise ValueError("spaces have different shapes: {}, {}" - .format(first_shape, space.shape)) + raise ValueError( + "spaces have different shapes: {}, {}".format( + first_shape, space.shape + ) + ) if first_dtype != space.dtype: - raise ValueError("spaces have different data types: {}, {}" - .format(first_dtype, space.dtype)) + raise ValueError( + "spaces have different data types: {}, {}".format( + first_dtype, space.dtype + ) + ) self.spaces = spaces - super(SpaceBatch, self).__init__(shape=self.spaces[0].shape, - dtype=self.spaces[0].dtype) + super().__init__(shape=self.spaces[0].shape, dtype=self.spaces[0].dtype) def sample(self): return np.stack([space.sample() for space in 
self.spaces]) @@ -39,16 +48,15 @@ def __init__(self, make_env, nenvs=None): self._nenvs = len(self.envs) # self.observation_space = SpaceBatch([env.observation_space # for env in self._envs]) - self.action_space = SpaceBatch([env.action_space - for env in self._envs]) + self.action_space = SpaceBatch([env.action_space for env in self._envs]) def _get_make_env_functions(self, make_env, nenvs): if nenvs is None and not isinstance(make_env, list): - raise ValueError("When nenvs is None make_env" - " must be a list of callables") - if nenvs is not None and not callable(make_env): raise ValueError( - "When nenvs is not None make_env must be callable") + "When nenvs is None make_env" " must be a list of callables" + ) + if nenvs is not None and not callable(make_env): + raise ValueError("When nenvs is not None make_env must be callable") if nenvs is not None: make_env = [make_env for _ in range(nenvs)] @@ -66,29 +74,41 @@ def _check_actions(self, actions): if not len(actions) == self.nenvs: raise ValueError( "number of actions is not equal to number of envs: " - "len(actions) = {}, nenvs = {}" - .format(len(actions), self.nenvs)) + "len(actions) = {}, nenvs = {}".format(len(actions), self.nenvs) + ) def step(self, actions): self._check_actions(actions) - obs, rews, resets, infos = [], [], [], [] + observations, rewards, terminated_list, truncated_list, infos = [], [], [], [], [] for env, action in zip(self._envs, actions): - ob, rew, done, info = env.step(action) - if done: - ob = env.reset() - obs.append(ob) - rews.append(rew) - resets.append(done) + obs, rew, terminated, truncated, info = env.step(action) + if terminated or truncated: + obs, info = env.reset() + observations.append(obs) + rewards.append(rew) + terminated_list.append(terminated) + truncated_list.append(truncated) infos.append(info) - return np.stack(obs), np.stack(rews), np.stack(resets), infos + return ( + np.stack(observations), + np.stack(rewards), + np.stack(terminated_list), + 
np.stack(truncated_list), + infos, + ) - def reset(self): - return np.stack([env.reset() for env in self.envs]) + def reset(self, **kwargs): + observations, infos = [], [] + for env in self.envs: + obs, info = env.reset(**kwargs) + observations.append(obs) + infos.append(info) + return np.stack(observations), infos class SingleEnvBatch(Wrapper, EnvBatch): def __init__(self, env): - super(SingleEnvBatch, self).__init__(env) + super().__init__(env) self.observation_space = SpaceBatch([self.env.observation_space]) self.action_space = SpaceBatch([self.env.action_space]) @@ -102,37 +122,38 @@ def envs(self): def step(self, actions): self._check_actions(actions) - ob, rew, done, info = self.env.step(actions[0]) - if done: - ob = self.env.reset() + obs, rew, terminated, truncated, info = self.env.step(actions[0]) + if terminated or truncated: + obs, info = self.env.reset() return ( - ob[None], + obs[None], np.expand_dims(rew, 0), - np.expand_dims(done, 0), + np.expand_dims(terminated, 0), + np.expand_dims(truncated, 0), [info], ) - def reset(self): - return self.env.reset()[None] + def reset(self, **kwargs): + obs, info = self.env.reset(**kwargs) + return obs[None], info -def worker(parent_connection, worker_connection, make_env_function, - send_spaces=True): +def worker(parent_connection, worker_connection, make_env_function, send_spaces=True): # Adapted from SubprocVecEnv github.com/openai/baselines parent_connection.close() env = make_env_function() if send_spaces: worker_connection.send((env.observation_space, env.action_space)) while True: - cmd, action = worker_connection.recv() + cmd, data = worker_connection.recv() if cmd == "step": - ob, rew, done, info = env.step(action) - if done: - ob = env.reset() - worker_connection.send((ob, rew, done, info)) + obs, rew, terminated, truncated, info = env.step(data) + if terminated or truncated: + obs, info = env.reset() + worker_connection.send((obs, rew, terminated, truncated, info)) elif cmd == "reset": - ob = env.reset() 
- worker_connection.send(ob) + obs, info = env.reset(seed=data) + worker_connection.send((obs, info)) elif cmd == "close": env.close() worker_connection.close() @@ -146,22 +167,26 @@ class ParallelEnvBatch(EnvBatch): An abstract batch of environments. """ - def __init__(self, make_env, nenvs=None): + def __init__(self, make_env, nenvs=None, seeds=None): make_env_functions = self._get_make_env_functions(make_env, nenvs) self._nenvs = len(make_env_functions) - self._parent_connections, self._worker_connections = zip(*[ - Pipe() for _ in range(self._nenvs) - ]) + self._parent_connections, self._worker_connections = zip( + *[Pipe() for _ in range(self._nenvs)] + ) + self._seeds = seeds or list(range(self._envs)) self._processes = [ Process( target=worker, args=(parent_connection, worker_connection, make_env), - daemon=True + daemon=True, + ) + for i, (parent_connection, worker_connection, make_env) in enumerate( + zip( + self._parent_connections, + self._worker_connections, + make_env_functions, + ) ) - for i, (parent_connection, worker_connection, make_env) - in enumerate(zip(self._parent_connections, - self._worker_connections, - make_env_functions)) ] for p in self._processes: p.start() @@ -187,13 +212,23 @@ def step(self, actions): for conn, a in zip(self._parent_connections, actions): conn.send(("step", a)) results = [conn.recv() for conn in self._parent_connections] - obs, rews, dones, infos = zip(*results) - return np.stack(obs), np.stack(rews), np.stack(dones), infos + obs, rews, terminated, truncated, infos = zip(*results) + return ( + np.stack(obs), + np.stack(rews), + np.stack(terminated), + np.stack(truncated), + infos, + ) - def reset(self): - for conn in self._parent_connections: - conn.send(("reset", None)) - return np.stack([conn.recv() for conn in self._parent_connections]) + def reset(self, **kwargs): + for env_idx, conn in enumerate(self._parent_connections): + conn.send(("reset", self._seeds[env_idx])) + + results = [remote.recv() for remote in 
self._parent_connections] + observations, infos = zip(*results) + + return np.stack(observations), infos def close(self): if self._closed: diff --git a/week06_policy_based/reinforce_lasagne.ipynb b/week06_policy_based/reinforce_lasagne.ipynb deleted file mode 100644 index 3dcb9cefe..000000000 --- a/week06_policy_based/reinforce_lasagne.ipynb +++ /dev/null @@ -1,451 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# REINFORCE in lasagne\n", - "\n", - "Just like we did before for q-learning, this time we'll design a lasagne network to learn `CartPole-v0` via policy gradient (REINFORCE).\n", - "\n", - "Most of the code in this notebook is taken from approximate qlearning, so you'll find it more or less familiar and even simpler." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__Frameworks__ - we'll accept this homework in any deep learning framework. For example, it translates to TensorFlow almost line-to-line. However, we recommend you to stick to theano/lasagne unless you're certain about your skills in the framework of your choice." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "env: THEANO_FLAGS='floatX=float32'\n" - ] - } - ], - "source": [ - "%env THEANO_FLAGS = 'floatX=float32'\n", - "import os\n", - "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", - " !bash ../xvfb start\n", - " os.environ['DISPLAY'] = ':1'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[2017-03-14 19:35:59,320] Making new env: CartPole-v0\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAEACAYAAACwB81wAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFDZJREFUeJzt3X+MXeWd3/H3B2yLBLrxWqTG2G7tbEAb70ZrU2Gqpgm3\nESGm3UIaVcBus0JdukFim6BU7caOqjLpSpRESjZSKyJoyMqbLk6sZBPBNiQ2lNuQSoEk2Alk7MWW\nmC1DsUkJSaFpdm349o97bC5j47kzc2fGvuf9kkZ+7nPOuef5SuPPPPf8uCdVhSRp9J212AOQJC0M\nA1+SWsLAl6SWMPAlqSUMfElqCQNfklpiXgI/yZYk+5McSPLR+diHJGlmMuzr8JOcDfwFcAXwDPBd\n4Leqat9QdyRJmpH5mOFvBg5W1URVHQG+CFwzD/uRJM3AfAT+auDpvteTTZ8kaRHNR+D7XQ2SdBpa\nMg/v+Qywtu/1Wnqz/OOS+EdBkmahqjLbbedjhv894KIk65IsA64D7p26UlWN7M+tt9666GOwPutr\nY32jXFvV3OfJQ5/hV9XRJP8S+CZwNnB3eYWOJC26+TikQ1XdD9w/H+8tSZod77SdB51OZ7GHMK+s\n78w2yvWNcm3DMPQbrwbaaVKLsV9JOpMloU6zk7aSpNOQgS9JLWHgS1JLGPiS1BIGviS1hIEvSS1h\n4EtSSxj4ktQSBr4ktYSBL0ktYeBLUksY+JLUEga+JLWEgS9JLWHgS1JLGPiS1BIGviS1xJyeaZtk\nAvg/wMvAkaranGQF8CXgbwMTwLVV9dM5jlOSNEdzneEX0KmqTVW1uenbCuyuqouBB5vXkqRFNoxD\nOlOfr3g1sL1pbwfeN4R9SJLmaBgz/AeSfC/J7zV9K6vqcNM+DKyc4z4kSUMwp2P4wDuq6tkkbwZ2\nJ9nfv7CqKknNcR+SpCGYU+BX1bPNvz9O8lVgM3A4yQVVdSjJKuC5k207NjZ2vN3pdOh0OnMZiiSN\nnG63S7fbHdr7pWp2E/AkbwTOrqoXk5wL7AI+DlwBPF9Vn0iyFVheVVunbFuz3a8ktVUSqmrqedPB\nt59D4K8Hvtq8XAL8aVX9h+ayzJ3A3+J1Lss08CVp5hYt8OfCwJekmZtr4HunrSS1hIEvS
S1h4EtS\nSxj4ktQSBr4ktYSBL0ktYeBLUksY+JLUEga+JLWEgS9JLWHgS1JLGPiS1BIGviS1hIEvSS1h4EtS\nSxj4ktQSBr4ktYSBL0ktYeBLUktMG/hJPp/kcJLH+/pWJNmd5Mkku5Is71u2LcmBJPuTXDlfA5ck\nzcwgM/w/BrZM6dsK7K6qi4EHm9ck2QBcB2xotrkjiZ8iJOk0MG0YV9XDwAtTuq8Gtjft7cD7mvY1\nwI6qOlJVE8BBYPNwhipJmovZzr5XVtXhpn0YWNm0LwQm+9abBFbPch+SpCGa8+GWqiqgTrXKXPch\nSZq7JbPc7nCSC6rqUJJVwHNN/zPA2r711jR9JxgbGzve7nQ6dDqdWQ5FkkZTt9ul2+0O7f3Sm6BP\ns1KyDrivqt7evP4k8HxVfSLJVmB5VW1tTtreQ++4/WrgAeCtNWUnSaZ2SZKmkYSqymy3n3aGn2QH\ncDlwfpKngX8H3A7sTHIjMAFcC1BV40l2AuPAUeBmk12STg8DzfCHvlNn+JI0Y3Od4XuNvCS1hIEv\nSS1h4EtSSxj4ktQSBr4ktYSBL0ktYeBLUksY+JLUEga+JLWEgS9JLWHgS1JLGPiS1BIGviS1hIEv\nSS1h4EtSSxj4ktQSBr4ktYSBL0ktYeBLUktMG/hJPp/kcJLH+/rGkkwm2dP8XNW3bFuSA0n2J7ly\nvgYuSZqZaR9inuSdwEvAn1TV25u+W4EXq+rTU9bdANwDXAqsBh4ALq6qV6as50PMJWmG5v0h5lX1\nMPDCyfZ9kr5rgB1VdaSqJoCDwObZDk6SNDxzOYb/oSQ/SHJ3kuVN34XAZN86k/Rm+pKkRTbbwP8s\nsB7YCDwLfOoU63rsRpJOA0tms1FVPXesneRzwH3Ny2eAtX2rrmn6TjA2Nna83el06HQ6sxmKJI2s\nbrdLt9sd2vtNe9IWIMk64L6+k7arqurZpv0R4NKq+u2+k7abefWk7VunnqH1pK0kzdxcT9pOO8NP\nsgO4HDg/ydPArUAnyUZ6h2ueAm4CqKrxJDuBceAocLPJLkmnh4Fm+EPfqTN8SZqxeb8sU5I0Ggx8\nSWoJA1+SWsLAl6SWMPAlqSUMfElqCQNfklrCwJemeOGpxxj/yh9SLx9d7KFIQ+WNV9IU/3vfw/zl\nw//lhP6/88E7F2E00qu88UqSNBADX5JawsCXpJYw8CWpJQx8SWoJA1+SWsLAl6SWMPAlqSUMfGmK\nk910de7ffMsijEQaLgNfGsCyc5cv9hCkOZs28JOsTfJQkh8leSLJh5v+FUl2J3kyya4ky/u22Zbk\nQJL9Sa6czwIkSYMZZIZ/BPhIVf0a8HeB30/yNmArsLuqLgYebF6TZANwHbAB2ALckcRPEpK0yKYN\n4qo6VFV7m/ZLwD5gNXA1sL1ZbTvwvqZ9DbCjqo5U1QRwENg85HFLkmZoRjPvJOuATcAjwMqqOtws\nOgysbNoXApN9m03S+wMhSVpESwZdMcl5wFeAW6rqxeTVb+isqkpyqu87PmHZ2NjY8Xan06HT6Qw6\nFElqhW63S7fbHdr7DfR9+EmWAn8O3F9Vn2n69gOdqjqUZBXwUFX9apKtAFV1e7PeN4Bbq+qRvvfz\n+/B12vr+XTed0PfL6y/hLe85sV9aSPP+ffjpTeXvBsaPhX3jXuCGpn0D8LW+/uuTLEuyHrgIeHS2\nA5QkDccgh3TeAXwA+GGSPU3fNuB2YGeSG4EJ4FqAqhpPshMYB44CNzudl6TFN23gV9W3ef1PAle8\nzja3AbfNYVySpCHz+nhJagkDX5JawsCXpJYw8KU+f/mtL5y030syNQoMfElqCQNfklrCwJekljDw\nJaklDHxJagkDX5JawsCXpJYw8CWpJQx8SWoJA1+SWsLAl6SWMPAlqSUMfKnPTw5+94S+CzZdtQgj\nkYbPwJf6vHL0r07oW7LsDYswEmn4DHxpGj6QWaNi2
sBPsjbJQ0l+lOSJJB9u+seSTCbZ0/xc1bfN\ntiQHkuxPcuV8FiDNtyz2AKQhmfYh5sAR4CNVtTfJecD3k+ymN/H5dFV9un/lJBuA64ANwGrggSQX\nV9UrQx67tCCc4WtUTDvDr6pDVbW3ab8E7KMX5HDyyc81wI6qOlJVE8BBYPNwhistPGf4GhUzOoaf\nZB2wCfhO0/WhJD9IcneS5U3fhcBk32aTvPoHQjrjOMPXqBjkkA4AzeGcLwO3VNVLST4L/Ptm8R8C\nnwJufJ3NT/g/MzY2drzd6XTodDqDDkWSWqHb7dLtdof2fqmafv6SZCnw58D9VfWZkyxfB9xXVW9P\nshWgqm5vln0DuLWqHulbvwbZr7TQvn/XiQ8rX3PZ+1n5G+9dhNFIr5WEqpr1UcZBrtIJcDcw3h/2\nSVb1rfZPgMeb9r3A9UmWJVkPXAQ8OtsBSovNqYlGxSCHdN4BfAD4YZI9Td/HgN9KspHe/4engJsA\nqmo8yU5gHDgK3Ox0XmcyT9pqVEwb+FX1bU7+SeD+U2xzG3DbHMYlnTacrWhUeKetNA1n+BoVBr7U\nOPr/Xjxp/1lLz1ngkUjzw8CXGk/+1z86af+bN1y+wCOR5oeBL0ktYeBLUksY+JLUEga+JLWEgS9J\nLWHgS1JLGPiS1BIGviS1hIEvSS1h4EtSSxj4ktQSAz3xaug79YlXWgATExPs2bNn+hUbq174Hyw9\neuIXqP3PN28ZaPt169axadOmgfcnzdRcn3g18DNtpTPNrl27uOmmEx9ZeDLnLFvCt//j757Q/9OX\nfsH7b3r/QO/xwQ9+kDvvvHNGY5QWkoEv9XnguX92vP3uN3+RP7hz9yKORhouj+FLjf/+43/KL14+\n9/jP1w/dyEtHfnmxhyUNjYEvAS/XEl48uuKE/r8uH36i0XHKwE9yTpJHkuxN8kSSsaZ/RZLdSZ5M\nsivJ8r5ttiU5kGR/kivnefzSUITi7Bxd7GFI8+qUgV9VvwD+QVVtBDYCW5JcBmwFdlfVxcCDzWuS\nbACuAzYAW4A7kvgpQqe9s/Iy5y154YS+FcueXaQRScM37Unbqvp501wGLAUKuBo49ty37UCXXuhf\nA+yoqiPARJKDwGbgO8MdtjRcf3XkZe747Ad48egK3r3pLfyLf3QJv7T0J3xtsQcmDdG0gd/M0B8D\nfgX4T1X1aJKVVXW4WeUwsLJpX8hrw30SWH2y9x30cjlptvbt2zfwulXFgcnngee55/4D3HP/N2e8\nv29961v+Xuu0NsgM/xVgY5I3AV9N8utTlleSU91FddJlq1atOt7udDp0Op2BBiwN6q677uLhhx9e\nsP29613v8jp8DVW326Xb7Q7t/Qa+Dr+qfpbkIeC9wOEkF1TVoSSrgOea1Z4B1vZttqbpO8HY2Njs\nRixJLTF1Mvzxj398Tu833VU65x+7AifJG4D3APuAe4EbmtVugOOHOu8Frk+yLMl64CLg0TmNUJI0\nFNPN8FcB25OcTe+Pw5eq6utJvgPsTHIjMAFcC1BV40l2AuPAUeBmvzRHkk4Ppwz8qnocuOQk/T8B\nrnidbW4DbhvK6CRJQ+M18pLUEga+JLWEgS9JLeEDUDSynnrqKR577LEF29/69eu55JITTnlJQzPX\nB6AY+JJ0hphr4HtIR5JawsCXpJYw8CWpJQx8SWoJA1+SWsLAl6SWMPAlqSUMfElqCQNfklrCwJek\nljDwJaklDHxJagkDX5JaYrqHmJ+T5JEke5M8kWSs6R9LMplkT/NzVd8225IcSLI/yZXzPH5J0oCm\n/XrkJG+sqp8nWQJ8G7gF2AK8WFWfnrLuBuAe4FJgNfAAcHFVvTJlPb8eWZJmaN6/Hrmqft40lwFL\ngWNJfbKdXgPsqKojVTUBHAQ2z3ZwkqThmTbwk5yVZC9wGNhVVY82iz6U5AdJ7k6yvOm7EJjs23yS\n3kxfkrTIBpnhv
1JVG4E1wGVJfg34LLAe2Ag8C3zqVG8xjIFKkuZmyaArVtXPkjwEbKmq4wGf5HPA\nfc3LZ4C1fZutafpOMDY2drzd6XTodDoDD1qS2qDb7dLtdof2fqc8aZvkfOBoVf00yRuAbwK3A49V\n1aFmnY8Al1bVb/edtN3Mqydt3zr1DK0nbSVp5uZ60na6Gf4qYHuSs+kd/vlSVX09yZ8k2UjvcM1T\nwE0AVTWeZCcwDhwFbjbZJen0MO1lmfOyU2f4kjRj835ZpiRpNBj4ktQSBr4ktYSBL0ktYeBLUksY\n+JLUEga+JLWEgS9JLWHgS1JLGPiS1BIGviS1hIEvSS1h4EtSSxj4ktQSBr4ktYSBL0ktYeBLUksY\n+JLUEga+JLXEQIGf5Owke5Lc17xekWR3kieT7EqyvG/dbUkOJNmf5Mr5GrgkaWYGneHfAowDx548\nvhXYXVUXAw82r0myAbgO2ABsAe5I0rpPEd1ud7GHMK+s78w2yvWNcm3DMG0YJ1kD/EPgc8Cxp6Vf\nDWxv2tuB9zXta4AdVXWkqiaAg8DmYQ74TDDqv3TWd2Yb5fpGubZhGGT2/UfAvwFe6etbWVWHm/Zh\nYGXTvhCY7FtvElg910FKkubulIGf5DeB56pqD6/O7l+jqopXD/WcdJXZD0+SNCzp5fXrLExuA34H\nOAqcA/wS8GfApUCnqg4lWQU8VFW/mmQrQFXd3mz/DeDWqnpkyvv6R0CSZqGqTjr5HsQpA/81KyaX\nA/+6qv5xkk8Cz1fVJ5qQX15VW5uTtvfQO26/GngAeGsNuhNJ0rxZMsP1jwX37cDOJDcCE8C1AFU1\nnmQnvSt6jgI3G/aSdHoYeIYvSTqzLfg18km2NDdlHUjy0YXe/zAk+XySw0ke7+sbiZvRkqxN8lCS\nHyV5IsmHm/5Rqe+cJI8k2dvUN9b0j0R9x4zyzZJJJpL8sKnv0aZvJOpLsjzJl5PsSzKe5LKh1lZV\nC/YDnE3v2vx1wFJgL/C2hRzDkOp4J7AJeLyv75PAHzTtjwK3N+0NTZ1Lm7oPAmctdg2nqO0CYGPT\nPg/4C+Bto1JfM+Y3Nv8uAb4DXDZK9TXj/lfAnwL3jtLvZzPmp4AVU/pGoj569zX9bt/v55uGWdtC\nz/A3AweraqKqjgBfpHez1hmlqh4GXpjSPRI3o1XVoara27RfAvbROwE/EvUBVNXPm+Yyev9ZihGq\nryU3S069UuWMry/Jm4B3VtXnAarqaFX9jCHWttCBvxp4uu/1KN2YNXI3oyVZR++TzCOMUH1Jzkqy\nl14du6rqUUaoPkb/ZskCHkjyvSS/1/SNQn3rgR8n+eMkjyX5z0nOZYi1LXTgt+IMcfU+b53RN6Ml\nOQ/4CnBLVb3Yv+xMr6+qXqmqjcAa4LIkvz5l+RlbX0tulnxHVW0CrgJ+P8k7+xeewfUtAS4B7qiq\nS4D/S/M9ZcfMtbaFDvxngLV9r9fy2r9QZ7LDSS4AaG5Ge67pn1rzmqbvtJVkKb2w/0JVfa3pHpn6\njmk+Lj8EvJfRqe/vAVcneQrYAbw7yRcYnfqoqmebf38MfJXeYYxRqG8SmKyq7zavv0zvD8ChYdW2\n0IH/PeCiJOuSLKP3zZr3LvAY5su9wA1N+wbga3391ydZlmQ9cBHw6CKMbyBJAtwNjFfVZ/oWjUp9\n5x+7yiHJG4D30DtPMRL1VdXHqmptVa0Hrgf+W1X9DiNSX5I3JvkbTftc4ErgcUagvqo6BDyd5OKm\n6wrgR8B9DKu2RTgLfRW9Kz8OAtsW+6z4LGvYAfwv4K/pnZP458AKencWPwnsonf38bH1P9bUux94\n72KPf5ra/j69Y797gT3Nz5YRqu/twGPAD+gFxb9t+keivim1Xs6rV+mMRH30jnPvbX6eOJYhI1Tf\nbwDfbX4//4zeVTpDq80brySpJVr3cBJJaisDX5JawsCXpJYw8CWpJQx8SWoJA1+
SWsLAl6SWMPAl\nqSX+P7qsoBUsM5jGAAAAAElFTkSuQmCC\n", - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import gym\n", - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "\n", - "env = gym.make(\"CartPole-v0\").env\n", - "env.reset()\n", - "n_actions = env.action_space.n\n", - "state_dim = env.observation_space.shape\n", - "\n", - "plt.imshow(env.render(\"rgb_array\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Building the network for REINFORCE" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For REINFORCE algorithm, we'll need a model that predicts action probabilities given states." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import theano\n", - "import theano.tensor as T\n", - "\n", - "# create input variables. We'll support multiple states at once\n", - "\n", - "states = T.matrix(\"states[batch,units]\")\n", - "actions = T.ivector(\"action_ids[batch]\")\n", - "cumulative_rewards = T.vector(\"G[batch] = r + gamma*r' + gamma^2*r'' + ...\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import lasagne\n", - "from lasagne.layers import *\n", - "\n", - "# input layer\n", - "l_states = InputLayer((None,)+state_dim, input_var=states)\n", - "\n", - "\n", - "\n", - "\n", - "# output layer\n", - "# this time we need to predict action probabilities,\n", - "# so make sure your nonlinearity forces p>0 and sum_p = 1\n", - "l_action_probas = DenseLayer( ,\n", - " num_units= ,\n", - " nonlinearity= )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Predict function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get probabilities of actions\n", - 
"predicted_probas = get_output(l_action_probas)\n", - "\n", - "# predict action probability given state\n", - "# if you use float32, set allow_input_downcast=True\n", - "predict_proba = " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Loss function and updates\n", - "\n", - "We now need to define objective and update over policy gradient.\n", - "\n", - "Our objective function is\n", - "\n", - "$$ J \\approx { 1 \\over N } \\sum_{s_i,a_i} G(s_i,a_i) $$\n", - "\n", - "\n", - "Following the REINFORCE algorithm, we can define our objective as follows: \n", - "\n", - "$$ \\hat J \\approx { 1 \\over N } \\sum_{s_i,a_i} \\log \\pi_\\theta (a_i \\mid s_i) \\cdot G(s_i,a_i) $$\n", - "\n", - "When you compute gradient of that function over network weights $ \\theta $, it will become exactly the policy gradient." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# select probabilities for chosen actions, pi(a_i|s_i)\n", - "predicted_probas_for_actions = predicted_probas[T.arange(\n", - " actions.shape[0]), actions]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# REINFORCE objective function\n", - "J = # " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# all network weights\n", - "all_weights = \n", - "\n", - "# weight updates. 
maximize J = minimize -J\n", - "updates = lasagne.updates.sgd(-J, all_weights, learning_rate=0.01)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_step = theano.function([states, actions, cumulative_rewards], updates=updates,\n", - " allow_input_downcast=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Computing cumulative rewards" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "\n", - "def get_cumulative_rewards(rewards, # rewards at each step\n", - " gamma=0.99 # discount for reward\n", - " ):\n", - " \"\"\"\n", - " take a list of immediate rewards r(s,a) for the whole session \n", - " compute cumulative returns (a.k.a. G(s,a) in Sutton '16)\n", - " G_t = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...\n", - "\n", - " The simple way to compute cumulative rewards is to iterate from last to first time tick\n", - " and compute G_t = r_t + gamma*G_{t+1} recurrently\n", - "\n", - " You must return an array/list of cumulative rewards with as many elements as in the initial rewards.\n", - " \"\"\"\n", - "\n", - " \n", - "\n", - " return " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "assert len(get_cumulative_rewards(range(100))) == 100\n", - "assert np.allclose(get_cumulative_rewards([0, 0, 1, 0, 0, 1, 0], gamma=0.9), [\n", - " 1.40049, 1.5561, 1.729, 0.81, 0.9, 1.0, 0.0])\n", - "assert np.allclose(get_cumulative_rewards(\n", - " [0, 0, 1, -2, 3, -4, 0], gamma=0.5), [0.0625, 0.125, 0.25, -1.5, 1.0, -4.0, 0.0])\n", - "assert np.allclose(get_cumulative_rewards(\n", - " [0, 0, 1, 2, 3, 4, 0], gamma=0), [0, 0, 1, 2, 3, 4, 0])\n", - "print(\"looks good!\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Playing the game" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": 
{}, - "outputs": [], - "source": [ - "def generate_session(t_max=1000):\n", - " \"\"\"play env with REINFORCE agent and train at the session end\"\"\"\n", - "\n", - " # arrays to record session\n", - " states, actions, rewards = [], [], []\n", - "\n", - " s = env.reset()\n", - "\n", - " for t in range(t_max):\n", - "\n", - " # action probabilities array aka pi(a|s)\n", - " action_probas = predict_proba([s])[0]\n", - "\n", - " a = \n", - "\n", - " new_s, r, done, info = env.step(a)\n", - "\n", - " # record session history to train later\n", - " states.append(s)\n", - " actions.append(a)\n", - " rewards.append(r)\n", - "\n", - " s = new_s\n", - " if done:\n", - " break\n", - "\n", - " cumulative_rewards = get_cumulative_rewards(rewards)\n", - " train_step(states, actions, cumulative_rewards)\n", - "\n", - " return sum(rewards)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mean reward:20.900\n", - "mean reward:35.860\n", - "mean reward:50.820\n", - "mean reward:88.550\n", - "mean reward:132.080\n", - "mean reward:165.890\n", - "mean reward:193.790\n", - "mean reward:166.510\n", - "mean reward:120.910\n", - "mean reward:98.450\n", - "mean reward:236.340\n", - "mean reward:280.410\n", - "mean reward:317.610\n", - "You Win!\n" - ] - } - ], - "source": [ - "for i in range(100):\n", - "\n", - " rewards = [generate_session() for _ in range(100)] # generate new sessions\n", - "\n", - " print(\"mean reward:%.3f\" % (np.mean(rewards)))\n", - "\n", - " if np.mean(rewards) > 300:\n", - " print(\"You Win!\")\n", - " break" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Video" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[2017-03-14 19:36:45,862] Making new env: CartPole-v0\n", - "[2017-03-14 19:36:45,870] DEPRECATION 
WARNING: env.spec.timestep_limit has been deprecated. Replace your call to `env.spec.timestep_limit` with `env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')`. This change was made 12/28/2016 and is included in version 0.7.0\n", - "[2017-03-14 19:36:45,873] Clearing 12 monitor files from previous run (because force=True was provided)\n", - "[2017-03-14 19:36:45,894] Starting new video recorder writing to /home/jheuristic/Downloads/Practical_RL/week6/videos/openaigym.video.0.7776.video000000.mp4\n", - "[2017-03-14 19:36:51,516] Starting new video recorder writing to /home/jheuristic/Downloads/Practical_RL/week6/videos/openaigym.video.0.7776.video000001.mp4\n", - "[2017-03-14 19:36:57,580] Starting new video recorder writing to /home/jheuristic/Downloads/Practical_RL/week6/videos/openaigym.video.0.7776.video000008.mp4\n", - "[2017-03-14 19:37:05,049] Starting new video recorder writing to /home/jheuristic/Downloads/Practical_RL/week6/videos/openaigym.video.0.7776.video000027.mp4\n", - "[2017-03-14 19:37:08,785] Starting new video recorder writing to /home/jheuristic/Downloads/Practical_RL/week6/videos/openaigym.video.0.7776.video000064.mp4\n", - "[2017-03-14 19:37:11,505] Finished writing results. 
You can upload them to the scoreboard via gym.upload('/home/jheuristic/Downloads/Practical_RL/week6/videos')\n" - ] - } - ], - "source": [ - "# record sessions\n", - "import gym.wrappers\n", - "env = gym.wrappers.Monitor(gym.make(\"CartPole-v0\"),\n", - " directory=\"videos\", force=True)\n", - "sessions = [generate_session() for _ in range(100)]\n", - "env.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# show video\n", - "from IPython.display import HTML\n", - "import os\n", - "\n", - "video_names = list(\n", - " filter(lambda s: s.endswith(\".mp4\"), os.listdir(\"./videos/\")))\n", - "\n", - "HTML(\"\"\"\n", - "\n", - "\"\"\".format(\"./videos/\" + video_names[-1])) # this may or may not be _last_ video. Try other indices" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/week06_policy_based/reinforce_pytorch.ipynb b/week06_policy_based/reinforce_pytorch.ipynb index 5f2989998..0130118d5 100644 --- a/week06_policy_based/reinforce_pytorch.ipynb +++ b/week06_policy_based/reinforce_pytorch.ipynb @@ -19,7 +19,11 @@ "source": [ "import sys, os\n", "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", + " !pip install -q gymnasium\n", + " !pip install moviepy\n", + " !apt install ffmpeg\n", + " !pip install imageio-ffmpeg\n", " !touch .setup_complete\n", 
"\n", "# This code creates a virtual display to draw game images on.\n", @@ -35,12 +39,22 @@ "metadata": {}, "outputs": [], "source": [ - "import gym\n", + "import gymnasium as gym\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# also you need to install ffmpeg if not installed\n", + "# for MacOS: ! brew install ffmpeg" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -54,7 +68,7 @@ "metadata": {}, "outputs": [], "source": [ - "env = gym.make(\"CartPole-v0\")\n", + "env = gym.make(\"CartPole-v1\", render_mode=\"rgb_array\")\n", "\n", "# gym compatibility: unwrap TimeLimit\n", "if hasattr(env, '_max_episode_steps'):\n", @@ -64,7 +78,7 @@ "n_actions = env.action_space.n\n", "state_dim = env.observation_space.shape\n", "\n", - "plt.imshow(env.render(\"rgb_array\"))" + "plt.imshow(env.render())" ] }, { @@ -91,7 +105,8 @@ "outputs": [], "source": [ "import torch\n", - "import torch.nn as nn" + "import torch.nn as nn\n", + "import torch.nn.functional as F" ] }, { @@ -155,7 +170,7 @@ "metadata": {}, "outputs": [], "source": [ - "test_states = np.array([env.reset() for _ in range(5)])\n", + "test_states = np.array([env.reset()[0] for _ in range(5)])\n", "test_probas = predict_probs(test_states)\n", "assert isinstance(test_probas, np.ndarray), \\\n", " \"you must return np array and not %s\" % type(test_probas)\n", @@ -180,13 +195,14 @@ "outputs": [], "source": [ "def generate_session(env, t_max=1000):\n", - " \"\"\" \n", + " \"\"\"\n", " Play a full session with REINFORCE agent.\n", " Returns sequences of states, actions, and rewards.\n", " \"\"\"\n", " # arrays to record session\n", " states, actions, rewards = [], [], []\n", - " s = env.reset()\n", + "\n", + " s = env.reset()[0]\n", "\n", " for t in range(t_max):\n", " # action probabilities array aka pi(a|s)\n", @@ -194,7 +210,8 @@ "\n", " # Sample action with given 
probabilities.\n", " a = \n", - " new_s, r, done, info = env.step(a)\n", + "\n", + " new_s, r, terminated, truncated, info = env.step(a)\n", "\n", " # record session history to train later\n", " states.append(s)\n", @@ -202,7 +219,7 @@ " rewards.append(r)\n", "\n", " s = new_s\n", - " if done:\n", + " if terminated or truncated:\n", " break\n", "\n", " return states, actions, rewards" @@ -300,20 +317,6 @@ "When you compute the gradient of that function with respect to network weights $\\theta$, it will become exactly the policy gradient." ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def to_one_hot(y_tensor, ndims):\n", - " \"\"\" helper: take an integer vector and convert it to 1-hot matrix. \"\"\"\n", - " y_tensor = y_tensor.type(torch.LongTensor).view(-1, 1)\n", - " y_one_hot = torch.zeros(\n", - " y_tensor.size()[0], ndims).scatter_(1, y_tensor, 1)\n", - " return y_one_hot" - ] - }, { "cell_type": "code", "execution_count": null, @@ -333,7 +336,7 @@ "\n", " # cast everything into torch tensors\n", " states = torch.tensor(states, dtype=torch.float32)\n", - " actions = torch.tensor(actions, dtype=torch.int32)\n", + " actions = torch.tensor(actions, dtype=torch.int64)\n", " cumulative_returns = np.array(get_cumulative_rewards(rewards, gamma))\n", " cumulative_returns = torch.tensor(cumulative_returns, dtype=torch.float32)\n", "\n", @@ -347,7 +350,7 @@ "\n", " # select log-probabilities for chosen actions, log pi(a_i|s_i)\n", " log_probs_for_actions = torch.sum(\n", - " log_probs * to_one_hot(actions, env.action_space.n), dim=1)\n", + " log_probs * F.one_hot(actions, env.action_space.n), dim=1)\n", " \n", " # Compute loss here. 
Don't forgen entropy regularization with `entropy_coef` \n", " entropy = \n", @@ -376,7 +379,7 @@ "for i in range(100):\n", " rewards = [train_on_session(*generate_session(env)) for _ in range(100)] # generate new sessions\n", " \n", - " print(\"mean reward:%.3f\" % (np.mean(rewards)))\n", + " print(\"mean reward: %.3f\" % (np.mean(rewards)))\n", " \n", " if np.mean(rewards) > 500:\n", " print(\"You Win!\") # but you can train even further\n", @@ -398,10 +401,12 @@ "source": [ "# Record sessions\n", "\n", - "import gym.wrappers\n", + "from gymnasium.wrappers import RecordVideo\n", "\n", - "with gym.wrappers.Monitor(gym.make(\"CartPole-v0\"), directory=\"videos\", force=True) as env_monitor:\n", - " sessions = [generate_session(env_monitor) for _ in range(100)]" + "with gym.make(\"CartPole-v1\", render_mode=\"rgb_array\") as env, RecordVideo(\n", + " env=env, video_folder=\"./videos\"\n", + ") as env_monitor:\n", + " sessions = [generate_session(env_monitor) for _ in range(10)]\n" ] }, { @@ -414,22 +419,45 @@ "# work for you, you can download the videos and view them locally.\n", "\n", "from pathlib import Path\n", + "from base64 import b64encode\n", "from IPython.display import HTML\n", "\n", - "video_names = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", + "video_paths = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", + "video_path = video_paths[-1] # You can also try other indices\n", + "\n", + "if 'google.colab' in sys.modules:\n", + " # https://stackoverflow.com/a/57378660/1214547\n", + " with video_path.open('rb') as fp:\n", + " mp4 = fp.read()\n", + " data_url = 'data:video/mp4;base64,' + b64encode(mp4).decode()\n", + "else:\n", + " data_url = str(video_path)\n", "\n", "HTML(\"\"\"\n", "\n", - "\"\"\".format(video_names[-1])) # You can also try other indices" + "\"\"\".format(data_url))" ] } ], "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" 
+ }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "pygments_lexer": "ipython3" + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" } }, "nbformat": 4, diff --git a/week06_policy_based/reinforce_tensorflow.ipynb b/week06_policy_based/reinforce_tensorflow.ipynb index 077350874..644340cb4 100644 --- a/week06_policy_based/reinforce_tensorflow.ipynb +++ b/week06_policy_based/reinforce_tensorflow.ipynb @@ -22,7 +22,8 @@ " %tensorflow_version 1.x\n", " \n", " if not os.path.exists('.setup_complete'):\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", + " !pip install -q gym[classic_control]==0.18.0\n", " !touch .setup_complete\n", "\n", "# This code creates a virtual display to draw game images on.\n", @@ -425,15 +426,25 @@ "# work for you, you can download the videos and view them locally.\n", "\n", "from pathlib import Path\n", + "from base64 import b64encode\n", "from IPython.display import HTML\n", "\n", - "video_names = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", + "video_paths = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", + "video_path = video_paths[-1] # You can also try other indices\n", + "\n", + "if 'google.colab' in sys.modules:\n", + " # https://stackoverflow.com/a/57378660/1214547\n", + " with video_path.open('rb') as fp:\n", + " mp4 = fp.read()\n", + " data_url = 'data:video/mp4;base64,' + b64encode(mp4).decode()\n", + "else:\n", + " data_url = str(video_path)\n", "\n", "HTML(\"\"\"\n", "\n", - "\"\"\".format(video_names[-1])) # You can also try other indices" + "\"\"\".format(data_url))" ] } ], diff --git a/week06_policy_based/runners.py b/week06_policy_based/runners.py 
index 120c42484..2d608aa68 100644 --- a/week06_policy_based/runners.py +++ b/week06_policy_based/runners.py @@ -1,11 +1,10 @@ -""" RL env runner """ from collections import defaultdict import numpy as np class EnvRunner: - """ Reinforcement learning runner in an environment with given policy """ + """Reinforcement learning runner in an environment with given policy""" def __init__(self, env, policy, nsteps, transforms=None, step_var=None): self.env = env @@ -13,20 +12,25 @@ def __init__(self, env, policy, nsteps, transforms=None, step_var=None): self.nsteps = nsteps self.transforms = transforms or [] self.step_var = step_var if step_var is not None else 0 - self.state = {"latest_observation": self.env.reset()} + self.state = {"latest_observation": self.env.reset()[0]} @property def nenvs(self): - """ Returns number of batched envs or `None` if env is not batched """ + """Returns number of batched envs or `None` if env is not batched""" return getattr(self.env.unwrapped, "nenvs", None) - def reset(self): - """ Resets env and runner states. """ - self.state["latest_observation"] = self.env.reset() + def reset(self, **kwargs): + """Resets env and runner states.""" + self.state["latest_observation"] = self.env.reset(**kwargs)[0] self.policy.reset() + def add_summary(self, name, val): + """Writes logs""" + add_summary = self.env.get_wrapper_attr("add_summary") + add_summary(name, val) + def get_next(self): - """ Runs the agent in the environment. 
""" + """Runs the agent in the environment.""" trajectory = defaultdict(list, {"actions": []}) observations = [] rewards = [] @@ -37,27 +41,29 @@ def get_next(self): observations.append(self.state["latest_observation"]) act = self.policy.act(self.state["latest_observation"]) if "actions" not in act: - raise ValueError("result of policy.act must contain 'actions' " - f"but has keys {list(act.keys())}") + raise ValueError( + "result of policy.act must contain 'actions' " + f"but has keys {list(act.keys())}" + ) for key, val in act.items(): trajectory[key].append(val) - obs, rew, done, _ = self.env.step(trajectory["actions"][-1]) + obs, rew, terminated, truncated, _ = self.env.step( + trajectory["actions"][-1] + ) self.state["latest_observation"] = obs rewards.append(rew) - resets.append(done) + reset = np.logical_or(terminated, truncated) + resets.append(reset) self.step_var += self.nenvs or 1 # Only reset if the env is not batched. Batched envs should # auto-reset. - if not self.nenvs and np.all(done): + if not self.nenvs and np.all(reset): self.state["env_steps"] = i + 1 - self.state["latest_observation"] = self.env.reset() + self.state["latest_observation"] = self.env.reset()[0] - trajectory.update( - observations=observations, - rewards=rewards, - resets=resets) + trajectory.update(observations=observations, rewards=rewards, resets=resets) trajectory["state"] = self.state for transform in self.transforms: diff --git a/week07_[recap]_rnn/README.md b/week07_[recap]_rnn/README.md index 4b187b0f2..2e98665f6 100644 --- a/week07_[recap]_rnn/README.md +++ b/week07_[recap]_rnn/README.md @@ -1,7 +1,7 @@ ## Materials * [Slides](https://yadi.sk/i/-Iqdhg483GDyoN) * CS231 lecture on RNNs - [video](https://www.youtube.com/watch?v=iX5V1WpxxkY) -* Our [lecture](https://yadi.sk/i/XHmT5hO53GcCKV), [seminar(pytorch)](https://yadi.sk/i/nCch5I8S3TsXh5), [seminar(theano)](https://yadi.sk/i/19twHESN3GcGKQ) (both russian) +* Our [lecture](https://yadi.sk/i/XHmT5hO53GcCKV), [seminar 
(PyTorch)](https://yadi.sk/i/nCch5I8S3TsXh5) (in russian) * [alternative] Brief lecture on RNN by nervana - [video](https://www.youtube.com/watch?v=Ukgii7Yd_cU) * [alternative] More detailed lecture by Y. Bengio - [video](https://www.youtube.com/watch?v=xK-bzjIQkmM) * Great reading by Karpathy - [blog post](http://karpathy.github.io/2015/05/21/rnn-effectiveness/) @@ -15,6 +15,4 @@ # Homework description [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/master/week07_%5Brecap%5D_rnn/seminar_pytorch.ipynb) -This week's practice gets you acquainted with basics of recurrent neural networks. For simplicity, we'll train them on character language modelling task. Pick any one of `seminar_lasagne`, `seminar_lasagne_ingraph` or `seminar_tf`. - -As for difference btwn `seminar_lasagne` and `seminar_lasagne_ingraph` - ingraph version shows a lower-level interface to recurrent neural networks. It also requires you to install `pip install https://github.com/yandexdataschool/agentnet/archive/master.zip`. Out-of-graph version cover higher-level syntax from native lasagne. +This week's practice gets you acquainted with basics of recurrent neural networks. For simplicity, we'll train them on character language modelling task. You'll need `seminar_pytorch.ipynb` or `seminar_tf.ipynb`. diff --git a/week07_[recap]_rnn/seminar_lasagne.ipynb b/week07_[recap]_rnn/seminar_lasagne.ipynb deleted file mode 100644 index 5fdbeddcf..000000000 --- a/week07_[recap]_rnn/seminar_lasagne.ipynb +++ /dev/null @@ -1,488 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Generating names with recurrent neural networks (5 points)\n", - "\n", - "This time you'll find yourself delving into the heart (and other intestines) of recurrent neural networks on a class of toy problems.\n", - "\n", - "Struggle to find a name for the variable? 
Let's see how you'll come up with a name for your son/daughter. Surely no human has expertize over what is a good child name, so let us train RNN instead;\n", - "\n", - "It's dangerous to go alone, take these:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import theano\n", - "import theano.tensor as T\n", - "import lasagne\n", - "import os\n", - "#thanks @keskarnitish" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "start_token = \" \"\n", - "\n", - "with open(\"names\") as f:\n", - " names = f.read()[:-1].split('\\n')\n", - " names = [start_token+name for name in names]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print('n samples = ', len(names))\n", - "for x in names[::1000]:\n", - " print(x)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Text processing\n", - "\n", - "First we need next to collect a \"vocabulary\" of all unique tokens i.e. unique characters. We can then encode inputs as a sequence of character ids." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# all unique characters go here\n", - "tokens = \n", - "\n", - "tokens = list(tokens)\n", - "print('n_tokens = ', len(tokens))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Theano is built for numbers, not strings of characters.\n", - "We'll feed our recurrent neural network with ids of characters from our dictionary.\n", - "\n", - "To create such dictionary, let's assign each character with it's index in tokens list." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "token_to_id = its identifier (index in tokens list)>\n", - "\n", - "id_to_token = symbol itself>" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "plt.hist(list(map(len, names)), bins=25)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# truncate names longer than MAX_LEN characters.\n", - "MAX_LEN = ?!\n", - "\n", - "# you will likely need to change this for any dataset different from \"names\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Cast everything from symbols into identifiers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "names_ix = list(map(lambda name: list(map(token_to_id.get, name)), names))\n", - "\n", - "\n", - "# crop long names and pad short ones\n", - "for i in range(len(names_ix)):\n", - " names_ix[i] = names_ix[i][:MAX_LEN] # crop too long\n", - "\n", - " if len(names_ix[i]) < MAX_LEN:\n", - " names_ix[i] += [token_to_id[\" \"]] * \\\n", - " (MAX_LEN - len(names_ix[i])) # pad too short\n", - "\n", - "assert len(set(map(len, names_ix))) == 1\n", - "\n", - "names_ix = np.array(names_ix)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Input variables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "input_sequence = T.matrix('token sequencea', 'int32')\n", - "target_values = T.matrix('actual next token', 'int32')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Build NN\n", - "\n", - "You will be building a model that takes token sequence and predicts next token\n", - "\n", - "\n", - "* iput sequence\n", - "* 
one-hot / embedding\n", - "* recurrent layer(s)\n", - "* otput layer(s) that predict output probabilities\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from lasagne.layers import InputLayer, DenseLayer, EmbeddingLayer\n", - "from lasagne.layers import RecurrentLayer, LSTMLayer, GRULayer, CustomRecurrentLayer" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "l_in = lasagne.layers.InputLayer(shape=(None, None), input_var=input_sequence)\n", - "\n", - "#!\n", - "l_emb = \n", - "\n", - "l_rnn = \n", - "\n", - "# flatten batch and time to be compatible with feedforward layers (will un-flatten later)\n", - "l_rnn_flat = lasagne.layers.reshape(l_rnn, (-1, l_rnn.output_shape[-1]))\n", - "\n", - "l_out = " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Model weights\n", - "weights = lasagne.layers.get_all_params(l_out, trainable=True)\n", - "print(weights)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "network_output = \n", - "# If you use dropout do not forget to create deterministic version for evaluation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "predicted_probabilities_flat = network_output\n", - "correct_answers_flat = target_values.ravel()\n", - "\n", - "\n", - "loss = \n", - "\n", - "updates = " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Compiling it" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# training\n", - "train = theano.function([input_sequence, target_values],\n", - " loss, updates=updates, allow_input_downcast=True)\n", - "\n", - "# computing loss without training\n", - "compute_cost = 
theano.function(\n", - " [input_sequence, target_values], loss, allow_input_downcast=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# generation\n", - "\n", - "Simple: \n", - "* get initial context(seed), \n", - "* predict next token probabilities,\n", - "* sample next token, \n", - "* add it to the context\n", - "* repeat from step 2\n", - "\n", - "You'll get a more detailed info on how it works in the homework section." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# compile the function that computes probabilities for next token given previous text.\n", - "\n", - "# reshape back into original shape\n", - "next_word_probas = network_output.reshape(\n", - " (input_sequence.shape[0], input_sequence.shape[1], len(tokens)))\n", - "# predictions for next tokens (after sequence end)\n", - "last_word_probas = next_word_probas[:, -1]\n", - "probs = theano.function(\n", - " [input_sequence], last_word_probas, allow_input_downcast=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "def generate_sample(seed_phrase=None, N=MAX_LEN, t=1, n_snippets=1):\n", - " '''\n", - " The function generates text given a phrase of length at least SEQ_LENGTH.\n", - "\n", - " parameters:\n", - " sample_fun - max_ or proportional_sample_fun or whatever else you implemented\n", - "\n", - " The phrase is set using the variable seed_phrase\n", - "\n", - " The optional input \"N\" is used to set the number of characters of text to predict. 
\n", - " '''\n", - " if seed_phrase is None:\n", - " seed_phrase = start_token\n", - " if len(seed_phrase) > MAX_LEN:\n", - " seed_phrase = seed_phrase[-MAX_LEN:]\n", - " assert type(seed_phrase) is str\n", - "\n", - " snippets = []\n", - " for _ in range(n_snippets):\n", - " sample_ix = []\n", - " x = list(map(lambda c: token_to_id.get(c, 0), seed_phrase))\n", - " x = np.array([x])\n", - "\n", - " for i in range(N):\n", - " # Pick the character that got assigned the highest probability\n", - " p = probs(x).ravel()\n", - " p = p**t / np.sum(p**t)\n", - " ix = np.random.choice(np.arange(len(tokens)), p=p)\n", - " sample_ix.append(ix)\n", - "\n", - " x = np.hstack((x[-MAX_LEN+1:], [[ix]]))\n", - "\n", - " random_snippet = seed_phrase + \\\n", - " ''.join(id_to_token[ix] for ix in sample_ix)\n", - " snippets.append(random_snippet)\n", - "\n", - " print(\"----\\n %s \\n----\" % '; '.join(snippets))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Model training\n", - "\n", - "Here you can tweak parameters or insert your generation function\n", - "\n", - "\n", - "__Once something word-like starts generating, try increasing seq_length__\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def sample_batch(data, batch_size):\n", - "\n", - " rows = data[np.random.randint(0, len(data), size=batch_size)]\n", - "\n", - " return rows[:, :-1], rows[:, 1:]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "print(\"Training ...\")\n", - "\n", - "\n", - "# total N iterations\n", - "n_epochs = 100\n", - "\n", - "# how many minibatches are there in the epoch\n", - "batches_per_epoch = 500\n", - "\n", - "# how many training sequences are processed in a single function call\n", - "batch_size = 10\n", - "\n", - "\n", - "for epoch in range(n_epochs):\n", - "\n", - " print(\"Generated names\")\n", - " 
generate_sample(n_snippets=10)\n", - "\n", - " avg_cost = 0\n", - "\n", - " for _ in range(batches_per_epoch):\n", - "\n", - " x, y = sample_batch(names_ix, batch_size)\n", - " avg_cost += train(x, y)\n", - "\n", - " print(\"Epoch {} average loss = {}\".format(\n", - " epoch, avg_cost / batches_per_epoch))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generate_sample(n_snippets=100)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generate_sample(seed_phrase=\" A\", n_snippets=10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generate_sample(seed_phrase= , n_snippets=10, t=1.0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Bonus: try it out!\n", - "You've just implemented a recurrent language model that can be tasked with generating any kind of sequence, so there's plenty of data you can try it on:\n", - "\n", - "* Novels/poems/songs of your favorite author\n", - "* News titles/clickbait titles\n", - "* Source code of Linux or Tensorflow\n", - "* Molecules in [smiles](https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system) format\n", - "* Melody in notes/chords format\n", - "* Ikea catalog titles\n", - "* Pokemon names\n", - "* Cards from Magic, the Gathering / Hearthstone\n", - "\n", - "If you're willing to give it a try, here's what you wanna look at:\n", - "* Current data format is a sequence of lines, so a novel can be formatted as a list of sentences. 
Alternatively, you can change data preprocessing altogether.\n", - "* While some datasets are readily available, others can only be scraped from the web. Try `Selenium` or `Scrapy` for that.\n", - "* Make sure MAX_LENGTH is adjusted for longer datasets. There's also a bonus section about dynamic RNNs at the bottom.\n", - "* More complex tasks require larger RNN architecture, try more neurons or several layers. It would also require more training iterations.\n", - "* Long-term dependencies in music, novels or molecules are better handled with LSTM or GRU\n", - "\n", - "__Good hunting!__" - ] - } - ], - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/week07_[recap]_rnn/seminar_lasagne_ingraph.ipynb b/week07_[recap]_rnn/seminar_lasagne_ingraph.ipynb deleted file mode 100644 index 5a7b55f79..000000000 --- a/week07_[recap]_rnn/seminar_lasagne_ingraph.ipynb +++ /dev/null @@ -1,439 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import theano\n", - "import theano.tensor as T\n", - "import lasagne\n", - "import os" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Generate names\n", - "* Struggle to find a name for the variable? Let's see how you'll come up with a name for your son/daughter. Surely no human has expertize over what is a good child name, so let us train NN instead.\n", - "* Dataset contains ~8k human names from different cultures[in latin transcript]\n", - "* Objective (toy problem): learn a generative model over names." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "start_token = \" \"\n", - "\n", - "with open(\"names\") as f:\n", - " names = f.read()[:-1].split('\\n')\n", - " names = [start_token+name for name in names]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print('n samples = ', len(names))\n", - "for x in names[::1000]:\n", - " print(x)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Text processing\n", - "First we need next to collect a \"vocabulary\" of all unique tokens i.e. unique characters. We can then encode inputs as a sequence of character ids." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# all unique characters go here\n", - "token_set = \n", - "\n", - "tokens = list(token_set)\n", - "print('n_tokens = ', len(tokens))\n", - "assert 54 < len(tokens) < 56" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Theano is built for numbers, not strings of characters.\n", - "We'll feed our recurrent neural network with ids of characters from our dictionary.\n", - "\n", - "To create such dictionary, let's assign each character with it's index in tokens list." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "token_to_id = its identifier (index in tokens list)>\n", - "\n", - "id_to_token = symbol itself>" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "plt.hist(list(map(len, names)), bins=25)\n", - "\n", - "# truncate names longer than MAX_LEN characters.\n", - "MAX_LEN = min([60, max(list(map(len, names)))])\n", - "# ADJUST IF YOU ARE UP TO SOMETHING SERIOUS" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Cast everything from symbols into identifiers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "names_ix = list(map(lambda name: list(map(token_to_id.get, name)), names))\n", - "\n", - "\n", - "# crop long names and pad short ones\n", - "for i in range(len(names_ix)):\n", - " names_ix[i] = names_ix[i][:MAX_LEN] # crop too long\n", - "\n", - " if len(names_ix[i]) < MAX_LEN:\n", - " names_ix[i] += [token_to_id[\" \"]] * \\\n", - " (MAX_LEN - len(names_ix[i])) # pad too short\n", - "\n", - "assert len(set(map(len, names_ix))) == 1\n", - "\n", - "names_ix = np.array(names_ix)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Input variables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from agentnet import Recurrence\n", - "from lasagne.layers import *\n", - "from agentnet.memory import *\n", - "from agentnet.resolver import ProbabilisticResolver" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sequence = T.matrix('token sequence', 'int64')\n", - "\n", - "inputs = sequence[:, :-1]\n", - "targets = sequence[:, 1:]\n", - "\n", - "\n", - "l_input_sequence = 
InputLayer(shape=(None, None), input_var=inputs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Build NN\n", - "\n", - "You'll be building a model that takes token sequence and predicts next tokens at each tick\n", - "\n", - "This is basically equivalent to how rnn step was described in the lecture" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# One step of rnn\n", - "class step:\n", - "\n", - " # inputs\n", - " inp = InputLayer((None,), name='current character')\n", - " h_prev = InputLayer((None, 10), name='previous rnn state')\n", - "\n", - " # recurrent part\n", - " emb = EmbeddingLayer(inp, len(tokens), 30, name='emb')\n", - "\n", - " h_new = \n", - "\n", - " next_token_probas = \n", - "\n", - " # pick next token from predicted probas\n", - " next_token = ProbabilisticResolver(next_token_probas)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "training_loop = Recurrence(\n", - " state_variables={step.h_new: step.h_prev},\n", - " input_sequences={step.inp: l_input_sequence},\n", - " tracked_outputs=[step.next_token_probas, ],\n", - " unroll_scan=False,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Model weights\n", - "weights = lasagne.layers.get_all_params(training_loop, trainable=True)\n", - "print(weights)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "predicted_probabilities = lasagne.layers.get_output(\n", - " training_loop[step.next_token_probas])\n", - "# If you use dropout do not forget to create deterministic version for evaluation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "loss = # \n", - "\n", - "updates = lasagne.updates.adam(loss, weights)" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "# Compiling it" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# training\n", - "train_step = theano.function([sequence], loss,\n", - " updates=training_loop.get_automatic_updates()+updates)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# generation\n", - "\n", - "here we re-wire the recurrent network so that it's output is fed back to it's input. \n", - "\n", - "We also make sure to feed id of `\" \"` as initial token." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "n_steps = T.scalar(dtype='int32')\n", - "x0 = InputLayer([None], theano.shared(np.int32([token_to_id[' ']])))\n", - "\n", - "feedback_loop = Recurrence(\n", - " state_variables={step.h_new: step.h_prev,\n", - " step.next_token: step.inp},\n", - " tracked_outputs=[step.next_token_probas, ],\n", - " state_init={step.next_token: x0},\n", - " batch_size=theano.shared(1),\n", - " n_steps=n_steps,\n", - " unroll_scan=False,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generated_tokens = get_output(feedback_loop[step.next_token])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generate_sample = theano.function(\n", - " [n_steps], generated_tokens, updates=feedback_loop.get_automatic_updates())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def generate_string(length=MAX_LEN):\n", - " output_indices = generate_sample(length)[0]\n", - "\n", - " return ''.join(tokens[i] for i in output_indices)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "generate_string()" - ] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "# Model training\n", - "\n", - "Here you can tweak parameters or insert your generation function\n", - "\n", - "\n", - "__Once something word-like starts generating, try increasing seq_length__\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def sample_batch(data, batch_size):\n", - "\n", - " rows = data[np.random.randint(0, len(data), size=batch_size)]\n", - "\n", - " return rows" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "print(\"Training ...\")\n", - "\n", - "\n", - "# total N iterations\n", - "n_epochs = 100\n", - "\n", - "# how many minibatches are there in the epoch\n", - "batches_per_epoch = 500\n", - "\n", - "# how many training sequences are processed in a single function call\n", - "batch_size = 10\n", - "\n", - "\n", - "for epoch in xrange(n_epochs):\n", - "\n", - " avg_cost = 0\n", - " for _ in range(batches_per_epoch):\n", - "\n", - " avg_cost += train_step(sample_batch(names_ix, batch_size))\n", - "\n", - " print(\"\\n\\nEpoch {} average loss = {}\".format(\n", - " epoch, avg_cost / batches_per_epoch))\n", - "\n", - " print(\"Generated names\")\n", - " for i in range(10):\n", - " print(generate_string(),)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# And now,\n", - "* try lstm/gru\n", - "* try several layers\n", - "* try mtg cards\n", - "* try your own dataset of any kind" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/week07_seq2seq/README.md b/week07_seq2seq/README.md index b3eb52b5a..05d9f8a12 100644 
--- a/week07_seq2seq/README.md +++ b/week07_seq2seq/README.md @@ -13,19 +13,18 @@ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/master/week07_seq2seq/practice_torch.ipynb) -As usual, go to practice_{your framework}.ipynb above and follow instructions from there. [pytorch](./practice_torch.ipynb), [tensorflow](./practice_tf.ipynb), [theano](./practice_theano.ipynb) +As usual, go to practice_{your framework}.ipynb above and follow instructions from there. [PyTorch](./practice_torch.ipynb), [Tensorflow](./practice_tf.ipynb) ## More materials * An [awesome post](http://distill.pub/2016/augmented-rnns/) explaining attention and long-term memory models. * [BLEU](http://www.aclweb.org/anthology/P02-1040.pdf) and [CIDEr](https://arxiv.org/pdf/1411.5726.pdf) articles. * Image captioning - * MSCOCO captioning [challenge](http://mscoco.org/dataset/#captions-challenge2015) + * MSCOCO captioning [challenge](https://cocodataset.org/#captions-2015) * Captioning tutorial [notebook](https://github.com/yandexdataschool/Practical_DL/tree/980121c7b3147ed28a7c1360df5038d3432b8cc3/week07_seq2seq) -* Other articles on reinforcement learning for natural language: +* Other articles on reinforcement learning for natural language: * [task-oriented conversation system](https://arxiv.org/abs/1703.07055) * [generating dialogues](https://arxiv.org/abs/1606.01541) * [sequential adversarial networks](https://arxiv.org/abs/1609.05473) (a.k.a. 
SeqGAN) * A large overview for machine translation (touching on RL, including RL failures) - [arxiv](https://arxiv.org/abs/1609.08144) * How _not_ to evaluate conversation models - [arxiv](https://arxiv.org/abs/1603.08023) * Overview of other non-games applications ("that article again") - [arxiv](https://arxiv.org/abs/1701.07274) - diff --git a/week07_seq2seq/basic_model_theano.py b/week07_seq2seq/basic_model_theano.py deleted file mode 100644 index 951b2dff9..000000000 --- a/week07_seq2seq/basic_model_theano.py +++ /dev/null @@ -1,214 +0,0 @@ -# code by https://github.com/deniskamazur - -from lasagne.layers import * -import theano.tensor as T -import theano - -from agentnet.memory import LSTMCell, GRUCell, AttentionLayer -from agentnet import Recurrence -from agentnet.learning.generic import get_mask_by_eos -from agentnet.resolver import ProbabilisticResolver -from agentnet.utils import reapply - - -class BasicTranslationModel: - def __init__(self, inp_voc, out_voc, emb_size, hid_size, **kwargs): - """ - A simple interface for mt - :param emb_size: Embedding size - :param hid_size: Number of LSTM units - :param bidereactional: If the nLSTM layers should be bidirectional - :param input_dropout: Dropout after embedding layer - :param recurrent_dropout: Dropout after each LSTM iteration - :param rdo_size: If int - use dense layer after neck in decoder, if none don't - :param peepholes: http://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-var-peepholes.png - :param kwargs: recurrence flags - """ - self.inp_voc = inp_voc - self.out_voc = out_voc - # encode input sequence - - class encoder: - # intput layers - inp = InputLayer((None, None)) - mask = ExpressionLayer( - inp, - lambda x: get_mask_by_eos(T.eq(x, self.out_voc.eos_ix)), - ) - - # embed the tokens - emb = EmbeddingLayer( - inp, - input_size=len(inp_voc), - output_size=emb_size, - ) - - rnn_fw = GRULayer( - emb, - num_units=hid_size, - mask_input=mask, - only_return_final=True, - ) - - dec_start 
= DenseLayer(rnn_fw, hid_size, nonlinearity=None) - - # make encoder a public field - self.encoder = encoder - - # decoder the encoded sequence - class decoder: - # decoder previous memory and tokens - prev_hid = InputLayer((None, hid_size), name='prev hidden state') - inp = InputLayer((None,), name="prev phoneme") - - emb = EmbeddingLayer(inp, len(out_voc), emb_size) - - new_hid = GRUCell(prev_hid, emb) - - logits = DenseLayer(new_hid, len(out_voc), nonlinearity=None) - - probs = NonlinearityLayer(logits, nonlinearity=T.nnet.softmax) - logprobs = NonlinearityLayer( - logits, - nonlinearity=T.nnet.logsoftmax, - ) - out = ProbabilisticResolver(probs, assume_normalized=True) - - state_dict = { - new_hid: prev_hid, - # ^^^ this reads "at next step, new_hid will become prev_hid" - # if you add any more recurrent memory units, - # please make sure they're here - } - - init_dict = { - new_hid: encoder.dec_start - # ^^^ this reads "before first step, new_hid is set to outputs of dec_start" - # if you add any more recurrent memory units with non-zero init - # please make sure they're here - } - - nonseq_dict = { - # here you can add anything encoder needs that's gonna be same - # across time-steps - } - - self.decoder = decoder - - top_layers = [encoder.dec_start, decoder.out] + \ - list(decoder.state_dict.keys()) - self.weights = get_all_params(top_layers, trainable=True) - - def symbolic_score(self, inp, out, eps=1e-30, **flags): - """ - Takes symbolic int32 matrices of hebrew words and their english translations. - Computes the log-probabilities of all possible english characters given english prefices and hebrew word. 
- :param inp: input sequence, int32 matrix of shape [batch,time] - :param out: output sequence, int32 matrix of shape [batch,time] - :return: log-probabilities of all possible english characters of shape [bath,time,n_tokens] - - NOTE: log-probabilities time axis is synchronized with out - In other words, logp are probabilities of __current__ output at each tick, not the next one - therefore you can get likelihood as logprobas * tf.one_hot(out,n_tokens) - """ - - l_output_sequence = InputLayer([None, None]) - - # Defining custom recurrent layer out of decoder - rec = Recurrence( - state_variables=self.decoder.state_dict, - state_init=self.decoder.init_dict, - input_sequences={self.decoder.inp: l_output_sequence}, - input_nonsequences=self.decoder.nonseq_dict, - tracked_outputs=self.decoder.logprobs, - unroll_scan=False - ) - - feed_dict = { - self.encoder.inp: inp, - l_output_sequence: out - } - logprobs = get_output(rec[self.decoder.logprobs], feed_dict, - recurrence_flags=flags, **flags) - - self.auto_updates = rec.get_automatic_updates() - if len(self.auto_updates) != 0: - print( - "symbolic_score: Please collect auto_updates of random states " - "after you called symbolic_score (available at model.auto_updates)!") - - first_logprobs = T.zeros_like(logprobs[:, :1]) - logprobs = T.concatenate([first_logprobs, logprobs[:, :-1]], axis=1) - - return logprobs - - def symbolic_translate(self, inp, greedy=False, max_len=None, - unroll_scan=False, eps=1e-30, **flags): - """ - takes symbolic int32 matrix of hebrew words, produces output tokens sampled - from the model and output log-probabilities for all possible tokens at each tick. - :param inp: input sequence, int32 matrix of shape [batch,time] - :param greedy: if greedy, takes token with highest probablity at each tick. - Otherwise samples proportionally to probability. - :param max_len: max length of output, defaults to 2 * input length - :param unroll_scan: if True, compiles longer but runs faster. 
- requires max_len to be constant - :return: output tokens int32[batch,time] and - log-probabilities of all tokens at each tick, [batch,time,n_tokens] - """ - if unroll_scan: - assert isinstance( - max_len, int), "if scan is unrolled, max_len must be a constant integer" - - max_len = max_len if max_len is not None else 2 * inp.shape[1] - - # initial output tokens (BOS) - bos = T.zeros_like(inp[:, 0]) + self.out_voc.bos_ix - l_start = InputLayer((None,), bos) - - # Defining custom recurrent layer out of decoder - rec = Recurrence( - state_variables=merge_dicts(self.decoder.state_dict, - {self.decoder.out: self.decoder.inp}), - state_init=merge_dicts(self.decoder.init_dict, {self.decoder.out: l_start}), - input_nonsequences=self.decoder.nonseq_dict, - tracked_outputs=(self.decoder.out, self.decoder.probs, self.decoder.logprobs), - n_steps=max_len, - unroll_scan=unroll_scan - ) - - translations, logprobs = get_output(rec[self.decoder.out, self.decoder.logprobs], - {self.encoder.inp: inp, - l_start: bos}, - recurrence_flags=dict(flags, greedy=greedy), - **flags) - - self.auto_updates = rec.get_automatic_updates() - if len(self.auto_updates) != 0: - print( - "symbolic_translate: Please collect auto_updates of random states " - "after you called symbolic_translate (available at model.auto_updates)!") - - # add first step (bos) - translations = T.concatenate([bos[:, None], translations], axis=1) - first_logprobs = T.zeros_like(logprobs[:, :1]) - logprobs = T.concatenate([first_logprobs, logprobs], axis=1) - - return translations, logprobs - - -def merge_dicts(*dicts, **kwargs): - """ - Melts several dicts into one. 
Useful when messing with feed dicts - :param dicts: dictionaries - :param check_conflicts: if True, raises error if several dicts have the same key - Otherwise uses the key from the latest dict in *dicts - :return: a dict that contains k-v pairs from all *dicts - """ - merged_dict = {} - for d in dicts: - merged_dict.update(d) - if kwargs.get('check_conflicts'): - assert len(merged_dict) == sum( - map(len, dicts)), 'dicts have duplicate keys' - return merged_dict diff --git a/week07_seq2seq/basic_model_torch.py b/week07_seq2seq/basic_model_torch.py index 586482ffb..25700da42 100644 --- a/week07_seq2seq/basic_model_torch.py +++ b/week07_seq2seq/basic_model_torch.py @@ -2,8 +2,8 @@ import torch.nn as nn import torch.nn.functional as F -# Note: unlike official pytorch tutorial, this model doesn't process one sample at a time -# because it's slow on GPU. instead it uses masks just like ye olde theano/tensorflow. +# Note: unlike official PyTorch tutorial, this model doesn't process one sample at a time +# because it's slow on GPU. Instead it uses masks just like ye olde Tensorflow. # it doesn't use torch.nn.utils.rnn.pack_paded_sequence because reasons. 
@@ -24,7 +24,7 @@ def __init__(self, inp_voc, out_voc, def encode(self, inp, **flags): """ Takes symbolic input sequence, computes initial state - :param inp: a vector of input tokens (Variable, int64, 1d) + :param inp: input tokens, int64 vector of shape [batch] :return: a list of initial decoder state tensors """ inp_emb = self.emb_inp(inp) diff --git a/week07_seq2seq/bonus.ipynb b/week07_seq2seq/bonus.ipynb deleted file mode 100644 index 1acb2ead1..000000000 --- a/week07_seq2seq/bonus.ipynb +++ /dev/null @@ -1,247 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Week8 bonus descriptions\n", - "\n", - "Here are some cool mini-projects you can try to dive deeper into the topic.\n", - "\n", - "## More metrics: BLEU (5+ pts)\n", - "\n", - "Pick BLEU or any other relevant metric, e.g. BLEU (e.g. from `nltk.bleu_score`).\n", - "* Train model to maximize BLEU directly\n", - "* How does levenshtein behave when maximizing BLEU and vice versa?\n", - "* Compare this with how they behave when optimizing likelihood. \n", - "\n", - "(use default parameters for bleu: 4-gram, uniform weights)\n", - "\n", - "## Actor-critic (5+++ pts)\n", - "\n", - "While self-critical training provides a large reduction of gradient variance, it has a few drawbacks:\n", - "- It requires a lot of additional computation during training\n", - "- It doesn't adjust V(s) between decoder steps. 
(one value per sequence)\n", - "\n", - "There's a more general way of doing the same thing: learned baselines, also known as __advantage actor-critic__.\n", - "\n", - "There are two main ways to apply that:\n", - "- __naive way__: compute V(s) once per training example.\n", - " - This only requires additional 1-unit linear dense layer that grows out of encoder, estimating V(s)\n", - " - (implement this to get main points)\n", - "- __every step__: compute V(s) on each decoder step\n", - " - Again it's just an 1-unit dense layer (no nonlinearity), but this time it's inside decoder recurrence.\n", - " - (+3 pts additional for this guy)\n", - "\n", - "In both cases, you should train V(s) to minimize squared error $(V(s) - R(s,a))^2$ with R being actual levenshtein.\n", - "You can then use $ A(s,a) = (R(s,a) - const(V(s))) $ for policy gradient.\n", - "\n", - "There's also one particularly interesting approach (+5 additional pts):\n", - "- __combining SCST and actor-critic__:\n", - " - compute baseline $V(s)$ via self-critical sequence training (just like in main assignment)\n", - " - learn correction $ C(s,a_{:t}) = R(s,a) - V(s) $ by minimizing $(R(s,a) - V(s) - C(s,a_{:t}))^2 $\n", - " - use $ A(s,a_{:t}) = R(s,a) - V(s) - const(C(s,a_{:t})) $\n", - "\n", - "\n", - "\n", - "## Implement attention (5+++ pts)\n", - "\n", - "Some seq2seq tasks can benefit from the attention mechanism. In addition to taking the _last_ time-step of encoder hidden state, we can allow decoder to peek on any time-step of his choice.\n", - "\n", - "![img](https://xiandong79.github.io/downloads/nmt-model-fast.gif)\n", - "\n", - "\n", - "#### Recommended steps:\n", - "__1)__ Modify encoder-decoder\n", - "\n", - "Learn to feed the entire encoder into the decoder. 
You can do so by sending encoder rnn layer directly into decoder (make sure there's no `only_return_final=True` for encoder rnn layer).\n", - "\n", - "```\n", - "class decoder:\n", - " ...\n", - " encoder_rnn_input = InputLayer(encoder.rnn.output_shape, name='encoder rnn input for decoder')\n", - " ...\n", - " \n", - "#decoder Recurrence\n", - "rec = Recurrence(...,\n", - " input_nonsequences = {decoder.encoder_rnn_input: encoder.rnn},\n", - " )\n", - "\n", - "```\n", - "\n", - "For starters, you can take it's last tick (via SliceLayer) inside the decoder step and feed it as input to make sure it works.\n", - "\n", - "__2)__ Implement attention mechanism\n", - "\n", - "Next thing we'll need is to implement the math of attention.\n", - "\n", - "The simplest way to do so is to write a special layer. We gave you a prototype and some tests below.\n", - "\n", - "__3)__ Use attention inside decoder\n", - "\n", - "That's almost it! Now use `AttentionLayer` inside the decoder and feed it to back to lstm/gru/rnn (see code demo below).\n", - "\n", - "Train the full network just like you did before attention.\n", - "\n", - "__More points__ will be awwarded for comparing learning results of attention Vs no attention.\n", - "\n", - "__Bonus bonus:__ visualize attention vectors (>= +3 points)\n", - "\n", - "The best way to make sure your attention actually works is to visualize it.\n", - "\n", - "A simple way to do so is to obtain attention vectors from each tick (values __right after softmax__, not the layer outputs) and drawing those as images.\n", - "\n", - "#### step-by-step guide:\n", - "- split AttentionLayer into two layers: _\"from start to softmax\"_ and _\"from softmax to output\"_\n", - "- add outputs of the first layer to recurrence's `tracked_outputs`\n", - "- compile a function that computes them\n", - "- plt.imshow(them)\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import 
numpy as np\n", - "import theano\n", - "import lasagne\n", - "import theano.tensor as T\n", - "from lasagne import init\n", - "from lasagne.layers import *" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class AttentionLayer(MergeLayer):\n", - " def __init__(self, decoder_h, encoder_rnn):\n", - " # sanity checks\n", - " assert len(\n", - " decoder_h.output_shape) == 2, \"please feed decoder 1 step activation as first param \"\n", - " assert len(\n", - " encoder_rnn.output_shape) == 3, \"please feed full encoder rnn sequence as second param\"\n", - "\n", - " self.decoder_num_units = decoder_h.output_shape[-1]\n", - " self.encoder_num_units = encoder.output_shape[-1]\n", - "\n", - " # Here you should initialize all trainable parameters.\n", - " #\n", - "\n", - " # use this syntax:\n", - " self.add_param(spec=init.Normal(std=0.01), # or other initializer\n", - " shape= ,\n", - " name='')\n", - "\n", - " MergeLayer.__init__(self, [decoder_h, encoder_rnn], name=\"attention\")\n", - "\n", - " def get_output_shape_for(self, input_shapes, **kwargs):\n", - " \"\"\"return matrix of shape [batch_size, encoder num units]\"\"\"\n", - " return (None, self.encoder_num_units)\n", - "\n", - " def get_output_for(self, inputs, **kwargs):\n", - " \"\"\"\n", - " takes (decoder_h, encoder_seq)\n", - " decoder_h has shape [batch_size, decoder num_units]\n", - " encoder_seq has shape [batch_size, sequence_length, encoder num_units]\n", - "\n", - " returns attention output: matrix of shape [batch_size, encoder num units]\n", - "\n", - " please read comments carefully before you start implementing\n", - " \"\"\"\n", - " decoder_h, encoder_seq = inputs\n", - "\n", - " # get symbolic batch-size / seq length. 
Also don't forget self.decoder_num_units above\n", - " batch_size, seq_length, _ = tuple(encoder_seq.shape)\n", - "\n", - " # here's a recommended step-by-step guide for attention mechanism.\n", - " # You are free to ignore it alltogether if you so wish\n", - "\n", - " # we repeat decoder activations to allign with encoder\n", - " decoder_h_repeated = \n", - " \n", - " # ^--shape=[batch,seq_length,decoder_n_units]\n", - " \n", - " encoder_and_decoder_together = \n", - " # ^--shape=[batch,seq_length,enc_n_units+dec_n_units]\n", - " \n", - " # here we flatten the tensor to simplify\n", - " encoder_and_decoder_flat = T.reshape(encoder_and_decoder_together,(-1,encoder_and_decoder_together.shape[-1]))\n", - " # ^--shape=[batch*seq_length,enc_n_units+dec_n_units]\n", - " \n", - " # here you use encoder_and_decoder_flat and some learned weights to predict attention logits\n", - " # don't use softmax yet\n", - " \n", - " attention_logits_flat = \n", - " # ^--shape=[batch*seq_length,1]\n", - " \n", - " \n", - " # here we reshape flat logits back into correct form\n", - " assert attention_logits_flat.ndim==2\n", - " attention_logits = attention_logits_flat.reshape((batch_size,seq_length))\n", - " # ^--shape=[batch,seq_length]\n", - " \n", - " # here we apply softmax :)\n", - " attention = T.nnet.softmax(attention_logits)\n", - " # ^--shape=[batch,seq_length]\n", - " \n", - " # here we compute output\n", - " output = (attention[:,:,None]*encoder_seq).sum(axis=1) #sum over seq_length\n", - " # ^--shape=[batch,enc_n_units]\n", - " \n", - " return output" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# demo code\n", - "\n", - "from numpy.random import randn\n", - "\n", - "dec_h_prev = InputLayer((None, 50), T.constant(\n", - " randn(5, 50)), name='decoder h mock')\n", - "\n", - "enc = InputLayer((None, None, 32), T.constant(\n", - " randn(5, 20, 32)), name='encoder sequence mock')\n", - "\n", - "attention = 
AttentionLayer(dec_h_prev, enc)\n", - "\n", - "# now you can use attention as additonal input to your decoder\n", - "# LSTMCell(prev_cell,prev_out,input_or_inputs=(usual_input,attention))\n", - "\n", - "\n", - "# sanity check\n", - "demo_output = get_output(attention).eval()\n", - "print 'actual shape:', demo_output.shape\n", - "assert demo_output.shape == (5, 32)\n", - "assert np.isfinite(demo_output)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/week07_seq2seq/practice_tf.ipynb b/week07_seq2seq/practice_tf.ipynb index c561d7b11..8d5a1520b 100644 --- a/week07_seq2seq/practice_tf.ipynb +++ b/week07_seq2seq/practice_tf.ipynb @@ -7,19 +7,20 @@ "outputs": [], "source": [ "import sys, os\n", - "if 'google.colab' in sys.modules:\n", + "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", " # https://github.com/yandexdataschool/Practical_RL/issues/256\n", - " !pip install tensorflow-gpu==1.13.1\n", - " \n", - " if not os.path.exists('.setup_complete'):\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n", + " # https://stackoverflow.com/a/62482183\n", + " !pip uninstall -y tensorflow\n", + " !pip install tensorflow-gpu==1.13.1 keras==2.3.1\n", "\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/week07_seq2seq/basic_model_tf.py\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/week07_seq2seq/he-pron-wiktionary.txt\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/week07_seq2seq/main_dataset.txt\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/week07_seq2seq/voc.py\n", + " !wget -q 
https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", "\n", - " !touch .setup_complete\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week07_seq2seq/basic_model_tf.py\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week07_seq2seq/he-pron-wiktionary.txt\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week07_seq2seq/main_dataset.txt\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week07_seq2seq/voc.py\n", + "\n", + " !touch .setup_complete\n", "\n", "# This code creates a virtual display to draw game images on.\n", "# It will have no effect if your machine has a monitor.\n", @@ -34,7 +35,7 @@ "source": [ "## Reinforcement Learning for seq2seq\n", "\n", - "This time we'll solve a problem of transribing hebrew words in english, also known as g2p (grapheme2phoneme)\n", + "This time we'll solve a problem of transcribing Hebrew words in English, also known as g2p (grapheme2phoneme)\n", "\n", " * word (sequence of letters in source language) -> translation (sequence of letters in target language)\n", "\n", @@ -43,19 +44,19 @@ "\n", "### About the task\n", "\n", - "One notable property of Hebrew is that it's consonant language. That is, there are no wovels in the written language. One could represent wovels with diacritics above consonants, but you don't expect people to do that in everyay life.\n", + "One notable property of Hebrew is that it's a consonant language. That is, there are no vowels in the written language. 
One could represent vowels with diacritics above consonants, but you don't expect people to do that in everyday life.\n", "\n", - "Therefore, some hebrew characters will correspond to several english letters and others - to none, so we should use encoder-decoder architecture to figure that out.\n", + "Therefore, some Hebrew characters will correspond to several English letters and others - to none, so we should use encoder-decoder architecture to figure that out.\n", "\n", "![img](https://esciencegroup.files.wordpress.com/2016/03/seq2seq.jpg)\n", "_(img: esciencegroup.files.wordpress.com)_\n", "\n", "Encoder-decoder architectures are about converting anything to anything, including\n", " * Machine translation and spoken dialogue systems\n", - " * [Image captioning](http://mscoco.org/dataset/#captions-challenge2015) and [image2latex](https://openai.com/requests-for-research/#im2latex) (convolutional encoder, recurrent decoder)\n", + " * [Image captioning](https://cocodataset.org/#captions-2015) and [image2latex](https://openai.com/requests-for-research/#im2latex) (convolutional encoder, recurrent decoder)\n", " * Generating [images by captions](https://arxiv.org/abs/1511.02793) (recurrent encoder, convolutional decoder)\n", " * Grapheme2phoneme - convert words to transcripts\n", - " \n", + "\n", "We chose simplified __Hebrew->English__ machine translation for words and short phrases (character-level), as it is relatively quick to train even without a gpu cluster." ] }, @@ -87,10 +88,7 @@ "\n", "This is mostly due to the fact that many words have several correct translations.\n", "\n", - "We have implemented this thing for you so that you can focus on more interesting parts.\n", - "\n", - "\n", - "__Attention python2 users!__ You may want to cast everything to unicode later during homework phase, just make sure you do it _everywhere_." + "We have implemented this thing for you so that you can focus on more interesting parts." 
] }, { @@ -163,7 +161,7 @@ "source": [ "### Building vocabularies\n", "\n", - "We now need to build vocabularies that map strings to token ids and vice versa. We're gonna need these fellas when we feed training data into model or convert output matrices into english words." + "We now need to build vocabularies that map strings to token ids and vice versa. We're gonna need these fellas when we feed training data into the model or convert output matrices into English words." ] }, { @@ -233,13 +231,13 @@ "* Encoder reads words character by character and outputs code vector (usually a function of last RNN state)\n", "* Decoder takes that code vector and produces translations character by character\n", "\n", - "Than it gets fed into a model that follows this simple interface:\n", - "* __`model.symbolic_translate(inp, **flags) -> out, logp`__ - takes symbolic int32 matrix of hebrew words, produces output tokens sampled from the model and output log-probabilities for all possible tokens at each tick.\n", - " * if given flag __`greedy=True`__, takes most likely next token at each iteration. Otherwise samples with next token probabilities predicted by model.\n", - "* __`model.symbolic_score(inp, out, **flags) -> logp`__ - takes symbolic int32 matrices of hebrew words and their english translations. Computes the log-probabilities of all possible english characters given english prefices and hebrew word.\n", - "* __`model.weights`__ - weights from all model layers [a list of variables]\n", + "Then it gets fed into a model that follows this simple interface:\n", + "* __`model.symbolic_translate(inp, **flags) -> out, logp`__ - takes symbolic int32 matrix of Hebrew words, produces output tokens sampled from the model and output log-probabilities for all possible tokens at each tick.\n", + " * if given flag __`greedy=True`__, takes most likely next token at each iteration. 
Otherwise, samples with next token probabilities predicted by model.\n", + "* __`model.symbolic_score(inp, out, **flags) -> logp`__ - takes symbolic int32 matrices of Hebrew words and their English translations. Computes the log-probabilities of all possible English characters given English prefixes and Hebrew word.\n", + "* __`model.weights`__ - weights from all model layers, [a list of variables]\n", "\n", - "That's all! It's as hard as it gets. With those two methods alone you can implement all kinds of prediction and training." + "That's all! It's as hard as it gets. With those two methods alone, you can implement all kinds of prediction and training." ] }, { @@ -311,7 +309,7 @@ "\n", "def translate(lines):\n", " \"\"\"\n", - " You are given a list of input lines. \n", + " You are given a list of input lines.\n", " Make your neural network translate them.\n", " :return: a list of output lines\n", " \"\"\"\n", @@ -350,7 +348,7 @@ "### Scoring function\n", "\n", "LogLikelihood is a poor estimator of model performance.\n", - "* If we predict zero probability once, it shouldn't ruin entire model.\n", + "* If we predict zero probability once, it shouldn't ruin the entire model.\n", "* It is enough to learn just one translation if there are several correct ones.\n", "* What matters is how many mistakes model's gonna make when it translates!\n", "\n", @@ -594,7 +592,7 @@ "\n", " Params:\n", " - words_ix - a matrix of letter indices, shape=[batch_size,word_length]\n", - " - words_mask - a matrix of zeros/ones, \n", + " - words_mask - a matrix of zeros/ones,\n", " 1 means \"word is still not finished\"\n", " 0 means \"word has already finished and this is padding\"\n", "\n", @@ -704,7 +702,7 @@ "\n", "* You may now want to __remove/comment asserts__ from function code for a slight speed-up.\n", "\n", - "* There's a more detailed tutorial on custom tensorflow ops: [`py_func`](https://www.tensorflow.org/api_docs/python/tf/py_func), 
[`low-level`](https://www.tensorflow.org/api_docs/python/tf/py_func)." + "* There's a more detailed tutorial on custom tensorflow ops: [`py_func`](https://www.tensorflow.org/api_docs/python/tf/compat/v1/py_func), [`low-level`](https://www.tensorflow.org/lite/guide/ops_custom)." ] }, { @@ -715,7 +713,7 @@ "\n", "In this section you'll implement algorithm called self-critical sequence training (here's an [article](https://arxiv.org/abs/1612.00563)).\n", "\n", - "The algorithm is a vanilla policy gradient with a special baseline. \n", + "The algorithm is a vanilla policy gradient with a special baseline.\n", "\n", "$$ \\nabla J = E_{x \\sim p(s)} E_{y \\sim \\pi(y|x)} \\nabla log \\pi(y|x) \\cdot (R(x,y) - b(x)) $$\n", "\n", @@ -865,7 +863,7 @@ "## Step 6: Make it actually work (5++ pts)\n", "\n", "\n", - "In this section we want you to finally __restart with EASY_MODE=False__ and experiment to find a good model/curriculum for that task.\n", + "In this section, we want you to finally __restart with EASY_MODE=False__ and experiment to find a good model/curriculum for that task.\n", "\n", "We recommend you to start with the following architecture\n", "\n", @@ -881,24 +879,24 @@ "input y_prev\n", "```\n", "\n", - "__Note:__ you can fit all 4 state tensors of both LSTMs into a in a single state - just assume that it contains, for example, [h0, c0, h1, c1] - pack it in encode and update in decode.\n", + "__Note:__ you can fit all 4 state tensors of both LSTMs into a single state - just assume that it contains, for example, [h0, c0, h1, c1] - pack it in encode and update in decode.\n", "\n", "\n", "Here are some cool ideas on what you can do then.\n", "\n", "__General tips & tricks:__\n", - "* In some tensorflow versions and for some layers, it is required that each rnn/gru/lstm cell gets it's own `tf.variable_scope(unique_name, reuse=False)`.\n", - " * Otherwise it will complain about wrong tensor sizes because it tries to reuse weights from one rnn to the other.\n", +
"* In some tensorflow versions and for some layers, it is required that each rnn/gru/lstm cell gets its own `tf.variable_scope(unique_name, reuse=False)`.\n", + " * Otherwise, it will complain about wrong tensor sizes because it tries to reuse weights from one rnn to the other.\n", "* You will likely need to adjust pre-training time for such a network.\n", "* Supervised pre-training may benefit from clipping gradients somehow.\n", "* SCST may indulge a higher learning rate in some cases and changing entropy regularizer over time.\n", - "* It's often useful to save pre-trained model parameters to not re-train it every time you want new policy gradient parameters. \n", + "* It's often useful to save pre-trained model parameters to not re-train it every time you want new policy gradient parameters.\n", "* When leaving training for nighttime, try setting REPORT_FREQ to a larger value (e.g. 500) not to waste time on it.\n", "\n", "__Formal criteria:__\n", - "To get 5 points we want you to build an architecture that:\n", + "To get 5 points, we want you to build an architecture that:\n", "* _doesn't consist of single GRU_\n", - "* _works better_ than single GRU baseline. \n", + "* _works better_ than single GRU baseline.\n", "* We also want you to provide either learning curve or trained model, preferably both\n", "* ... and write a brief report or experiment log describing what you did and how it fared.\n", "\n", @@ -907,7 +905,7 @@ " * __Vanilla:__ layer_i of encoder last state goes to layer_i of decoder initial state\n", " * __Every tick:__ feed encoder last state _on every iteration_ of decoder.\n", " * __Attention:__ allow decoder to \"peek\" at one (or several) positions of encoded sequence on every tick.\n", - " \n", + "\n", "The most effective (and cool) of those is, of course, attention.\n", "You can read more about attention [in this nice blog post](https://distill.pub/2016/augmented-rnns/). 
The easiest way to begin is to use \"soft\" attention with \"additive\" or \"dot-product\" intermediate layers.\n", "\n", @@ -915,8 +913,8 @@ "* Model usually generalizes better if you no longer allow decoder to see final encoder state\n", "* Once your model made it through several epochs, it is a good idea to visualize attention maps to understand what your model has actually learned\n", "\n", - "* There's more stuff [here](https://github.com/yandexdataschool/Practical_RL/blob/master/week8_scst/bonus.ipynb)\n", - "* If you opted for hard attention, we recommend [gumbel-softmax](https://blog.evjang.com/2016/11/tutorial-categorical-variational.html) instead of sampling. Also please make sure soft attention works fine before you switch to hard.\n", + "* There's more stuff [here](bonus.ipynb)\n", + "* If you opted for hard attention, we recommend [gumbel-softmax](https://blog.evjang.com/2016/11/tutorial-categorical-variational.html) instead of sampling. Also, please make sure soft attention works fine before you switch to hard.\n", "\n", "### UREX\n", "* This is a way to improve exploration in policy-based settings. The main idea is that you find and upweight under-appreciated actions.\n", @@ -927,8 +925,8 @@ "\n", "### Some additional ideas:\n", "* (advanced deep learning) It may be a good idea to first train on small phrases and then adapt to larger ones (a.k.a. 
training curriculum).\n", - "* (advanced nlp) You may want to switch from raw utf8 to something like unicode or even syllables to make task easier.\n", - "* (advanced nlp) Since hebrew words are written __with vowels omitted__, you may want to use a small Hebrew vowel markup dataset at `he-pron-wiktionary.txt`.\n", + "* (advanced nlp) You may want to switch from raw utf8 to something like unicode or even syllables to make the task easier.\n", + "* (advanced nlp) Since Hebrew words are written __with vowels omitted__, you may want to use a small Hebrew vowel markup dataset at `he-pron-wiktionary.txt`.\n", "\n" ] }, @@ -936,7 +934,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Bonus hints: [here](https://github.com/yandexdataschool/Practical_RL/blob/master/week8_scst/bonus.ipynb)" + "### Bonus hints: [here](bonus.ipynb)" ] }, { diff --git a/week07_seq2seq/practice_theano.ipynb b/week07_seq2seq/practice_theano.ipynb deleted file mode 100644 index 790ebe158..000000000 --- a/week07_seq2seq/practice_theano.ipynb +++ /dev/null @@ -1,939 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Reinforcement Learning for seq2seq\n", - "\n", - "This time we'll solve a problem of transribing hebrew words in english, also known as g2p (grapheme2phoneme)\n", - "\n", - " * word (sequence of letters in source language) -> translation (sequence of letters in target language)\n", - "\n", - "Unlike what most deep learning researchers do, we won't only train it to maximize likelihood of correct translation, but also employ reinforcement learning to actually teach it to translate with as few errors as possible.\n", - "\n", - "\n", - "### About the task\n", - "\n", - "One notable property of Hebrew is that it's consonant language. That is, there are no wovels in the written language. 
One could represent wovels with diacritics above consonants, but you don't expect people to do that in everyay life.\n", - "\n", - "Therefore, some hebrew characters will correspond to several english letters and others - to none, so we should use encoder-decoder architecture to figure that out.\n", - "\n", - "![img](https://esciencegroup.files.wordpress.com/2016/03/seq2seq.jpg)\n", - "_(img: esciencegroup.files.wordpress.com)_\n", - "\n", - "Encoder-decoder architectures are about converting anything to anything, including\n", - " * Machine translation and spoken dialogue systems\n", - " * [Image captioning](http://mscoco.org/dataset/#captions-challenge2015) and [image2latex](https://openai.com/requests-for-research/#im2latex) (convolutional encoder, recurrent decoder)\n", - " * Generating [images by captions](https://arxiv.org/abs/1511.02793) (recurrent encoder, convolutional decoder)\n", - " * Grapheme2phoneme - convert words to transcripts\n", - " \n", - "We chose simplified __Hebrew->English__ machine translation for words and short phrases (character-level), as it is relatively quick to train even without a gpu cluster." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# If True, only translates phrases shorter than 20 characters (way easier).\n", - "EASY_MODE = True\n", - "# Please keep it until you're done debugging your code\n", - "# If false, works with all phrases (please switch to this mode for homework assignment)\n", - "\n", - "# way we translate. 
Either \"he-to-en\" or \"en-to-he\"\n", - "MODE = \"he-to-en\"\n", - "# maximal length of _generated_ output, does not affect training\n", - "MAX_OUTPUT_LENGTH = 50 if not EASY_MODE else 20\n", - "REPORT_FREQ = 100 # how often to evaluate validation score" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 1: preprocessing\n", - "\n", - "We shall store dataset as a dictionary\n", - "`{ word1:[translation1,translation2,...], word2:[...],...}`.\n", - "\n", - "This is mostly due to the fact that many words have several correct translations.\n", - "\n", - "We have implemented this thing for you so that you can focus on more interesting parts.\n", - "\n", - "\n", - "__Attention python2 users!__ You may want to cast everything to unicode later during homework phase, just make sure you do it _everywhere_." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "from collections import defaultdict\n", - "word_to_translation = defaultdict(list) # our dictionary\n", - "\n", - "bos = '_'\n", - "eos = ';'\n", - "\n", - "with open(\"main_dataset.txt\", encoding='utf8') as fin:\n", - " for line in fin:\n", - "\n", - " en, he = line[:-1].lower().replace(bos, ' ').replace(eos,\n", - " ' ').split('\\t')\n", - " word, trans = (he, en) if MODE == 'he-to-en' else (en, he)\n", - "\n", - " if len(word) < 3:\n", - " continue\n", - " if EASY_MODE:\n", - " if max(len(word), len(trans)) > 20:\n", - " continue\n", - "\n", - " word_to_translation[word].append(trans)\n", - "\n", - "print(\"size = \", len(word_to_translation))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get all unique lines in source language\n", - "all_words = np.array(list(word_to_translation.keys()))\n", - "# get all unique lines in translation language\n", - "all_translations = np.array(\n", - " [ts for all_ts in 
word_to_translation.values() for ts in all_ts])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### split the dataset\n", - "\n", - "We hold out 10% of all words to be used for validation.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.model_selection import train_test_split\n", - "train_words, test_words = train_test_split(\n", - " all_words, test_size=0.1, random_state=42)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Building vocabularies\n", - "\n", - "We now need to build vocabularies that map strings to token ids and vice versa. We're gonna need these fellas when we feed training data into model or convert output matrices into english words." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from voc import Vocab\n", - "inp_voc = Vocab.from_lines(''.join(all_words), bos=bos, eos=eos, sep='')\n", - "out_voc = Vocab.from_lines(''.join(all_translations), bos=bos, eos=eos, sep='')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Here's how you cast lines into ids and backwards.\n", - "batch_lines = all_words[:5]\n", - "batch_ids = inp_voc.to_matrix(batch_lines)\n", - "batch_lines_restored = inp_voc.to_lines(batch_ids)\n", - "\n", - "print(\"lines\")\n", - "print(batch_lines)\n", - "print(\"\\nwords to ids (0 = bos, 1 = eos):\")\n", - "print(batch_ids)\n", - "print(\"\\nback to words\")\n", - "print(batch_lines_restored)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Draw word/translation length distributions to estimate the scope of the task." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "plt.figure(figsize=[8, 4])\n", - "plt.subplot(1, 2, 1)\n", - "plt.title(\"words\")\n", - "plt.hist(list(map(len, all_words)), bins=20)\n", - "\n", - "plt.subplot(1, 2, 2)\n", - "plt.title('translations')\n", - "plt.hist(list(map(len, all_translations)), bins=20)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 3: deploy encoder-decoder (1 point)\n", - "\n", - "__assignment starts here__\n", - "\n", - "Our architecture consists of two main blocks:\n", - "* Encoder reads words character by character and outputs code vector (usually a function of last RNN state)\n", - "* Decoder takes that code vector and produces translations character by character\n", - "\n", - "Than it gets fed into a model that follows this simple interface:\n", - "* __`model.symbolic_translate(inp, **flags) -> out, logp`__ - takes symbolic int32 matrix of hebrew words, produces output tokens sampled from the model and output log-probabilities for all possible tokens at each tick.\n", - " * if given flag __`greedy=True`__, takes most likely next token at each iteration. Otherwise samples with next token probabilities predicted by model.\n", - "* __`model.symbolic_score(inp, out, **flags) -> logp`__ - takes symbolic int32 matrices of hebrew words and their english translations. Computes the log-probabilities of all possible english characters given english prefices and hebrew word.\n", - "\n", - "That's all! It's as hard as it gets. With those two methods alone you can implement all kinds of prediction and training." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# set flags here if necessary\n", - "import theano\n", - "theano.config.floatX = 'float32'\n", - "import theano.tensor as T\n", - "import lasagne" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from basic_model_theano import BasicTranslationModel\n", - "model = BasicTranslationModel(inp_voc, out_voc,\n", - " emb_size=64, hid_size=128)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Play around with symbolic_translate and symbolic_score\n", - "inp = T.constant(np.random.randint(0, 10, [3, 5], dtype='int32'))\n", - "out = T.constant(np.random.randint(0, 10, [3, 5], dtype='int32'))\n", - "\n", - "# translate inp (with untrained model)\n", - "sampled_out, logp = model.symbolic_translate(inp, greedy=False)\n", - "dummy_translate = theano.function([], sampled_out, updates=model.auto_updates)\n", - "\n", - "print(\"\\nSymbolic_translate output:\\n\", sampled_out, logp)\n", - "print(\"\\nSample translations:\\n\", dummy_translate())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# score logp(out | inp) with untrained input\n", - "logp = model.symbolic_score(inp, out)\n", - "dummy_score = theano.function([], logp)\n", - "\n", - "print(\"\\nSymbolic_score output:\\n\", logp)\n", - "print(\"\\nLog-probabilities (clipped):\\n\", dummy_score()[:, :2, :5])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Prepare any operations you want here\n", - "\n", - "inp = T.imatrix(\"input tokens [batch,time]\")\n", - "trans, _ = \n", - "translate_fun = theano.function([inp], trans, updates=model.auto_updates)\n", - "\n", - "\n", - "def translate(lines):\n", - " \"\"\"\n", - " You are given a list of input 
lines. \n", - " Make your neural network translate them.\n", - " :return: a list of output lines\n", - " \"\"\"\n", - " # Convert lines to a matrix of indices\n", - " lines_ix = \n", - "\n", - " # Compute translations in form of indices (call your function)\n", - " trans_ix = \n", - "\n", - " # Convert translations back into strings\n", - " return out_voc.to_lines(trans_ix)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Sample inputs:\", all_words[:3])\n", - "print(\"Dummy translations:\", translate(all_words[:3]))\n", - "\n", - "assert trans.ndim == 2 and trans.dtype.startswith(\n", - " 'int'), \"trans must be a tensor of integers (token ids)\"\n", - "assert translate(all_words[:3]) == translate(\n", - " all_words[:3]), \"make sure translation is deterministic (use greedy=True and disable any noise layers)\"\n", - "assert type(translate(all_words[:3])) is list and (type(translate(all_words[:1])[0]) is str or type(\n", - " translate(all_words[:1])[0]) is unicode), \"translate(lines) must return a sequence of strings!\"\n", - "print(\"Tests passed!\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Scoring function\n", - "\n", - "LogLikelihood is a poor estimator of model performance.\n", - "* If we predict zero probability once, it shouldn't ruin entire model.\n", - "* It is enough to learn just one translation if there are several correct ones.\n", - "* What matters is how many mistakes model's gonna make when it translates!\n", - "\n", - "Therefore, we will use minimal Levenshtein distance. It measures how many characters do we need to add/remove/replace from model translation to make it perfect. Alternatively, one could use character-level BLEU/RougeL or other similar metrics.\n", - "\n", - "The catch here is that Levenshtein distance is not differentiable: it isn't even continuous. We can't train our neural network to maximize it by gradient descent." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import editdistance # !pip install editdistance\n", - "\n", - "\n", - "def get_distance(word, trans):\n", - " \"\"\"\n", - " A function that takes word and predicted translation\n", - " and evaluates (Levenshtein's) edit distance to closest correct translation\n", - " \"\"\"\n", - " references = word_to_translation[word]\n", - " assert len(references) != 0, \"wrong/unknown word\"\n", - " return min(editdistance.eval(trans, ref) for ref in references)\n", - "\n", - "\n", - "def score(words, bsize=100):\n", - " \"\"\"a function that computes levenshtein distance for bsize random samples\"\"\"\n", - " assert isinstance(words, np.ndarray)\n", - "\n", - " batch_words = np.random.choice(words, size=bsize, replace=False)\n", - " batch_trans = translate(batch_words)\n", - "\n", - " distances = list(map(get_distance, batch_words, batch_trans))\n", - "\n", - " return np.array(distances, dtype='float32')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# should be around 5-50 and decrease rapidly after training :)\n", - "[score(test_words, 10).mean() for _ in range(5)]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 2: Supervised pre-training\n", - "\n", - "Here we define a function that trains our model through maximizing log-likelihood a.k.a. minimizing crossentropy." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from agentnet.learning.generic import get_values_for_actions, get_mask_by_eos\n", - "\n", - "\n", - "class llh_trainer:\n", - "\n", - " # variable for correct answers\n", - " input_sequence = T.imatrix(\"input sequence [batch,time]\")\n", - " reference_answers = T.imatrix(\"reference translations [batch, time]\")\n", - "\n", - " # Compute log-probabilities of all possible tokens at each step. Use model interface.\n", - " logprobs_seq = \n", - "\n", - " # compute mean crossentropy\n", - " crossentropy = - get_values_for_actions(logprobs_seq, reference_answers)\n", - "\n", - " mask = get_mask_by_eos(T.eq(reference_answers, out_voc.eos_ix))\n", - "\n", - " loss = T.sum(crossentropy * mask)/T.sum(mask)\n", - "\n", - " # Build weight updates. Use model.weights to get all trainable params.\n", - " updates = \n", - "\n", - " train_step = theano.function(\n", - " [input_sequence, reference_answers], loss, updates=updates)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Actually run training on minibatches" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import random\n", - "\n", - "\n", - "def sample_batch(words, word_to_translation, batch_size):\n", - " \"\"\"\n", - " sample random batch of words and random correct translation for each word\n", - " example usage:\n", - " batch_x,batch_y = sample_batch(train_words, word_to_translations,10)\n", - " \"\"\"\n", - " # choose words\n", - " batch_words = np.random.choice(words, size=batch_size)\n", - "\n", - " # choose translations\n", - " batch_trans_candidates = list(map(word_to_translation.get, batch_words))\n", - " batch_trans = list(map(random.choice, batch_trans_candidates))\n", - "\n", - " return inp_voc.to_matrix(batch_words), out_voc.to_matrix(batch_trans)" - ] - }, - { - "cell_type": "code", - "execution_count": 
null, - "metadata": {}, - "outputs": [], - "source": [ - "bx, by = sample_batch(train_words, word_to_translation, batch_size=3)\n", - "print(\"Source:\")\n", - "print(bx)\n", - "print(\"Target:\")\n", - "print(by)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from IPython.display import clear_output\n", - "from tqdm import tqdm, trange # or use tqdm_notebook,tnrange\n", - "\n", - "loss_history = []\n", - "editdist_history = []\n", - "\n", - "for i in trange(25000):\n", - " loss = llh_trainer.train_step(\n", - " *sample_batch(train_words, word_to_translation, 32))\n", - " loss_history.append(loss)\n", - "\n", - " if (i+1) % REPORT_FREQ == 0:\n", - " clear_output(True)\n", - " current_scores = score(test_words)\n", - " editdist_history.append(current_scores.mean())\n", - " plt.figure(figsize=(12, 4))\n", - " plt.subplot(131)\n", - " plt.title('train loss / traning time')\n", - " plt.plot(loss_history)\n", - " plt.grid()\n", - " plt.subplot(132)\n", - " plt.title('val score distribution')\n", - " plt.hist(current_scores, bins=20)\n", - " plt.subplot(133)\n", - " plt.title('val score / traning time')\n", - " plt.plot(editdist_history)\n", - " plt.grid()\n", - " plt.show()\n", - " print(\"llh=%.3f, mean score=%.3f\" %\n", - " (np.mean(loss_history[-10:]), np.mean(editdist_history[-10:])))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for word in train_words[:10]:\n", - " print(\"%s -> %s\" % (word, translate([word])[0]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "test_scores = []\n", - "for start_i in trange(0, len(test_words), 32):\n", - " batch_words = test_words[start_i:start_i+32]\n", - " batch_trans = translate(batch_words)\n", - " distances = list(map(get_distance, batch_words, batch_trans))\n", - " test_scores.extend(distances)\n", - "\n", - 
"print(\"Supervised test score:\", np.mean(test_scores))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Preparing for reinforcement learning (2 points)\n", - "\n", - "First we need to define loss function as a custom theano operation.\n", - "\n", - "The simple way to do so is\n", - "```\n", - "@theano.compile.as_op(input_types,output_type(s),infer_shape)\n", - "def my_super_function(inputs):\n", - " return outputs\n", - "```\n", - "\n", - "\n", - "\n", - "\n", - "__Your task__ is to implement `_compute_levenshtein` function that takes matrices of words and translations, along with input masks, then converts those to actual words and phonemes and computes min-levenshtein via __get_distance__ function above.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "@theano.compile.as_op([T.imatrix]*2, [T.fvector], lambda _, shapes: [shapes[0][:1]])\n", - "def _compute_levenshtein(words_ix, trans_ix):\n", - " \"\"\"\n", - " A custom theano operation that computes levenshtein loss for predicted trans.\n", - "\n", - " Params:\n", - " - words_ix - a matrix of letter indices, shape=[batch_size,word_length]\n", - " - words_mask - a matrix of zeros/ones, \n", - " 1 means \"word is still not finished\"\n", - " 0 means \"word has already finished and this is padding\"\n", - "\n", - " - trans_mask - a matrix of output letter indices, shape=[batch_size,translation_length]\n", - " - trans_mask - a matrix of zeros/ones, similar to words_mask but for trans_ix\n", - "\n", - "\n", - " Please implement the function and make sure it passes tests from the next cell.\n", - "\n", - " \"\"\"\n", - "\n", - " # convert words to strings\n", - " words = \n", - "\n", - " assert type(words) is list and type(\n", - " words[0]) is str and len(words) == len(words_ix)\n", - "\n", - " # convert translations to lists\n", - " translations = \n", - "\n", - " assert type(distances) in (list, tuple, np.ndarray) 
and len(\n", - " distances) == len(words_ix)\n", - "\n", - " distances = np.array(list(distances), dtype='float32')\n", - " return distances\n", - "\n", - "\n", - "# forbid gradient\n", - "from theano.gradient import disconnected_grad\n", - "\n", - "\n", - "def compute_levenshtein(*args):\n", - " return disconnected_grad(_compute_levenshtein(*[arg.astype('int32') for arg in args]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Simple test suite to make sure your implementation is correct. Hint: if you run into any bugs, feel free to use print from inside _compute_levenshtein." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# test suite\n", - "# sample random batch of (words, correct trans, wrong trans)\n", - "batch_words = np.random.choice(train_words, size=100)\n", - "batch_trans = list(map(random.choice, map(\n", - " word_to_translation.get, batch_words)))\n", - "batch_trans_wrong = np.random.choice(all_translations, size=100)\n", - "\n", - "batch_words_ix = T.constant(inp_voc.to_matrix(batch_words))\n", - "batch_trans_ix = T.constant(out_voc.to_matrix(batch_trans))\n", - "batch_trans_wrong_ix = T.constant(out_voc.to_matrix(batch_trans_wrong))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# assert compute_levenshtein is zero for ideal translations\n", - "correct_answers_score = compute_levenshtein(\n", - " batch_words_ix, batch_trans_ix).eval()\n", - "\n", - "assert np.all(correct_answers_score ==\n", - " 0), \"a perfect translation got nonzero levenshtein score!\"\n", - "\n", - "print(\"Everything seems alright!\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# assert compute_levenshtein matches actual scoring function\n", - "wrong_answers_score = compute_levenshtein(\n", - " batch_words_ix, batch_trans_wrong_ix).eval()\n", - 
"\n", - "true_wrong_answers_score = np.array(\n", - " list(map(get_distance, batch_words, batch_trans_wrong)))\n", - "\n", - "assert np.all(wrong_answers_score ==\n", - " true_wrong_answers_score), \"for some word symbolic levenshtein is different from actual levenshtein distance\"\n", - "\n", - "print(\"Everything seems alright!\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once you got it working...\n", - "\n", - "\n", - "* You may now want to __remove/comment asserts__ from function code for a slight speed-up.\n", - "\n", - "* There's a more detailed tutorial on custom theano ops here: [docs](http://deeplearning.net/software/theano/extending/extending_theano.html), [example](https://gist.github.com/justheuristic/9f4ffef6162a8089c3260fc3bbacbf46)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Self-critical policy gradient (2 points)\n", - "\n", - "In this section you'll implement algorithm called self-critical sequence training (here's an [article](https://arxiv.org/abs/1612.00563)).\n", - "\n", - "The algorithm is a vanilla policy gradient with a special baseline. \n", - "\n", - "$$ \\nabla J = E_{x \\sim p(s)} E_{y \\sim \\pi(y|x)} \\nabla log \\pi(y|x) \\cdot (R(x,y) - b(x)) $$\n", - "\n", - "Here reward R(x,y) is a __negative levenshtein distance__ (since we minimize it). 
The baseline __b(x)__ represents how well model fares on word __x__.\n", - "\n", - "In practice, this means that we compute baseline as a score of greedy translation, $b(x) = R(x,y_{greedy}(x)) $.\n", - "![img](https://github.com/yandexdataschool/Practical_RL/raw/master/yet_another_week/_resource/scheme.png)\n", - "\n", - "Luckily, we already obtained the required outputs: `model.greedy_translations, model.greedy_mask` and we only need to compute levenshtein using `compute_levenshtein` function.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "class trainer:\n", - "\n", - " input_sequence = T.imatrix(\"input tokens [batch,time]\")\n", - "\n", - " # use model to __sample__ symbolic translations given input_sequence\n", - " sample_translations, sample_logp = \n", - " auto_updates = model.auto_updates\n", - " # use model to __greedy__ symbolic translations given input_sequence\n", - " greedy_translations, greedy_logp = \n", - " greedy_auto_updates = model.auto_updates\n", - "\n", - " # Note: you can use model.symbolic_translate(...,unroll_scan=True,max_len=MAX_OUTPUT_LENGTH)\n", - " # to run much faster at a cost of longer compilation\n", - "\n", - " rewards = - compute_levenshtein(input_sequence, sample_translations)\n", - "\n", - " baseline = \n", - "\n", - " # compute advantage using rewards and baseline\n", - " advantage = \n", - "\n", - " # compute log_pi(a_t|s_t), shape = [batch, seq_length]\n", - " logprobs_phoneme = get_values_for_actions(sample_logp, sample_translations)\n", - "\n", - " # policy gradient\n", - " J = logprobs_phoneme*advantage[:, None]\n", - "\n", - " mask = get_mask_by_eos(T.eq(sample_translations, out_voc.eos_ix))\n", - " loss = - T.sum(J*mask) / T.sum(mask)\n", - "\n", - " # regularize with negative entropy. 
Don't forget the sign!\n", - " # note: for entropy you need probabilities for all tokens (sample_logp), not just phoneme_logprobs\n", - " entropy = \n", - "\n", - " assert entropy.ndim == 2, \"please make sure elementwise entropy is of shape [batch,time]\"\n", - "\n", - " loss -= 0.01*T.sum(entropy*mask) / T.sum(mask)\n", - "\n", - " # compute weight updates, clip by norm\n", - " grads = T.grad(loss, model.weights)\n", - " grads = lasagne.updates.total_norm_constraint(grads, 50)\n", - "\n", - " updates = lasagne.updates.adam(grads, model.weights, learning_rate=1e-5)\n", - "\n", - " train_step = theano.function([input_sequence], loss,\n", - " updates=auto_updates+greedy_auto_updates+updates)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Policy gradient training\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for i in trange(100000):\n", - " loss_history.append(\n", - " trainer.train_step(sample_batch(\n", - " train_words, word_to_translation, 32)[0])\n", - " )\n", - "\n", - " if (i+1) % REPORT_FREQ == 0:\n", - " clear_output(True)\n", - " current_scores = score(test_words)\n", - " editdist_history.append(current_scores.mean())\n", - " plt.figure(figsize=(8, 4))\n", - " plt.subplot(121)\n", - " plt.title('val score distribution')\n", - " plt.hist(current_scores, bins=20)\n", - " plt.subplot(122)\n", - " plt.title('val score / traning time')\n", - " plt.plot(editdist_history)\n", - " plt.grid()\n", - " plt.show()\n", - " print(\"J=%.3f, mean score=%.3f\" %\n", - " (np.mean(loss_history[-10:]), np.mean(editdist_history[-10:])))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model.translate(\"EXAMPLE;\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ 
- "for word in train_words[:10]:\n", - " print(\"%s -> %s\" % (word, translate([word])[0]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "test_scores = []\n", - "for start_i in trange(0, len(test_words), 32):\n", - " batch_words = test_words[start_i:start_i+32]\n", - " batch_trans = translate(batch_words)\n", - " distances = list(map(get_distance, batch_words, batch_trans))\n", - " test_scores.extend(distances)\n", - "print(\"Supervised test score:\", np.mean(test_scores))\n", - "\n", - "# ^^ If you get Out Of Memory, please replace this with batched computation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 6: Make it actually work (5++ pts)\n", - "\n", - "\n", - "\n", - "In this section we want you to finally __restart with EASY_MODE=False__ and experiment to find a good model/curriculum for that task.\n", - "\n", - "We recommend you to start with the following architecture\n", - "\n", - "```\n", - "encoder---decoder\n", - "\n", - " P(y|h)\n", - " ^\n", - " LSTM -> LSTM\n", - " ^ ^\n", - " biLSTM -> LSTM\n", - " ^ ^\n", - "input y_prev\n", - "```\n", - "\n", - "__Note:__ you can fit all 4 state tensors of both LSTMs into a in a single state - just assume that it contains, for example, [h0, c0, h1, c1] - pack it in encode and update in decode.\n", - "\n", - "\n", - "Here are some cool ideas on what you can do then.\n", - "\n", - "__General tips & tricks:__\n", - "* In some tensorflow versions and for some layers, it is required that each rnn/gru/lstm cell gets it's own `tf.variable_scope(unique_name, reuse=False)`.\n", - " * Otherwise it will complain about wrong tensor sizes because it tries to reuse weights from one rnn to the other.\n", - "* You will likely need to adjust pre-training time for such a network.\n", - "* Supervised pre-training may benefit from clipping gradients somehow.\n", - "* SCST may indulge a higher learning rate in some cases and 
changing entropy regularizer over time.\n", - "* It's often useful to save pre-trained model parameters to not re-train it every time you want new policy gradient parameters. \n", - "* When leaving training for nighttime, try setting REPORT_FREQ to a larger value (e.g. 500) not to waste time on it.\n", - "\n", - "__Formal criteria:__\n", - "To get 5 points we want you to build an architecture that:\n", - "* _doesn't consist of single GRU_\n", - "* _works better_ than single GRU baseline. \n", - "* We also want you to provide either learning curve or trained model, preferably both\n", - "* ... and write a brief report or experiment log describing what you did and how it fared.\n", - "\n", - "### Attention\n", - "There's more than one way to connect decoder to encoder\n", - " * __Vanilla:__ layer_i of encoder last state goes to layer_i of decoder initial state\n", - " * __Every tick:__ feed encoder last state _on every iteration_ of decoder.\n", - " * __Attention:__ allow decoder to \"peek\" at one (or several) positions of encoded sequence on every tick.\n", - " \n", - "The most effective (and cool) of those is, of course, attention.\n", - "You can read more about attention [in this nice blog post](https://distill.pub/2016/augmented-rnns/). The easiest way to begin is to use \"soft\" attention with \"additive\" or \"dot-product\" intermediate layers.\n", - "\n", - "__Tips__\n", - "* Model usually generalizes better if you no longer allow decoder to see final encoder state\n", - "* Once your model made it through several epochs, it is a good idea to visualize attention maps to understand what your model has actually learned\n", - "\n", - "* There's more stuff [here](https://github.com/yandexdataschool/Practical_RL/blob/master/week8_scst/bonus.ipynb)\n", - "* If you opted for hard attention, we recommend [gumbel-softmax](https://blog.evjang.com/2016/11/tutorial-categorical-variational.html) instead of sampling. 
Also please make sure soft attention works fine before you switch to hard.\n", - "\n", - "### UREX\n", - "* This is a way to improve exploration in policy-based settings. The main idea is that you find and upweight under-appreciated actions.\n", - "* Here's [video](https://www.youtube.com/watch?v=fZNyHoXgV7M&feature=youtu.be&t=3444)\n", - " and an [article](https://arxiv.org/abs/1611.09321).\n", - "* You may want to reduce batch size 'cuz UREX requires you to sample multiple times per source sentence.\n", - "* Once you got it working, try using experience replay with importance sampling instead of (in addition to) basic UREX.\n", - "\n", - "### Some additional ideas:\n", - "* (advanced deep learning) It may be a good idea to first train on small phrases and then adapt to larger ones (a.k.a. training curriculum).\n", - "* (advanced nlp) You may want to switch from raw utf8 to something like unicode or even syllables to make task easier.\n", - "* (advanced nlp) Since hebrew words are written __with vowels omitted__, you may want to use a small Hebrew vowel markup dataset at `he-pron-wiktionary.txt`.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "assert not EASY_MODE, \"make sure you set EASY_MODE = False at the top of the notebook.\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`[your report/log here or anywhere you please]`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__Contributions:__ This notebook is brought to you by\n", - "* Yandex [MT team](https://tech.yandex.com/translate/)\n", - "* Denis Mazur ([DeniskaMazur](https://github.com/DeniskaMazur)), Oleg Vasilev ([Omrigan](https://github.com/Omrigan/)), Dmitry Emelyanenko ([TixFeniks](https://github.com/tixfeniks)) and Fedor Ratnikov ([justheuristic](https://github.com/justheuristic/))\n", - "* Dataset is parsed from [Wiktionary](https://en.wiktionary.org), which is under 
CC-BY-SA and GFDL licenses.\n" - ] - } - ], - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/week07_seq2seq/practice_torch.ipynb b/week07_seq2seq/practice_torch.ipynb index d0b596764..97bc2ebbe 100644 --- a/week07_seq2seq/practice_torch.ipynb +++ b/week07_seq2seq/practice_torch.ipynb @@ -6,28 +6,28 @@ "source": [ "## Reinforcement Learning for seq2seq\n", "\n", - "This time we'll solve a problem of transribing hebrew words in english, also known as g2p (grapheme2phoneme)\n", + "This time we'll solve a problem of transcribing Hebrew words in English, also known as g2p (grapheme2phoneme)\n", "\n", " * word (sequence of letters in source language) -> translation (sequence of letters in target language)\n", "\n", - "Unlike what most deep learning practicioners do, we won't only train it to maximize likelihood of correct translation, but also employ reinforcement learning to actually teach it to translate with as few errors as possible.\n", + "Unlike what most deep learning practitioners do, we won't only train it to maximize likelihood of correct translation, but also employ reinforcement learning to actually teach it to translate with as few errors as possible.\n", "\n", "\n", "### About the task\n", "\n", - "One notable property of Hebrew is that it's consonant language. That is, there are no wovels in the written language. One could represent wovels with diacritics above consonants, but you don't expect people to do that in everyay life.\n", + "One notable property of Hebrew is that it's a consonant language. That is, there are no vowels in the written language. 
One could represent vowels with diacritics above consonants, but you don't expect people to do that in everyday life.\n", "\n", - "Therefore, some hebrew characters will correspond to several english letters and others - to none, so we should use encoder-decoder architecture to figure that out.\n", + "Therefore, some Hebrew characters will correspond to several English letters and others - to none, so we should use encoder-decoder architecture to figure that out.\n", "\n", "![img](https://esciencegroup.files.wordpress.com/2016/03/seq2seq.jpg)\n", "_(img: esciencegroup.files.wordpress.com)_\n", "\n", "Encoder-decoder architectures are about converting anything to anything, including\n", " * Machine translation and spoken dialogue systems\n", - " * [Image captioning](http://mscoco.org/dataset/#captions-challenge2015) and [image2latex](https://htmlpreview.github.io/?https://github.com/openai/requests-for-research/blob/master/_requests_for_research/im2latex.html) (convolutional encoder, recurrent decoder)\n", + " * [Image captioning](https://cocodataset.org/#captions-2015) and [image2latex](https://htmlpreview.github.io/?https://github.com/openai/requests-for-research/blob/master/_requests_for_research/im2latex.html) (convolutional encoder, recurrent decoder)\n", " * Generating [images by captions](https://arxiv.org/abs/1511.02793) (recurrent encoder, convolutional decoder)\n", " * Grapheme2phoneme - convert words to transcripts\n", - " \n", + "\n", "We chose simplified __Hebrew->English__ machine translation for words and short phrases (character-level), as it is relatively quick to train even without a gpu cluster." 
] }, @@ -39,9 +39,9 @@ "source": [ "import sys\n", "if 'google.colab' in sys.modules:\n", - " !wget https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/week07_seq2seq/basic_model_torch.py -O basic_model_torch.py\n", - " !wget https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/week07_seq2seq/main_dataset.txt -O main_dataset.txt\n", - " !wget https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/week07_seq2seq/voc.py -O voc.py\n", + " !wget https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week07_seq2seq/basic_model_torch.py -O basic_model_torch.py\n", + " !wget https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week07_seq2seq/main_dataset.txt -O main_dataset.txt\n", + " !wget https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week07_seq2seq/voc.py -O voc.py\n", " !pip3 install torch==1.0.0 nltk editdistance" ] }, @@ -74,10 +74,7 @@ "\n", "This is mostly due to the fact that many words have several correct translations.\n", "\n", - "We have implemented this thing for you so that you can focus on more interesting parts.\n", - "\n", - "\n", - "__Attention python2 users!__ You may want to cast everything to unicode later during homework phase, just make sure you do it _everywhere_." + "We have implemented this thing for you so that you can focus on more interesting parts." ] }, { @@ -150,7 +147,7 @@ "source": [ "### Building vocabularies\n", "\n", - "We now need to build vocabularies that map strings to token ids and vice versa. We're gonna need these fellas when we feed training data into model or convert output matrices into english words." + "We now need to build vocabularies that map strings to token ids and vice versa. We're gonna need these fellas when we feed training data into the model or convert output matrices into English words." 
] }, { @@ -220,12 +217,12 @@ "* Encoder reads words character by character and outputs code vector (usually a function of last RNN state)\n", "* Decoder takes that code vector and produces translations character by character\n", "\n", - "Than it gets fed into a model that follows this simple interface:\n", - "* __`model(inp, out, **flags) -> logp`__ - takes symbolic int32 matrices of hebrew words and their english translations. Computes the log-probabilities of all possible english characters given english prefices and hebrew word.\n", - "* __`model.translate(inp, **flags) -> out, logp`__ - takes symbolic int32 matrix of hebrew words, produces output tokens sampled from the model and output log-probabilities for all possible tokens at each tick.\n", - " * if given flag __`greedy=True`__, takes most likely next token at each iteration. Otherwise samples with next token probabilities predicted by model.\n", + "Then it gets fed into a model that follows this simple interface:\n", + "* __`model(inp, out, **flags) -> logp`__ - takes symbolic int32 matrices of Hebrew words and their English translations. Computes the log-probabilities of all possible English characters given English prefixes and Hebrew word.\n", + "* __`model.translate(inp, **flags) -> out, logp`__ - takes symbolic int32 matrix of Hebrew words, produces output tokens sampled from the model and output log-probabilities for all possible tokens at each tick.\n", + " * if given flag __`greedy=True`__, takes most likely next token at each iteration. Otherwise, samples with next token probabilities predicted by model.\n", "\n", - "That's all! It's as hard as it gets. With those two methods alone you can implement all kinds of prediction and training." + "That's all! It's as hard as it gets. With those two methods alone, you can implement all kinds of prediction and training." 
] }, { @@ -289,7 +286,7 @@ "source": [ "def translate(lines, max_len=MAX_OUTPUT_LENGTH):\n", " \"\"\"\n", - " You are given a list of input lines. \n", + " You are given a list of input lines.\n", " Make your neural network translate them.\n", " :return: a list of output lines\n", " \"\"\"\n", @@ -329,7 +326,7 @@ "### Scoring function\n", "\n", "LogLikelihood is a poor estimator of model performance.\n", - "* If we predict zero probability once, it shouldn't ruin entire model.\n", + "* If we predict zero probability once, it shouldn't ruin the entire model.\n", "* It is enough to learn just one translation if there are several correct ones.\n", "* What matters is how many mistakes model's gonna make when it translates!\n", "\n", @@ -545,9 +542,9 @@ "\n", "* __Train loss__ - that's your model's crossentropy over minibatches. It should go down steadily. Most importantly, it shouldn't be NaN :)\n", "* __Val score distribution__ - distribution of translation edit distance (score) within batch. It should move to the left over time.\n", - "* __Val score / training time__ - it's your current mean edit distance. This plot is much whimsier than loss, but make sure it goes below 8 by 2500 steps. \n", + "* __Val score / training time__ - it's your current mean edit distance. This plot is much whimsier than loss, but make sure it goes below 8 by 2500 steps.\n", "\n", - "If it doesn't, first try to re-create both model and opt. You may have changed it's weight too much while debugging. If that doesn't help, it's debugging time." + "If it doesn't, first try to re-create both model and opt. You may have changed its weight too much while debugging. If that doesn't help, it's debugging time." ] }, { @@ -584,7 +581,7 @@ "\n", "In this section you'll implement algorithm called self-critical sequence training (here's an [article](https://arxiv.org/abs/1612.00563)).\n", "\n", - "The algorithm is a vanilla policy gradient with a special baseline. 
\n", + "The algorithm is a vanilla policy gradient with a special baseline.\n", "\n", "$$ \\nabla J = E_{x \\sim p(s)} E_{y \\sim \\pi(y|x)} \\nabla log \\pi(y|x) \\cdot (R(x,y) - b(x)) $$\n", "\n", @@ -637,7 +634,7 @@ "\n", " # compute log_pi(a_t|s_t), shape = [batch, seq_length]\n", " logp_sample = \n", - " \n", + "\n", " # ^-- hint: look at how crossentropy is implemented in supervised learning loss above\n", " # mind the sign - this one should not be multiplied by -1 :)\n", "\n", @@ -727,11 +724,11 @@ "\n", "\n", " * As usual, don't expect improvements right away, but in general the model should be able to show some positive changes by 5k steps.\n", - " * Entropy is a good indicator of many problems. \n", + " * Entropy is a good indicator of many problems.\n", " * If it reaches zero, you may need greater entropy regularizer.\n", " * If it has rapid changes time to time, you may need gradient clipping.\n", " * If it oscillates up and down in an erratic manner... it's perfectly okay for entropy to do so. But it should decrease at the end.\n", - " \n", + "\n", " * We don't show loss_history cuz it's uninformative for pseudo-losses in policy gradient. However, if something goes wrong you can check it to see if everything isn't a constant zero." 
] }, @@ -775,7 +772,7 @@ "source": [ "## Step 6: Make it actually work (5++ pts)\n", "\n", - "In this section we want you to finally __restart with EASY_MODE=False__ and experiment to find a good model/curriculum for that task.\n", + "In this section, we want you to finally __restart with EASY_MODE=False__ and experiment to find a good model/curriculum for that task.\n", "\n", "We recommend you to start with the following architecture\n", "\n", @@ -791,7 +788,7 @@ "input y_prev\n", "```\n", "\n", - "__Note:__ you can fit all 4 state tensors of both LSTMs into a in a single state - just assume that it contains, for example, [h0, c0, h1, c1] - pack it in encode and update in decode.\n", + "__Note:__ you can fit all 4 state tensors of both LSTMs into a single state - just assume that it contains, for example, [h0, c0, h1, c1] - pack it in encode and update in decode.\n", "\n", "\n", "Here are some cool ideas on what you can do then.\n", @@ -800,13 +797,13 @@ "* You will likely need to adjust pre-training time for such a network.\n", "* Supervised pre-training may benefit from clipping gradients somehow.\n", "* SCST may indulge a higher learning rate in some cases and changing entropy regularizer over time.\n", - "* It's often useful to save pre-trained model parameters to not re-train it every time you want new policy gradient parameters. \n", + "* It's often useful to save pre-trained model parameters to not re-train it every time you want new policy gradient parameters.\n", "* When leaving training for nighttime, try setting REPORT_FREQ to a larger value (e.g. 500) not to waste time on it.\n", "\n", "__Formal criteria:__\n", - "To get 5 points we want you to build an architecture that:\n", + "To get 5 points, we want you to build an architecture that:\n", "* _doesn't consist of single GRU_\n", - "* _works better_ than single GRU baseline. 
\n", + "* _works better_ than single GRU baseline.\n", "* We also want you to provide either learning curve or trained model, preferably both\n", "* ... and write a brief report or experiment log describing what you did and how it fared.\n", "\n", @@ -815,7 +812,7 @@ " * __Vanilla:__ layer_i of encoder last state goes to layer_i of decoder initial state\n", " * __Every tick:__ feed encoder last state _on every iteration_ of decoder.\n", " * __Attention:__ allow decoder to \"peek\" at one (or several) positions of encoded sequence on every tick.\n", - " \n", + "\n", "The most effective (and cool) of those is, of course, attention.\n", "You can read more about attention [in this nice blog post](https://distill.pub/2016/augmented-rnns/). The easiest way to begin is to use \"soft\" attention with \"additive\" or \"dot-product\" intermediate layers.\n", "\n", @@ -823,8 +820,8 @@ "* Model usually generalizes better if you no longer allow decoder to see final encoder state\n", "* Once your model made it through several epochs, it is a good idea to visualize attention maps to understand what your model has actually learned\n", "\n", - "* There's more stuff [here](https://github.com/yandexdataschool/Practical_RL/blob/master/week8_scst/bonus.ipynb)\n", - "* If you opted for hard attention, we recommend [gumbel-softmax](https://blog.evjang.com/2016/11/tutorial-categorical-variational.html) instead of sampling. Also please make sure soft attention works fine before you switch to hard.\n", + "* There's more stuff [here](bonus_pytorch.ipynb)\n", + "* If you opted for hard attention, we recommend [gumbel-softmax](https://blog.evjang.com/2016/11/tutorial-categorical-variational.html) instead of sampling. Also, please make sure soft attention works fine before you switch to hard.\n", "\n", "### UREX\n", "* This is a way to improve exploration in policy-based settings. 
The main idea is that you find and upweight under-appreciated actions.\n", @@ -835,8 +832,8 @@ "\n", "### Some additional ideas:\n", "* (advanced deep learning) It may be a good idea to first train on small phrases and then adapt to larger ones (a.k.a. training curriculum).\n", - "* (advanced nlp) You may want to switch from raw utf8 to something like unicode or even syllables to make task easier.\n", - "* (advanced nlp) Since hebrew words are written __with vowels omitted__, you may want to use a small Hebrew vowel markup dataset at `he-pron-wiktionary.txt`.\n", + "* (advanced nlp) You may want to switch from raw utf8 to something like unicode or even syllables to make the task easier.\n", + "* (advanced nlp) Since Hebrew words are written __with vowels omitted__, you may want to use a small Hebrew vowel markup dataset at `he-pron-wiktionary.txt`.\n", "\n" ] }, diff --git a/week08_pomdp/README.md b/week08_pomdp/README.md index 8154b5b49..6bb589a2f 100644 --- a/week08_pomdp/README.md +++ b/week08_pomdp/README.md @@ -5,6 +5,7 @@ _Links on all articles mentioned during the lecture could be found in "Reference ## Basics * Our [lecture](https://yadi.sk/i/AHzpTjiT3U8L8e) and [seminar](https://yadi.sk/i/Ka-I7nBp3U8LAG) (russian) +* A Lecture on Basics by Pavel Shvechikov (english) [Video](https://www.youtube.com/watch?v=aV4wz7FAXmo) * A lecture on basics by Andrew NG (english, LQ) - [video](https://www.youtube.com/watch?v=yCqPMD6coO8) * A lecture on basics by 5vision (russian) - [video](https://www.youtube.com/watch?v=_dkaynuKUFE) * _[alternative]_ Chalkboard-style 2-part lecture by B. Ravindran. 
- [part1](https://www.youtube.com/watch?v=9G_KevA8DFY), [part2](https://www.youtube.com/watch?v=dMOUp7YzUpQ) diff --git a/week08_pomdp/atari_util.py b/week08_pomdp/atari_util.py index ae19e2ab3..11065cde6 100644 --- a/week08_pomdp/atari_util.py +++ b/week08_pomdp/atari_util.py @@ -1,14 +1,14 @@ import cv2 import numpy as np -from gym.core import Wrapper -from gym.spaces.box import Box +from gymnasium import Wrapper +from gymnasium.spaces import Box class PreprocessAtari(Wrapper): def __init__(self, env, height=42, width=42, color=False, crop=lambda img: img, n_frames=4, dim_order='pytorch', reward_scale=1): """A gym wrapper that reshapes, crops and scales image into the desired shapes""" - super(PreprocessAtari, self).__init__(env) + super().__init__(env) self.img_size = (height, width) self.crop = crop self.color = color @@ -18,7 +18,6 @@ def __init__(self, env, height=42, width=42, color=False, n_channels = (3 * n_frames) if color else n_frames obs_shape = { - 'theano': (n_channels, height, width), 'pytorch': (n_channels, height, width), 'tensorflow': (height, width, n_channels), }[dim_order] @@ -26,18 +25,19 @@ def __init__(self, env, height=42, width=42, color=False, self.observation_space = Box(0.0, 1.0, obs_shape) self.framebuffer = np.zeros(obs_shape, 'float32') - def reset(self): + def reset(self, **kwargs): """Resets the game, returns initial frames""" self.framebuffer = np.zeros_like(self.framebuffer) - self.update_buffer(self.env.reset()) - return self.framebuffer + state, info = self.env.reset(**kwargs) + self.update_buffer(state) + return self.framebuffer, info def step(self, action): """Plays the game for 1 step, returns frame buffer""" - new_img, r, done, info = self.env.step(action) + new_img, r, terminated, truncated, info = self.env.step(action) self.update_buffer(new_img) - return self.framebuffer, r * self.reward_scale, done, info + return self.framebuffer, r * self.reward_scale, terminated, truncated, info ### image processing ### diff --git 
a/week08_pomdp/env_pool.py b/week08_pomdp/env_pool.py index 2eda898b7..709dca407 100644 --- a/week08_pomdp/env_pool.py +++ b/week08_pomdp/env_pool.py @@ -1,5 +1,5 @@ """ -A thin wrapper for OpenAI gym environments that maintains a set of parallel games and has a method to generate +A thin wrapper for Farama gymnasium environments that maintains a set of parallel games and has a method to generate interaction sessions given agent one-step applier function. """ @@ -15,7 +15,7 @@ def __init__(self, agent, make_env, n_parallel_games=1): and is capable of some auxilary actions like evaluating agent on one game session (See .evaluate()). :param agent: Agent which interacts with the environment. - :param make_env: Factory that produces environments OR a name of the gym environment. + :param make_env: Factory that produces environments OR a name of the gymnasium environment. :param n_games: Number of parallel games. One game by default. :param max_size: Max pool size by default (if appending sessions). By default, pool is not constrained in size. """ @@ -25,17 +25,17 @@ def __init__(self, agent, make_env, n_parallel_games=1): self.envs = [self.make_env() for _ in range(n_parallel_games)] # Initial observations. - self.prev_observations = [env.reset() for env in self.envs] + self.prev_observations = [env.reset()[0] for env in self.envs] # Agent memory variables (if you use recurrent networks). self.prev_memory_states = agent.get_initial_state(n_parallel_games) - # Whether particular session has just been terminated and needs + # Whether particular session has just been terminated or truncated and needs # restarting. self.just_ended = [False] * len(self.envs) def interact(self, n_steps=100, verbose=False): - """Generate interaction sessions with ataries (OpenAI gym Atari environments) + """Generate interaction sessions with ataries (Farama gymnasium Atari environments) Sessions will have length n_steps. 
Each time one of games is finished, it is immediately getting reset and this time is recorded in is_alive_log (See returned values). @@ -46,9 +46,9 @@ def interact(self, n_steps=100, verbose=False): def env_step(i, action): if not self.just_ended[i]: - new_observation, cur_reward, is_done, info = \ + new_observation, cur_reward, terminated, truncated, info = \ self.envs[i].step(action) - if is_done: + if terminated or truncated: # Game ends now, will finalize on next tick. self.just_ended[i] = True @@ -58,7 +58,7 @@ def env_step(i, action): else: # Reset environment, get new observation to be used on next # tick. - new_observation = self.envs[i].reset() + new_observation = self.envs[i].reset()[0] # Reset memory for new episode. initial_memory_state = self.agent.get_initial_state( diff --git a/week08_pomdp/homework_common_part2.ipynb b/week08_pomdp/homework_common_part2.ipynb deleted file mode 100644 index ec4766342..000000000 --- a/week08_pomdp/homework_common_part2.ipynb +++ /dev/null @@ -1,307 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### This is a common homework assignment for both frameworks\n", - "\n", - "This week's assignment appears to be unusually grandeur, so please read submission/grading guidelines before you upload it for review.\n", - "\n", - "__Submisson__: To ease mutual pain, please submit\n", - "- Some kind of readable report with links to your evaluations, gym uploads, investigation results, etc.\n", - "- Explicitly state that you took on a bonus task and where to find it [to make sure it is found and graded].\n", - "\n", - "__Grading__: The main purpose (and source of points) for this notebook is your investigation, not squeezing out average rewards from random environments. 
\n", - "\n", - "Getting near/above state of the art performance on one particular game will earn you some bonus points, but you can get much more by digging deeper into what makes the algorithms tick and how they compare to one another.\n", - "\n", - "Okay, now brace yourselves, here comes an assignment!\n", - "\n", - "#### 7.2 Deep kung-fu (3 pts)\n", - "\n", - "Implement and train recurrent actor-critic on `KungFuMaster-v0` env in the second notebook. Try to get a score of >=20k.\n", - "\n", - "__[bonus points]__ +1 point per each +5k average reward over 20k baseline (25k = +1, 30k = +2, ...)\n", - "\n", - "\n", - "#### 7.3 Comparing what we know (7+ pts)\n", - "\n", - "_Please read this assignment carefully._\n", - "\n", - "Choose a partially-observable environment for experimentation out of [atari](https://gym.openai.com/envs#atari), [doom](https://gym.openai.com/envs#doom) or [pygame](https://gym.openai.com/envs#pygame) catalogue (if you really want to try some other pomdp, feel free to proceed at your own risk).\n", - "\n", - "Not all Atari environements are bug free and these minor bugs can hurt learning performance. \n", - "We recommend to pick one of those:\n", - "* [Assault-v0](https://gym.openai.com/envs/Assault-v0) \n", - "* [DoomDefendCenter-v0](https://gym.openai.com/envs/DoomDefendCenter-v0) (use env code from [this](https://github.com/yandexdataschool/Practical_RL/blob/master/week4/Seminar4.2_conv_agent.ipynb) notebook)\n", - "* [RoadRunner-v0](https://gym.openai.com/envs/RoadRunner-v0)\n", - "\n", - "Unless you have aesthetical preference, we would appreciate if you chose env out of recommended ones by `random.choice`.\n", - "\n", - "__Your task__ is to implement DRQN and A3C (seminar code may be reused) and apply them __both__ to the environement of your choice. 
Then compare them on the chosen game (convergence / sample efficiency / final performance).\n", - "\n", - "\n", - "* It's probably a good idea to compare a3c vs q-learning with similar network complexity. \n", - "* Also remember that you can only use large experience replay for 1-step q-learning\n", - "\n", - "\n", - "__Tips__:\n", - "Your new environment may require some tuning before it gives up to your agent:\n", - "\n", - "\n", - "* Different preprocessing. Mostly cropping.\n", - " * In some cases, even larger screen size or colorization. \n", - " * View resulting image to figure that out.\n", - "\n", - "\n", - "* Reward scaling. \n", - " * Kung-fu used `rewards=replay.rewards/100.` because you got +100 per success.\n", - " * Just avoid training on raw +100 rewards or it's gonna blow up mean squared error.\n", - "\n", - "\n", - "* Deterministic/FrameSkip\n", - " * For doom/pygame/custom, use frameskip to speed up learning\n", - " * ```from gym.wrappers import SkipWrapper```\n", - " * ```env = SkipWrapper(how_many_frames_to_skip)(your_env)``` in your make_env\n", - " \n", - " * For Atari only, consider __training__ on deterministic version of environment\n", - " * Works by appending Deterministic to env name: `AssaultDeterministic-v0`, `KungFuMasterDeterministic-v0`\n", - " * Expect faster training due to less variance.\n", - " * You still need to __switch back to normal env for evaluation__ (there's no leaderbord for deterministic envs)\n", - "\n", - "* Knowledge transfer\n", - " * If you want to switch network mid-game, you are recommended to use some pre-trained layers\n", - " * At minimum, save convolutional weights and re-use them in every new architecture using fine-tuning\n", - " * At it's darkest, [soft-targets](http://www.kdnuggets.com/2015/05/dark-knowledge-neural-network.html), [policy distillation](https://arxiv.org/pdf/1511.06295.pdf), [net2net](https://arxiv.org/abs/1511.05641) or similar __[bonus points]__.\n", - "\n", - "\n", - "\n", - "#### For 
the curious\n", - "- __[4+ bonus points]__ Implement attentive model for DQRN/A3C (see lecture slides for implementation details). How does it compare to the vanilla architecture? \n", - "* __[2+ bonus points]__ If you have any q-learning modiffications from week5 (double q-l, prioritized replay, etc.), they are most welcome here!\n", - "* __[2+ bonus points]__ How different memory amounts and types (LSTM / GRU / RNN / combo / custom) affects DRQN / A2C performance? Try to find optimal configuration.\n", - "- __[2+ bonus points]__ No one said l2 loss is perfect. Implement Huber or MAE loss for DRQN and/or A2C critic and compare it's performance on the game of your choice (pass proper `loss_function` to `get_elementwise_objective()`) .\n", - "- __[1++ bonus points]__ Does it help to add recurrent units when in MDP scenario, e.g fully observable \"CartPole-v0\"? How about if you only give it access to position observations? Only speed observations? Try that out!\n", - "- __[4+ bonus points]__ See the very end of this notebook. Some of the games (right side) benefit a lot from additional LSTM memory. But others (left side) do not. That is interesting. Pick up one or several games from the left side and try to figure out why A2C performance decreases when adding LSTM to feadforward architecture?\n", - "\n", - "#### Bonus: Neural Maps (a LOT of points if successful)\n", - "\n", - "Pick up either [DoomMyWayHome-v0](https://gym.openai.com/envs/DoomMyWayHome-v0) or [RaycastMaze-v0](https://gym.openai.com/envs/RaycastMaze-v0) and apply Neural Map to it. Main details of Neural Map are given in lecture slides and you could also benefit from reading [Neural Map article](https://arxiv.org/abs/1702.08360). \n", - "\n", - "[hse/ysda] Feel free to ask Pavel Shvechikov / Fedor Ratnikov any questions, guidance and clarifications on the topic.\n", - "\n", - "This block is highly experimental and may be connected with some additional difficulties compared to main track. 
With some brief description of you work you could get additional points\n", - "\n", - "_Scoring points are not pre-determined for this task because we're uncertain of implementation complexity._\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "__You can use the following template for DRQN implementation or throw it away entirely__" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "N_ACTIONS = env.action_space.n\n", - "OBS_SHAPE = env.observation_space.shape\n", - "OBS_CHANNELS, OBS_HEIGHT, OBS_WIDTH = OBS_SHAPE\n", - "\n", - "N_SIMULTANEOUS_GAMES = 2 # this is also known as number of agents in exp_replay_pool\n", - "MAX_POOL_SIZE = 1000\n", - "REPLAY_SIZE = 100\n", - "SEQ_LENGTH = 15\n", - "\n", - "N_POOL_UPDATES = 1\n", - "EVAL_EVERY_N_ITER = 10\n", - "N_EVAL_GAMES = 1\n", - "\n", - "N_FRAMES_IN_BUFFER = 4 # number of consequent frames to feed in CNN" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "observation_layer = InputLayer((None,) + OBS_SHAPE)\n", - "prev_wnd = InputLayer((None, N_FRAMES_IN_BUFFER) + OBS_SHAPE)\n", - "new_wnd = WindowAugmentation(observation_layer, prev_wnd)\n", - "wnd_reshape = reshape(\n", - " new_wnd, [-1, N_FRAMES_IN_BUFFER * OBS_CHANNELS, OBS_HEIGHT, OBS_WIDTH])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "conv1 = Conv2DLayer(wnd_reshape, num_filters=32, filter_size=(8, 8), stride=4)\n", - "conv2 = Conv2DLayer(conv1, num_filters=64, filter_size=(4, 4), stride=2)\n", - "conv3 = Conv2DLayer(conv2, num_filters=64, filter_size=(3, 3), stride=1)\n", - "dense1 = DenseLayer(conv3, num_units=512)\n", - "qvalues_layer = DenseLayer(dense1, num_units=N_ACTIONS, nonlinearity=None)\n", - "action_layer = EpsilonGreedyResolver(qvalues_layer)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": {}, - "outputs": [], - "source": [ - "targetnet = TargetNetwork(qvalues_layer)\n", - "qvalues_old_layer = targetnet.output_layers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "agent = Agent(observation_layers=observation_layer,\n", - " policy_estimators=(qvalues_layer, qvalues_old_layer),\n", - " action_layers=action_layer,\n", - " agent_states={new_wnd: prev_wnd})\n", - "pool = EnvPool(agent, make_env=make_env,\n", - " n_games=N_SIMULTANEOUS_GAMES, max_size=MAX_POOL_SIZE)\n", - "replay = pool.experience_replay.sample_session_batch(REPLAY_SIZE)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# .get_sessions() returns env_states, observations, agent_states, actions, policy_estimators\n", - "(qvalues_seq, old_qvalues_seq) = agent.get_sessions(\n", - " replay, session_length=SEQ_LENGTH, experience_replay=True)[-1]\n", - "elwise_mse_loss = qlearning.get_elementwise_objective(\n", - " qvalues_seq,\n", - " replay.actions[0],\n", - " replay.rewards,\n", - " replay.is_alive,\n", - " qvalues_target=old_qvalues_seq,\n", - " gamma_or_gammas=0.999,\n", - " n_steps=1\n", - ")\n", - "loss = elwise_mse_loss.sum() / replay.is_alive.sum()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "weights = lasagne.layers.get_all_params(action_layer, trainable=True)\n", - "updates = lasagne.updates.adam(loss, weights, learning_rate=1e-4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_step = theano.function([], loss, updates=updates)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "***\n", - "***\n", - "***" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### A3C feadforward vs A3C LSTM on Atari games" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "a3c_ff = [518.4, 263.9, 5474.9, 22140.5, 4474.5, 911091.0, 970.1, 12950.0, 22707.9, 817.9, 35.1, 59.8, 681.9, 3755.8, 7021.0, 112646.0, 56533.0, 113308.4, -0.1, -82.5, 18.8, 0.1, 190.5, 10022.8, 303.5, 32464.1, -2.8, 541.0, 94.0,\n", - " 5560.0, 28819.0, 67.0, 653.7, 10476.1, 52894.1, -78.5, 5.6, 206.9, 15148.8, 12201.8, 34216.0, 32.8, 2355.4, -10911.1, 1956.0, 15730.5, 138218.0, -9.7, -6.3, 12679.0, 156.3, 74705.7, 23.0, 331628.1, 17244.0, 7157.5, 24622.0]\n", - "a3c_lstm = [945.3, 173.0, 14497.9, 17244.5, 5093.1, 875822.0, 932.8, 20760.0, 24622.2, 862.2, 41.8, 37.3, 766.8, 1997.0, 10150.0, 138518.0, 233021.5, 115201.9, 0.1, -82.5, 22.6, 0.1, 197.6, 17106.8, 320.0, 28889.5, -1.7, 613.0,\n", - " 125.0, 5911.4, 40835.0, 41.0, 850.7, 12093.7, 74786.7, -135.7, 10.7, 421.1, 21307.5, 6591.9, 73949.0, 2.6, 1326.1, -14863.8, 1936.4, 23846.0, 164766.0, -8.3, -6.4, 27202.0, 144.2, 105728.7, 25.0, 470310.5, 18082.0, 5615.5, 23519.0]\n", - "game_names = \"Alien Amidar Assault Asterix Asteroids Atlantis Bank Battle Beam Berzerk Bowling Boxing Breakout Centipede Chopper Crazy Defender Demon Double Enduro Fishing Freeway Frostbite Gopher Gravitar H.E.R.O. Ice James Kangaroo Krull Kung-Fu Montezuma's Ms. 
Name Phoenix Pit Pong Private Q*Bert River Road Robotank Seaquest Skiing Solaris Space Star Surround Tennis Time Tutankham Up Venture Video Wizard Yars Zaxxon\".split(\n", - " \" \")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "score_difference = np.array(a3c_lstm) - np.array(a3c_ff)\n", - "idxs = np.argsort(score_difference)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAABNgAAAJcCAYAAAArRxmIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAAPYQAAD2EBqD+naQAAIABJREFUeJzsnXeYXGX5hu+X0ExQxFASqhCqSEtCifQiTXapEppIUDrS\nDAgiZIOKoCIa+FEiIKIQDCghKBIEQRKUtgGkRVpIKAESCC2bkPb+/vjOkNnZSXZn9ux+s/s993Wd\na2e+OfOc5zlnZnbn3a+YuyOEEEIIIYQQQgghhKiOpWIbEEIIIYQQQgghhBCiK6MCmxBCCCGEEEII\nIYQQ7UAFNiGEEEIIIYQQQggh2oEKbEIIIYQQQgghhBBCtAMV2IQQQgghhBBCCCGEaAcqsAkhhBBC\nCCGEEEII0Q5UYBNCCCGEEEIIIYQQoh2owCaEEEIIIYQQQgghRDtQgU0IIYQQQgghhBBCiHagApsQ\nQgjRDsxsoZldGNtHezGzb5nZC2Y218zej+2nNczsNTO7IbaP7o6ZNZjZwkr2NbMvdbSv9mBm62Q+\nj+4g/RvNbHIF+37cET6EEEII0bmowCaEEKJdmNl6Znatmb1iZrPN7EMzm2Bmp5nZ8rH9dQKebV0W\nM9sI+B3wEvBd4PgcNAeZ2TAz+0J7tRbDQrr4ee8iOOFcf4aZnWdm+y9mX12TknNmZp/L3gs7LWbf\nKOes0vdoW4uBZraZmd2eFcFnm9kbZnavmZ2aPT4sK3C2tv2z6LgLzewDM1uuzPHWL3rOWZWeByGE\nECIvlo5tQAghRNfFzL4BjAbmADcBzwLLAjsAPwe+ApwYzWDn8DlgfmwT7WQXwIDT3b1NPW/awNeA\nCwmFu49y0ixmI0oKP6JD+DHws5K2HwK3AXd2vp0uwXdp/k/snsAwQiHtoSiOylPpe7TVYqCZfQ34\nJzAFGAm8DawFbAecBlwJ/JlQzC+wAnAN8JdsK/BO0XHnE85jHXB7yWGPJPwOalF8E0IIIToTFdiE\nEEJUhZl9GRgFTAZ2c/d3ix6+2swuAL4RwVqHY2YGLOvun7r73Nh+cmC17GeehTDLUWuRqNny7j7H\n3ed1hL5ojrsvBLrDa7zTcPcFwIKipg55L+RAR/g6H/gAGOjuzXq7mdnKAO7+LOGfMYX23oQC23/d\n/ZbF6M4BHgYOp2WB7Qjgr8DBeQToDmQ9/ea6u3qUCiFEJ6IhokIIIarlB0Av4DslxTUA3P1Vd7+i\ncN/MepjZBWb2spnNMbPJZvZTM1u2+HnZsKKxZrazmT1uZk1m9l8z2zl7/KDs/mwze8LMtix5/o1m\n9rGZrWtm48zsEz
N7Myv4UbLvUDN72MxmZMd5wsxafEnLhh6NMLMjzOxZwpe9vYoeu7Bo3xXM7NdZ\nvjlm9k42PKrU5zez4zWZ2XQz+4OZrb6YLKub2Zjs9rtm9ousyNcqZnaymT2beXnTzK40sxWLHp8M\nNGR3p5fmKaO3mZn9rmhI8DQzu96K5t0ys2GEHowAr2WaC8xs7ezxIWZ2f3Zu5pjZc2bWoqdj0Wth\nz+y1MJts+Kq1cQ42M/tSdm4/NLOZmffNrWQOrrbkyvYrzDO2gZn90cKwtXfN7KLs8bWya/VhptFi\nyJqZLWtmw83spSz/VDO7tMx74etmNj7z/bGZTTKzn5bss5aFIb6tnYfpZvbLovuWeZ9nRUMEzewH\nWVvP4rxFjy8k9CQ6xhYNyyu9Ditlr92Z2TFusDYMFzezHcxstJlNKTovvyp9biXvCzNbMdv/g8L1\nB77YBi8rmtl8y4Y1Zm29s7zTS/a92szeKvE3Obu9DvAuoRdW4bXT4j3Wxiw9zeyy7LzMyV4P3y/Z\nZ7HzyxUf11p5j7aD9YDnSotrAO4+o53atwD7lrxetwbWzx5r9TOx6PycZeGz8RUzm2Xhd8Ua2T4X\nmNnrFj6bx5hZi9eLme1jZg9Z+P3ykZn91cy+UrJP4XW6Vvb4xxaGy56cPb6Zhc/BTyx8nh1e5jjr\nmtltZvZe5vM/ZrZvyT47Z5kGm9lPzOwNYBawZdZ+ehndrxWe09o5E0II0XZUYBNCCFEt+wGvuvuj\nbdz/emA48ARwBvAgcB6hF1wxDmwA3AyMBc4FVgLGmtkRwGWE4agXAv2AP5V5/lLAPcA04OzsmMPN\nrKFk39OAicAFmZd5wGgz26eM/92BXwG3AqcDry0m57XACYQhdCcBvwCagE0KO5jZMZnveVm+kcBB\nwHhrPh9SIcs4YDrwfcJ5O4s2zJOW5b0SeCN7zu2Zt3Fm1iPb7XTgjuz2CcBRNB+mVcrXgXWBG4BT\nCdfvMOBvRfv8mUXX9fRM81tZBgjDhl8Dfpr5mgpcZWYnlRzLgY0JX57vJVyvp4oeWyJZgeKvwGDC\nMLgfAn2B35d5fltyFR+38Lr7AfAIcL6ZnZH5fAM4hzAM7hdmtkOJp7uy3Hdmx7oDOJPw2irs95Vs\nv2UIr8/C/l8r8fMH4IXWzgWh90/xHGCbA4XX2vZF7TsAE929qShv8bk6itCj7aHs9lGE1/xn1gnD\nxnsRXtt/Ar5NGCLZGt8kDLm+inBe7gG+R7hexVTyvhhLGEJ4E6F31ZqUv/7ND+D+IaGXVfE524Ew\nLPlLZrZJSfv4En8F/emE17sR3leFc1b8Hlu6jVnuIryf7ia8XiYRXl+XLSnLYvgLS36PVssUYICZ\nbdpOnXL8hXBeDypqO4JwHp6sUOsowufzCOCXwM7AbWb2E2BP4BLC67oue/wzzOxbhM+Vjwnv84sI\nn+/jSwqUhdfp3wnn5WxCj+8rzOzbWfvjmcZHwO+zgmzhOKsC/yF8Nl1J+PxajvC7sNwciBcA+xB+\n55yXnZeHCa//Uo7Mjqlh3kIIkSfurk2bNm3atFW0AZ8nfNH8Sxv33zzb/5qS9p8ThlLtXNQ2OWvb\npqjt69nzPwHWKGo/Ltt3p6K232Vtl5cc6y5gNvClorblSvbpAfwX+EdJ+0JCMWyjMtkWAhcW3Z8J\njFjCuViaMC/RU4RhpoX2fTOtYWWy/LBEoxF4rJVzvjKhp93dJe0nZ5rfLmoblrV9aUma5c5Z1jY4\ne/72RW3fz9rWbqPG34GXStoKr4U9yuw/GbihFa8HZef01JL2+zLdo6vINSzTvKqobSlCkXA+MLSo\nfUVCT5IbitqOyl5Lg0qOdXx2rO2y+6dn91dqJeMDwPw2XLfvEwpjvbL7pwKvEr7AX5y1GfA+8MvS\n10aJ1sflzn3RuRlZ0v5n4N0qX1s/yM7rmpW+L4D9Mz9nFbUZ8K/S678YP1cAbxXd
/2V2vqcBx2dt\nK2Vap5b4e7Xofm9KPifakeXckv1GZ+dn3ez+Otl+LbKVemAJ79HFnI/fAR+1ss8e2etsHqG4cwnh\n83vpJTxnseen9LhZ3nuLruVbhMJpIfdZrfgr7Pc2sEJR+0+z9onAUkXtNxN+byyT3e+VvUeuLtFd\nhfDZf02J7wXAOUVthc+E+cAhRe0blrk+l2fPH1TU1gt4BXilqG3n7LkvUfQ7JXus8Dtyw6K2pQm9\nKq9vy3XXpk2bNm1t39SDTQghRDUUer60uqJcxr6E/+ZfXtJ+GeFLUulcbc+7+2NF9wu95O539zdL\n2o0wLKmU/yu5fyVhAYY9Cg3u/mnhdjYMaCVCT5T+ZfQedPf/lWkv5QNgWzPru5jHBwKrEgo0n81t\n5e53E3oclJu37tqS++Mpn7mYPQi9n35d0v5bwnWran68knO2nIX5kwrXodx5a03jC5nGQ8B6Zvb5\nkt0nu/t91XglDOOdC1xX0v5/lAwnqzCXE3pkFp67kNBL0gg94ArtHwL/o/m1OoTQ4+zFbMhh7+xY\nD2TP3zXb74Ps54GlQwVLfO/q7m2ZU3c84Yt1oQfcjlnb+Ow2wGaE4ZPjWzy77TjlX6+9zWyFJT6x\n+TXomZ2X/xAKmFuVeUpr74t9CIWea4qO4YTCWVuGWI8HVjOzDbL7OxJep8XnbMeifdtDW7LMJ3gv\n5jLC+SnX67bTyd6rgwg9ozYn9NoaB7xpZnU5HOIWYJesd9fuhPkjFzdv25IY7e6fFN0v/I75Q/Z+\nLm5fFlgju78noUh2a8n717N9d6UlxZ8Vhc+EWe5+e1H7i4T3fOk1f8zd/1O03yxCj+cvlw5JBW70\nlnOCjgY+pXkvtr0JRc0/lvEqhBCiHajAJoQQohoKk+GXFkMWR6HXwMvFje7+DuFLxTol+08t2a9w\nvDdK9vsw+7lSSftCQu+cYl4kfKn+cqHBzPbL5rSZTeiV8C5h2NCKtOS1Mm3lOAf4KvC6mT1qZsPM\nbN2ix9chfBl7scxzJ9HyXMxx9/dK2mbSMnMpBZ1mx/GwOMCrZY7TJsxsJTP7jZm9TejZMT3Tc8qf\nt3Ia25vZfWb2CeH6Tyf0IKGMxuRqfGasA0xz9zkl7S+X7lhFrqkl9z8kXKv3y7QXX6sNgE0z/eLt\nf9mxVs32+xOhB9BvgXfMbJSFefuqnZh+ImGocnFhqFBgG2hh/rcdMw8TqjxGgdJzMzP7ucTXbDZX\n1Y1m9h6ht+p0wnDJctegLe+LwvVvKtmvLYVyCOfGgB0tzEm3FS2LkjsSelc93UbNcrQ1y1tZgaWY\nF4oerwncvdHdDyH43wa4mLBS6G1mtnE75e8m/IPgMMLw0Me9upWPXy+5X/hd0trvmPUJr4kHaP7+\nfZfQU2/VkueXu7YfljlOob30mpd7rS7umr9WumNW0LuLcK4KHAm86e4PlNEWQgjRDrSKqBBCiIpx\n948tTOr91Uqf2sb9FlTYXnHRwcx2JPSyeJBQVJtG6O1yLGGlulJmt0XX3W8zs4eAAwm9HYYCPzCz\nA919XKU+WXzmWNwGbEcY3vs0oRBSmA+r1X/cmdl6hCGaLxDmkXqd0MvsG4S5+Uo12nTec6DSXOWu\nS1ten0sBzxCyl3vdvg6QFQV3MrNdCedmb8KQ1fvNbM+sJ1abcff5ZvZoptkP6EPojTWd0NNxW8Jc\nYpPKFAQqpeL3qZktRXhdfBH4GVkvH0LPod/T8hp0+PvC3adZWKxgJ8IcWhB61M0Afm1maxHO2b/b\neag8s5R9XWTnt1Nx9/mEoa6NZvYSYcjkN4Eft0NzrpndQZjXbz3aNrdfOar9HbMU4RwfBbxTZr/5\nOR2nGhb3WXkTcIiZbUeYV7CO0KNbCCFEzqjAJoQQolr+ChxnZtt66wsdTCF8MdmAov/IZ8N8vsii\nL695sRThy1dxT6XCSouF3g4HE76Q7JV9ESx4
+k57D571zLsGuMbMViZMwH0+oVgzhfAlaiNCca+Y\njcjvXBR0NqKoZ4OZLUOYzP8flQpmw2h3Ay5w958Wta9fZvfFFYDqCEOu6oqH+5rZ7pX6aQNTCMPJ\nli/pxbZB8U4V5movrwCbt7X3SLbfA8BQMzsP+AlhGNo/qzj2eEIPyz2A6dmwNMzsOUIRaUdCb5dW\nbVVx7NbYjHBdvuXuNxcazWyPxT+lVaYAu5lZz5JebJX0oir0VnsNeMrdZ5nZ04TeRvsQhg8vdtXd\njDzO1xRgdzPrVdKLbZOix2FRb8HSlS/L9XDriOu4OJ7Ifi5u6Hwl3EL4R8gCihYG6SReIXx+T3f3\nat6DlTCFRb+3iim95q1xD6EofCTwGGEhEQ0PFUKIDkBDRIUQQlTLzwlDzq7LCmXNMLN+ZnZadvdu\nwpeSM0p2+z7hS17pSo15cGqZ+3NZVJiYnx37s382mdmXCZOJV4WZLVWyCijuPoMwEfdyWdMThOFE\nJ2bFrsJz9yF8cfprtccv4T5Cj7zTStq/S5hDr5rjFHpdlP79cCYtv6wXigClX/RbaJjZisAxVfhp\njXGEYt5xRccy4BSa+60kV3sZDaxpZseVPmBmy2dDETGzcsMpnya8j5Yres5aZlbuS3g5xgPLE96H\nxcNAJxBWkOxL2+YSm0XL69peFncNzqD6a3A3oXfeZ6vTZj25vleB5nhCQfrQ7HZhHrf/EFb6XJrW\nz1mhuNeec3Z3dqzSz7UzCUPi/555+5hQTNmpZL/S1zws/j1aNWa2y2IeKsz5OCmHwzwA/IiwsMS7\nOehVwjjCFAk/NLMWHRWyf6jkxd3ANma2bZF+L8KCKJPd/fm2iLj7AsKKsYMJn7PPuPuzOfoUQgiR\noR5sQgghqsLdXzWzIwg9CF4ws5sIw0+WBbYnTOb+u2zf/5rZ74Hjs8LBvwhD0o4mrET6r5ztfQrs\nbWY3Eiae3pfQ2+SnRcPf/kb4gjzOzG4hTJZ9MmElts2rPO7ngTfM7HYWDTP8OmFhg7Pgs6F6PyBM\nhv+QmY0iDNc7jTDnV+miBFXh7jPM7GfAhWZ2DzCW0HPnJEIvhpuX9PzFaH6cDX89J5uz603CMNgv\n03JoU2PWdrGZ3Uoo9o0F7s1u/9XMriWcs+8Shlv1qdRTK4whZL0sm6h+ElDPooKCV5GrvfyBUKy5\nOhv++TBh9dpNCMPn9iTMl3ahme1EeJ1OIbw+TyLMbzahRG8n2vZP0/8QCssb0nxS/YcybadtBbZG\nYA8zO5NQPJ5csihJNUwi9A66zMzWJBQxDqZ9xZ+7yFayzOZBfJ6wsmxb546ERedjI+CHRe0PET5T\n5gCPL0nA3eeY2fPA4Gyo5PvAs+7+XAU+7iIUln6aZXmasIhHHWHF5OJ5yK4DzjWz3xIK+jsRege2\n6T3q7ksalr2smZ1fpv19d78auCIrEt9BuKaF3weHEj7fbmx75PJkBc6L26tTAZ+dt+yz4iTCsMuJ\n2XmbDqxNKCJOoOU/NarlEsJ0BfeY2QjC6+YYQm/EgyrUuinztQuhF6sQQogOoNsX2LLeCDcQ/jCd\nD2zXyh8OQggh2oi732VmhZXi6oETCb3EniXMPTayaPfvEL5AHwMcALxNmNj+olJZyvcuqaR9PmHO\nqmsIPe0+Bhrc/bO5f9z9ATM7FjiXsLrpZMIXj3VpWWBb3LFLH2sirFC5J2EOtqUIw1RPcvfPzoW7\n/97MZmXHvoTQk+TPwLlFCzoU6y/uuEvE3Yeb2buEXi+/InxBuwY4P+vVUA2HE1YyPJnwxXMcodDw\nVrEnd3/CzH5EeE3sRTgX67r7i2Z2MGGo4y8Ir4OrgPcoWm2vKGNbznv5HdwXmtm+wG8IxdyFhHn3\nfkwonBQPG21TrlZo9Vq5u5vZ/oSeR0cT3gtNhOLD5SxalOJOwhfpIcDKhF5JDxJex8Wr93qWq3Vz\n7k1m9iQw
gOZFuvGZzlR3L538vVyuswgFuh8Thpv9nlDIrJqs8LwfMILwvpgD/IXwfiq3gEBbz3Ud\noWh9ZPbYnZn/J9vo68XsPbQy5c/Zo9nCIa35+w7h9fUrQtFpOPDcYvZdUpaLWNQT6TVgqLuXrs58\nUeb3EELR9m7Ca/ld2vAepeUiFcUsQ8vPbAifc1cTeiV/MzvecVnWqYQ5v35a5vOtOOuS3mdteQ+2\n+pnQyn5t+qx191Fm9ibhdTqU0KP0TcJr4nfVaJbz5e7vmtkg4FLCZ/jywH+B/dz9njYep6A1MRsK\nvjHVrboqhBCiDViFc+R2OczsQeCH7v7vbI6Vj0qW3xZCCNGNMLPfAQe7+xda3VkkiZkdQCho7uDu\n/4ntRwghOhozmwi85+5fj+1FCCG6K916DjYz+wow193/DeDuH6i4JoQQQqSDmS1fcr8wB9dHhKGY\nQgjRrTGzgcCWhN6mQgghOojuPkR0A2CWmY0FVgf+7O4/i+xJCCGEEJ3HFWb2OcL8Y8sR5vXaDjjP\n3T+N6kwIIToQM9uURXOAvklYZEUIIUQHUbM92MxsRzMba2ZvmtlCM6svs88pZjbZzGab2SNmtnXJ\nLksDOxDmlvga8HUz270T7AshhIhL957/QFTCPwkT1P+EMOffFwirD/48qishhOh4DiHMbdkDONzd\n50b2I4QQ3ZqanYPNzPYmFMUaCZPcHujuY4seH0zo5nw8YWLdMwmTqm7o7jOyfbYDhrn7Ptn9oYR5\nYi/rzCxCCCGEEEIIIYQQovtSsz3Y3P0ed7/Q3e+k5bLiEApq17r7Te4+idBLrQk4tmifx4FVzWzF\nbM6VnYAXOtq7EEIIIYQQQgghhEiHLjkHm5ktQ1hi/uJCW7Z8+X3AoKK2BWb2Q8Ky2QD3uvvdS9Dt\nTVim/DXC8vBCCCGEEEIIIYQQIk2WB74MjHP395a0Y5cssAErE+YSeKek/R3CPCuf4e7jgHFt1N0L\nuLnd7oQQQgghhBBCCCFEd+FI4JYl7dBVC2wdxWsAf/zjH9lkk03aLXbmmWdy+eWXt1snb6289VLx\nppzx9VLxppzx9VLxppzx9VLxppzx9VLxppzx9VLxppzx9VLxVss5U+CFF17gqKOOgqxetCS6aoFt\nBrAAWK2kfTXg7XbozgHYZJNN6N+/fztkAiuuuGIuOnlr5a2XijfljK+XijfljK+XijfljK+Xijfl\njK+XijfljK+XijfljK+XirdazpkYrU4jVrOLHCwJd59HWF1090KbmVl2/9+xfJXy2GOP1aRW3nqp\neFPO+HqpeFPO+HqpeFPO+HqpeFPO+HqpeFPO+HqpeFPO+HqpeKvlnKI5NduDzcx6AeuzaAXR9cxs\nC+B9d38d+BVwo5k1Ao8RVhXtCdwYwW5Z1l133ZrUylsvFW/KGV8vFW/KGV8vFW/KGV8vFW/KGV8v\nFW/KGV8vFW/KGV8vFW+1nFM0p2YLbMBA4AHAs+2yrP33wLHuPtrMVgYuIgwNfQrYy92nxzBbjlVW\nWaUmtfLWS8WbcsbXS8WbcsbXS8WbcsbXS8WbcsbXS8WbcsbXS8WbcsbXS8VbLecUzanZApu7/4tW\nhrC6+1XAVZ3jqHIOP/zwmtTKWy8Vb8oZXy8Vb8oZXy8Vb8oZXy8Vb8oZXy8Vb8oZXy8Vb8oZXy8V\nb7WcUzTH3D22h5rBzPoDjY2NjZr0TwghhBBCCCGEECJhJk6cyIABAwAGuPvEJe3bJRc56CqMGTOm\nJrXy1kvFm3LG10vFm3LG10vFm3LG10vFm3LG10vFm3LG10vFm3LG10vFWy3nFM1Rga0DGTVqVE1q\n5a2XijfljK+XijfljK+XijfljK+XijfljK+XijfljK+XijfljK+XirdazimaoyGiRWiIqBBCCCGE\nEEIIIYQADREVQgghhBBCCCGEEKLTUIFNCCGEEEIIIYQQQoh2oAKbEEIIIY
QQQgghhBDtQAW2DmTI\nkCE1qZW3XirelDO+XirelDO+XirelDO+XirelDO+XirelDO+XirelDO+XireajmnaE6PhoaG2B5q\nhuHDh/cFTjjhhBPo27dvu/Vmz57NZptt1n5jOWvlrZeKN+WMr5eKN+WMr5eKN+WMr5eKN+WMr5eK\nN+WMr5eKN+WMr5eKt1rOmQLTpk1j5MiRACMbGhqmLWlfrSJahFYRFUIIIYQQQgghhBCgVUSFEEII\nIYQQQgghhOg0VGATQgghhBBCCCGEEKIdqMDWgUyYMKEmtfLWS8WbcsbXS8WbcsbXS8WbcsbXS8Wb\ncsbXS8WbcsbXS8WbcsbXS8VbLecUJbi7tmwD+gPe2NjoeVBXV5eLTt5aeeul4k054+ul4k054+ul\n4k054+ul4k054+ul4k054+ul4k054+ul4q2Wc6ZAY2OjAw7091ZqSlrkoIi8FzloamqiZ8+e7TeW\ns1beeql4U874eql4U874eql4U874eql4U874eql4U874eql4U874eql4q+WcKVDJIgcVF9jM7HPZ\n85qy++sABwLPu/u91VmuDbSKqBBCCCGEEEIIIYSAjl9F9E7gaAAz+yLwKPB94E4zO6kKPSGEEEII\nIYQQQgghuizVFNj6A+Oz24cA7wDrEIpup+XkSwghhBBCCCGEEEKILkE1BbaewMfZ7T2Bv7j7QuAR\nQqFNZJx99tk1qZW3XirelDO+XirelDO+XirelDO+XirelDO+XirelDO+XirelDO+XireajmnaE41\nBbaXgQPMbC1gL6Aw79qqwEd5GesOrL322jWplbdeKt6UM75eKt6UM75eKt6UM75eKt6UM75eKt6U\nM75eKt6UM75eKt7y1Jo3D9ZYI9/zJhZRzSIHhwC3AD2Af7r717P284Cd3H2f3F12ElrkQAghhBBC\nCCGEEF2duXPhueegsXHR9t//wm23QV1dbHddh0oWOVi6UnF3v93MJgB9gaeLHrofuKNSPSGEEEII\nIYQQQghRHXPnwrPPtiymzZ0LSy0FG28MAwbAkUfCZpvFdtt9qbjABuDub5vZCsDXzewhd58NPO6V\ndofrBMzsNeADwIH33X33uI6EEEIIIYQQQgghKufTT1sW0555ZlExbZNNQjHtW98KP7fcEnr1iu06\nDSqeg83MepvZ/cCLwN2EnmwA15vZZXmay4mFwCB336qzi2uTJk2qSa289VLxppzx9VLxppzx9VLx\nppzx9VLxppzx9VLxppzx9VLxppzx9VLw5g5//vMkrr0Wjj8+FMw+/3kYOBBOOgn+/W/46lfhl7+E\nhx+Gjz4Kxbff/x5OOw22375lcS3v8yaKcPeKNuAm4B5gTcJqoutl7XsBz1Wq19EbMBno1cZ9+wPe\n2NjoeVBXV5eLTt5aeeul4k054+ul4k054+ul4k054+ul4k054+ul4k054+ul4k054+t1d2/z57sf\ncYQ71HmPHu6bb+4+ZIj7lVe6//vf7rNmxfOWEo2NjU4YEdnfW6sptbZDiyfA28AW2e3iAtt6wCeV\n6nX0BrwKNAKPAke0sm+uBbYpU6bkopO3Vt56qXhTzvh6qXhTzvh6qXhTzvh6qXhTzvh6qXhTzvh6\nqXhTzvh63dnbggXuxxzj3qOH+69+NcWbmnIy5vmft+5OJQW2alYR/TgTfim7vYW7v2pmA4Fx7t67\n0l50iznOjsDZwADCMNQD3H1syT6nAEOBPoQFF77n7o+X7NPX3aeZWR/gPuAwd392McfUKqJCCCGE\nEEIIIYSIgnsY/jlyJPzxj3DEEbEdpU0lq4hWPAcbMB44uui+m9lSwDnAA1XoLY5ewFPAyYRqYTPM\nbDBwGTAM2IpQYBtnZisX7+fu07KfbxPmjFPlTAghhBBCCCGEEDWFO5x5Jlx7LVx/vYprXY1qVhE9\nB7g/67FqueE5AAAgAElEQVS2LPBzYF
PgS8D2eRlz93sIc71hZlZmlzOBa939pmyfE4FvAMdmnjCz\nnsBS7v5JturpbsCf8vIohBBCCCGEEEII0V7c4dxz4Te/gauvhiFDYjsSlVJxD7ZseOWGwATgTkJP\ns78AW7n7K/naK4+ZLUMYOnp/kS8nDAEdVLTrasAEM3sS+Ddwo7s3doZHgEsvvbQmtfLWS8WbcsbX\nS8WbcsbXS8WbcsbXS8WbcsbXS8WbcsbXS8WbcsbX627ehg+Hn/8cLr8cTjyxY3x1hJ5YREUFNjNb\n2swuBD7v7j9190PdfV93/1FhKGYnsTLQA3inpP0dwnxsALj7ZHff0t23cvfN3f3KtogPGjSIPn36\nMGDAAOrr66mvr2fQoEGMGTOm2X733nsv9fX1LZ5/yimncP3119PU1PRZ28SJE6mvr2fGjBnN9h02\nbFiLF/jUqVOpr69vtnxuU1MTV1xxBWeffXazfZuamqivr2fChAnN2keNGsWQMiXvwYMHM2bMmGbe\nWstRTLkcTU1Nbc4BtJqj2FtrOYopl6OpqanNOWDJ12Pq1KkV5SimXI6mpqY254C2X4/WcrR2PZqa\nmqp+XZXLUXw925ujqamp6tdVuRyl3tpzPZqamnK7HnfeeWdFOUopzdHU1JTb6+of//hHm3O0dj0K\n57+97/NCjocffrjNOVq7HgVveX3uTpzYfLqG9lyPgrc8PndvuOGGXN7nxZptzVFgcTluuOGGXN7n\nxb7amqPA4nLccMMNuX3uFrzl9bl7ww03tDlHa9ej4C2Pz93HHnssl/d5IUfBWx6fuw8++GBN/n0F\ncPfdd9fk31cAt99+e25/lxR7q7W/r4q9tfdz98MPP6zJv68Apk2bVpN/XwG88soruf29W+ytlv6+\nuvTSS5t5a+/nblNTU03+fVXwVot/XxU85PW9tnTf1q7Hz34WCmyXXAJf+UrH/X3VEd9ri3MUE/t7\nbTU5Bg0axDbbbEN9fT0DBgygT58+7LLLLi32WxzVLHLwCfBVd3+toie2AzNbSNEiB2bWF3gTGOTu\njxbtdymwk7sPKq/U6nG0yIEQQgghhBBCCCE6hcsvh7POgoYGGDYsthtRSkcvcnA/sHM1xnJkBrCA\nMAS0mNWAtzvfjhBCCCGEEEIIIUTbueqqUFw791y48MLYbkR7qWaRg78Dl5jZZkAjMKv4wUIvs47E\n3eeZWSOwO1Do1WbZ/REdfXwhhBBCCCGEEEKIarnhBjjlFDjjDLj4Yii7tKPoUlTTg+0qQk+xs4Cb\ngTFF2x15GTOzXma2hZltmTWtl91fK7v/K+A4MzvazDYGrgF6Ajfm5aG9lI4frhWtvPVS8aac8fVS\n8aac8fVS8aac8fVS8aac8fVS8aac8fVS8aac8fW6srebb4bvfhdOOgl+9aslF9dqOadoTjWriC61\nhK1Hjt4GAk8Sesk5cBkwERie+RgNDAUuyvbbHNjL3afn6KFdHHvssTWplbdeKt6UM75eKt6UM75e\nKt6UM75eKt6UM75eKt6UM75eKt6UM75eV/V2221w9NFwzDFw5ZWt91yr5ZyiBHfXlm1Af8AbGxs9\nD/LSyVsrb71UvClnfL1UvClnfL1UvClnfL1UvClnfL1UvClnfL1UvClnfL2u6O3OO92XXtr9iCPc\n58/vfF8dodfdaWxsdEKnr/7eSk2p4lVEAcxsZ0LvsU2ypueBX7j7+HzKfnHQKqJCCCGEEEIIIYTI\nm3vugf33h7o6uPVWWLqaGfFFp9Ohq4ia2VHAfUATYUGBEcBs4H4zO6Jyu0IIIYQQQgghhBDdk3/+\nEw48EPbaC265RcW17ko1l/V84Bx3v7yobYSZnQVcANySizMhhBBCCCGEEEKILsyECaHX2s47h/nX\nll02tiPRUVSziuh6wF1l2scC67bPTvfi+uuvr0mtvPVS8aac8fVS8aac8fVS8aac8fVS8aac8fVS\n8a
ac8fVS8aac8fW6grdHH4V994Vtt4W//AWWWy6ur47QE4uopsD2OrB7mfY9ssdExsSJSxyeG00r\nb71UvClnfL1UvClnfL1UvClnfL1UvClnfL1UvClnfL1UvClnfL1a9zZxYhgSuvnmMHYs9OwZ31dH\n6IlFVLzIgZmdBPwauAH4d9a8PXAMcLq7X5unwc5EixwIIYQQQgghhBCiPTzzDOy6K/TrB//4B3zh\nC7EdiWqpZJGDiudgc/erzext4PvAoVnzC8Bgd7+zUj0hhBBCCCGEEEKI7sBzz8Eee8Baa4WVQ1Vc\nS4eq1q5w9zuAO3L2IoQQQgghhBBCCNEl+PBDaGyEJ54I2+OPw2uvwaabhp5rK60U26HoTCousJnZ\n1sBS7v5oSfu2wAJ3fyIvc0IIIYQQQgghhBCxmTULnnxyUSHtiSfgxRfDYyusAAMGwMEHw8CBsM8+\nsOKKcf2KzqeaRQ7+D1i9TPsa2WMio76+via18tZLxZtyxtdLxZtyxtdLxZtyxtdLxZtyxtdLxZty\nxtdLxZtyxtfrSG9z5sBjj8H//R8MGQKbbRaGeu64I5x3HkyeHBYwuOkmeP55+OADePBB+OUv4bDD\n4Fvf6ho5Rb70aGhoqOgJw4cPvxwY1tDQ8EFJ+xzgwoaGhp/lZ69zGT58eF/ghBNOOIG+ffu2W693\n797069ev/cZy1spbLxVvyhlfLxVvyhlfLxVvyhlfLxVvyhlfLxVvyhlfLxVvyhlfLy+tjz4K86ZN\nndqbO+7ox7BhcMYZMHJkGOZpBoMGwQknwEUXweWXw3HHhV5qW2wBq6wCS5V0XarFnB2l192ZNm0a\nI0eOBBjZ0NAwbUn7VrOK6HvAfu7+n5L2rwF/c/cuO8pYq4gKIYQQQgghhBDdhzlzYMqU0Ous3Pb+\n+2G/Hj3C3GkDB4Zt661Dz7XllovrX8SlQ1cRBe4FfmZm+7v7hwBm9kXgYuAfVegJIYQQQgghhBBC\nVMz8+fDGG82LZq+9tuj2W28t2nfppWHttWHddWGrreCgg8Lt9dYLxbSePaPFEN2AagpsQ4GHgClm\n9mTWtiXwDvCtvIwJIYQQQgghhBCia7NwIUybBq++Ch9/DPPmwdy5zX+Wa2vtsZkzQwHt9ddDkQ3C\ncM7VV19UNNt993C7sK2xRuipJkRHUPEiB+7+JrA5cA7wPNAInA5s5u6v52uvazNmzJia1MpbLxVv\nyhlfLxVvyhlfLxVvyhlfLxVvyhlfLxVvyhlfLxVvytmcefPg5Zdh3Di46io46yzYf3/46lehVy9Y\nc03YaSf4xjfGcMABcOihcOSRYQGB44+HM8+EH/0ILr4YRoyA666DW2+FsWPh/vvhkUfgv/8NRbq3\n34ZPPoE5c8bwzW/CFVfAPffA//4Hs2eH3mzjx4cFCIYPh2OOgZ13Dj3XllRc6+rXIJaeWEQ1q4ji\n7rPcfaS7n+LuQ939Jnefl7e5rs6oUaNqUitvvVS8KWd8vVS8KWd8vVS8KWd8vVS8KWd8vVS8KWd8\nvVS8pZhz1ix45hkYMwYuuwxOOgn23BP69YPPfQ422AD23jssEPC3v4Wi2667ws9+BnfdFRYSqKsb\nxbvvhlU3Z80K+yxcCJ9+GopmM2fCO++EItmrr4ai2TPPwMSJ8OijoXD2z3+Ggtqaa47i0kvhxBPD\nip4bbti++dK6wjWoRT2xiGoWOfg2MMPd/5bd/zlwPKE32+HuPiV3l52EFjkQQgghhBBCCJE6s2bB\nX/8aeqS99BK88koY5lmgV69QWOvXD9Zfv/nPtdbSMEzRfejoRQ5+CJwEYGaDgFOBM4D9gMuBg6rQ\nFEIIIYQQQgghRCRmz4a774bRo0NxrakJttgirKy5666LCmj9+sFqq4X5zoQQi6imwLYW8HJ2+wDg\ndncfaWYPAw/mZUwIIYQQQgghhBAdx6efhuGWo0eH+c4++QS23BIu
uCDMk7beerEdCtF1qKbA9gnQ\nG5gK7An8KmufA3wuJ19CCCGEEEIIIYTImblz4b774E9/CvOpffRRWIzgBz8IRbUNN4ztUIiuSTWL\nHPwDuM7MrgM2BO7O2jcFXsvJV7dgyJAhNamVt14q3pQzvl4q3pQzvl4q3pQzvl4q3pQzvl4q3pQz\nvl4q3rpazvnz4d574TvfgT594BvfCIsGnHFGWHzgmWfCKp6lxbWulrNW9GpVqyP0xCKq6cF2CvAT\nwlDRg939vax9AFCTy1GY2eeAF4DR7n5OZx13zz33rEmtvPVS8aac8fVS8aac8fVS8aac8fVS8aac\n8fVS8aac8fVS8dYVci5YAP/6Vxj++ec/w4wZYf60k06CwYNhs81an0etK+SsRb1a1eoIPbGIilcR\n7YqY2U+AfsDrSyqwaRVRIYQQQgghhBBdlYUL4eGHw/DP22+Hd96BddYJQz8HD4b+/bU4gRCV0NGr\niHYpzGx9YCPgLuCrke0IIYQQQgghhBDtYvZseOklmDSp+fa//4XVP9dYA444IhTVttlGRTUhOoNu\nX2ADfgkMBbaPbUQIIYQQQgghhGgL7mFYZ2kRbdIkmDw5PA6wyiqw8cYwcCAcdRRsuy0MGgRLVTPj\nuhCiamr2LWdmO5rZWDN708wWmll9mX1OMbPJZjbbzB4xs61LHq8H/ufuLxeaOsN7gQkTJtSkVt56\nqXhTzvh6qXhTzvh6qXhTzvh6qXhTzvh6qXhTzvh6qXjLU2v+fBg1agJ33QW/+EVYiGD77WHllWHV\nVWGnneDEE2Hs2FA0O/hguO66MBR0xgx491146CEYORLOOgvcJ+RWXEvleuatV6taHaEninD3mtyA\nvYGLgP2BBUB9yeODgTnA0cDGwLXA+8DKRftcDEwBXgWmAzOBHy3hmP0Bb2xs9Dyoq6vLRSdvrbz1\nUvGmnPH1UvGmnPH1UvGmnPH1UvGmnPH1UvGmnPH1UvHWHq0PP3S/5x73889333ln9+WXd4c6B/cV\nVnAfOND9qKPcf/IT99tvd3/2Wfc5czrHW0dq5a2XirdazpkCjY2NDjjQ31upY3WJRQ7MbCFwgLuP\nLWp7BHjU3U/P7hvwOjDC3X9eRuPbwKbeiYscNDU10bNnz3br5K2Vt14q3pQzvl4q3pQzvl4q3pQz\nvl4q3pQzvl4q3pQzvl4q3irRmjYNJkyA8ePDz6efDosRrLIK7Lhj6K220UZNbLllT1Zfvf3zpdXq\nOctbLxVvtZwzBSpZ5KBNBTYze5JQsWsVd899+c3SApuZLQM0AQeXFN1uBFZ09wPLaHR6gU0IIYQQ\nQgghRDq4h8UHxo9fVFB75ZXwWL9+oaC2ww7h5wYbaPEBIWqdjlhFdEzR7eWBk4Hngf9kbdsBmwJX\nVWa1alYGegDvlLS/Q1gxtAXu/vuONiWEEEIIIYQQIh3mz4ennlpUTJswIcyJttRSsMUWsO++i3qp\nrb56bLdCiI6kTVMfuvvwwgasQhiGOcjdz8q2rwG/BlbrSLOdxaBBg+jTpw8DBgygvr6e+vp6Bg0a\nxJgxY5rtd++991Jf32LtBU455RSuv/76Zm0TJ06kvr6eGTNmNGsfNmwYl156abO2qVOnUl9fz6RJ\nk5q1X3HFFZx99tnN2pqamqivr28xUeGoUaMYMmRIC2+DBw9WDuVQDuVQDuVQDuVQDuVQDuVQjjbm\nWLAA9t9/ML/5zRjuuw9uuQWGD4f+/e9l+eXr2XprOO+8sODAccfBfvudwq9/fT0TJ8KIEfDNb8Lb\nb8fPAd3jeiiHcnRUjkGDBrHNNttQX1/PgAED6NOnD7vsskuL/RZLa5O0lW7Ah8AGZdo3AD6sVK+N\nx1xI0SIHwDLAPFoufHAjcEc7jpPrIgdDhw7NRSdvrbz1UvGmnPH1UvGmnPH1UvGmnPH1UvGmnPH1\nUvGmnPH1uoq3WbPcJ092f+QR
97Fj3X/7W/ef/tT99NPdDzvMfbfd3Dfd1H2VVdzN3MPgz0XbcssN\n9f32c7/0UveHH65sEYLWvLWXWtXKWy8Vb7WcMwUqWeSgrUNEi5kNbA+8VNK+PWFVzw7H3eeZWSOw\nO1CYl82y+yM6w0NbWHvttWtSK2+9VLwpZ3y9VLwpZ3y9VLwpZ3y9VLwpZ3y9VLwpZ3y9vLQ+/RTe\new969lyb556DOXOab7Nnt2wrtxXv9/zza/OXv4QhnJ980vx4ZtC7N6y6Kqy2Wvi56aaLbpf+vOGG\ntfne93KJCtTmNchbK2+9VLzVck7RnIpXETWzc4FhwG+Bx7LmbYFjgR+7+yW5GDPrBawPGDAROAt4\nAHjf3V83s0MJPdZOzHycCRwCbOzu06s8phY5EEIIIYQQQogccYcPPwxDKKdPX/Sz+HZp28cft11/\nueVg+eUXbZ/7XPP7yy8f9llppZaFssLtlVeGpavpfiKE6NZ0xCIHn+Hul5jZq8DpwFFZ8wvAEHcf\nXaneEhhIKKgVuuNdlrX/HjjW3Ueb2crARYS5354C9qq2uCaEEEIIIYQQou3MnQtvvglTp8Lrr4ef\nb73Vslg2YwbMm9fy+V/8IqyySthWXhk22yz8LLR96UvQq9eSC2fLLhsWFBBCiNhUVGAzsx6EoaD3\n5lxMa4G7/4tWFmFw96vovJVLhRBCCCGEECIJ3MNQykLhrLiIVvj59tthvwK9e4eVMgsFsk02aV5A\nK/7Zuzcss0y8fEIIkTcV1frdfQFwL7BSx9jpXpSualErWnnrpeJNOePrpeJNOePrpeJNOePrpeJN\nOePrpeJNOSvnscfgoosmccEFcMwxsNtusP76obdYnz6w9dZw8MFhlcy//hU++AC+8hU4/nj47W/h\n3nth0qQwp9mMGTB69CTuvx9uvRWuvBKGDYOTT4ZDD4Vddw291Pr0aXtxrVbPm15r8fVS8VbLOUUJ\nra2CULoBTwC7V/q8rrCR8yqidXV1uejkrZW3XirelDO+XirelDO+XirelDO+XirelDO+XirelLPt\nPPec+777FlbFrPM113QfNMh98GD3oUPdR4xwHzPGvbHRffp094ULO89bR+nVqlbeerWqlbdeKt5q\nOWcKVLKKaDVFqL2BJ4H9gL7AF4q3SvVqacu7wDZlypRcdPLWylsvFW/KGV8vFW/KGV8vFW/KGV8v\nFW/KGV8vFW/K2TrvvON+0knuPXq4r7uu++jR7i+/XBveOlqvVrXy1qtVrbz1UvFWyzlToJICWzWr\niC4s7gBX/FDoEOc9KutDVztoFVEhhBBCCCFEd2TOHPj1r+Hii8OiABdcAKeeGlbXFEIIUZ4OXUUU\n2LUqV0IIIYQQQgghOpWFC8N8aOedF1b4PPlkuPDCsMiAEEKI/Ki4wOZhdU8hhBBCCCGEEDXMww/D\nWWeFhQwOOAD+8Q/YcMPYroQQontS0SqixZhZTzPb2Mw2L97yNNfVufTSS2tSK2+9VLwpZ3y9VLwp\nZ3y9VLwpZ3y9VLwpZ3y9VLwpZ+CVV+CQQ2CHHWD+fHjwQbjjjsUX12o1Z956taqVt16tauWtl4q3\nWs4pmlNxDzYzWwX4HbDPYnbpsnOw5U1TU1NNauWtl4o35Yyvl4o35Yyvl4o35Yyvl4o35Yyvl4q3\n1HPOnAk//jFceSWsthrcdBMceWSYcy22t1rQq1WtvPVqVStvvVS81XJO0ZxqFjm4GVgHOAN4EDgQ\nWA34EfB9d/9bzh47DS1yIIQQQgghhOhqzJ0LV10FF10Ubp93Hpx5JvTsGduZEEJ0bTp6kYPdgP3d\n/YlsRdEp7v4PM/sIOA/osgU2IYQQQgghhOgquMOYMXDOOfDqq/Cd74QiW58+sZ0JIUR6VDMHWy/g\n3ez2TGCV7PYzgLp9CSGEEEIIIUQH88QTsMsucNBB0K8fPP00jByp4poQQsSimgLb/4CNsttPAy
eY\n2RrAicC0vIx1B2bMmFGTWnnrpeJNOePrpeJNOePrpeJNOePrpeJNOePrpeIthZzPPQff/OYMtt4a\n3nsP/v53uOce+OpX43vLWytvvVrVyluvVrXy1kvFWy3nFM2ppsD2G6Bvdns4YbGDqcBpwA9z8tUt\nOPbYY2tSK2+9VLwpZ3y9VLwpZ3y9VLwpZ3y9VLwpZ3y9VLx115xz5sDNN8OOO4ZC2l13Hcu118JT\nT8Hee8f11pFaeevVqlbeerWqlbdeKt5qOacowd3btQE9CUNDV26vVuwty+GNjY2eB3np5K2Vt14q\n3pQzvl4q3pQzvl4q3pQzvl4q3pQzvl4q3rpbzhdfdB861L13b3dw320399Gj3R95pHvl7Cy9WtXK\nW69WtfLWS8VbLedMgcbGRgcc6O+t1JSqWUV0PXd/Nd8yX22gVUSFEEIIIYQQMZk3D8aOhWuugfvu\ngy99CY45Bk44ATbcMLY7IYRIi45eRfRlM3sD+BfwIPAvd3+5Ch0hhBBCCCGEEMCUKfDb38L118Pb\nb8P228Mf/gCHHALLLx/bnRBCiNaopsC2FrALsDNwDvBbM3uLUHB7wN2vy8+eEEIIIYQQQnRPFiwI\nixRccw3cfTessAIcfXTorbbZZrHdCSGEqISKFzlw9zfd/WZ3P97dNyKsKHofcChwbd4GuzLXX399\nTWrlrZeKN+WMr5eKN+WMr5eKN+WMr5eKN+WMr5eKt66Sc9o0+MlPYN11oa4u9FgbORLeeguuvLL1\n4lpXyVlrerWqlbderWrlrZeKt1rOKZpTcYHNzHqa2Z5mdrGZ/Rv4L7AFcCVwUN4GuzITJy5xeG40\nrbz1UvGmnPH1UvGmnPH1UvGmnPH1UvGmnPH1UvFWyzkbGydy331hyOfaa8PPfgZ77gmPPw5PPAHf\n/W7owdbZvvLWS8WbcsbXS8VbLecUzalmkYO5wEzgZsIcbOPdfWb+1jofLXIghBBCCCFEusyeDa+9\nBpMnhznR5syBhQvDtmDBotul91t7bMECePBBePll2HRTOOkkOOooWHHF2ImFEEIsiY5e5OBuYAfg\nMKAP0MfMHnT3F6vQEkIIIYQQQohOYcECeOONUEArbK++uuj2tGmL9u3RIywu0KMHLLVU2Cq9XXx/\n0CC48Ub42tfALNopEEII0UFUXGBz9wMAzGxzwkIHewI/NrP5wIPufmS+FoUQQgghhBCiddzhvfea\nF82Ki2hTp8K8eYv2X311WG896NcP9tgjzIm23nrh5+qrh+KYEEII0Raq6cFW4Jns+csCywN7AYOB\nmimwmdmKhAUYehC8jtAqp0IIIYQQQnQvpk2DhgYYNQo+/nhR+xe/uKhoduCB4Xbh/jrrhB5qQggh\nRB5Us8jBWWY2FngPeBQ4HHgROBhYJV977eYjYEd37w9sC/zQzFbqrIPX19fXpFbeeql4U874eql4\nU874eql4U874eql4U874et3V20cfwQUXwPrrw+23Q9++9dx+O0ycCDNnhm3ixPDYL34BJ58M++wD\nG23UtuJareTsSK289VLxppzx9VLxVss5RXN6NDQ0VPSE4cOHjwAmApcDp7r7FQ0NDeMaGhr+19DQ\nMKcDPFZNQ0MDDQ0N8wGGDx/+BeB44LcNDQ2zy+0/fPjwvsAJJ5xwAn379m338Xv37k2/fv3arZO3\nVt56qXhTzvh6qXhTzvh6qXhTzvh6qXhTzvh63c3b3Llw1VVw8MFh8YDTT4fRo2GLLXpTV9ePvn3z\n6Z0WO2dnaOWtl4o35Yyvl4q3Ws6ZAtOmTWPkyJEAIxsaGqYtad+KVhE1s6WBHwI3uPsb7XLZSWTD\nRP8FrA+c7e5XL2FfrSIqhBBCCCFEjeIeCmnnnx/mVDvmGBg+HNZcM7YzIYQQ3ZFKVhGtaIiou88H\nzqZ9c7e1CTPb0czGmtmbZrbQzFr0YzSzU8xsspnNNrNHzG
zrMp4/dPctgXWBI82s1oaxCiGEEEII\nIVrhgQdgm23gsMNgk03g6afh+utVXBNCCFEbVDwHG/BPwuqhHU0v4CngZKBFNzszGwxcBgwDtgKe\nBsaZ2crlxNx9erbPjh1lWAghhBBCCJEvzzwD++4Lu+0GSy0VhoTedRd89auxnQkhhBCLqKbA9nfg\nEjP7pZkdbmb1xVtextz9Hne/0N3vBKzMLmcC17r7Te4+CTgRaAKOLexgZqua2QrZ7RWBnYD/5eWx\nNcaMGVOTWnnrpeJNOePrpeJNOePrpeJNOePrpeJNOePrdUVvU6eGIaBbbAEvvRSGhj7yCOy8hH/1\nd8WcsbXy1kvFm3LG10vFWy3nFM2ppsB2FbAacBZwMzCmaLsjP2uLx8yWAQYA9xfaPEwmdx8wqGjX\ndYDxZvYkYR6237j7c53hEWDUqFE1qZW3XirelDO+XirelDO+XirelDO+XirelDO+XlfyNnMmnHMO\nbLgh3H03XHEFPP88fPObYOX+7d5BvvLWq1WtvPVS8aac8fVS8VbLOUUJ7l7zG7AQqC+63zdr27Zk\nv0uB/7TjOP0BX3bZZX211Vbz/v37e11dndfV1fl2223nd9xxhxczbtw4r6ur81JOPvlkv+6665q1\nNTY2el1dnU+fPr1Z+4UXXuiXXHJJs7YpU6Z4XV2dv/DCC83aR4wY4UOHDm3WNmvWLK+rq/Px48c3\na7/lllv8mGOOaeHt0EMPVQ7lUA7lUA7lUA7lUA7lqLkcs2e777//CF9uuaHeq5f7hRe6f/RR18vh\n3j2uh3Ioh3IoR2o5tttuO9966629rq7O+/fv76uttpp//vOfd8K0Zf29lZpSRauIxsLMFgIHuPvY\n7H5f4E1gkLs/WrTfpcBO7j6ovFKrx9EqokIIIYQQQnQiCxbAzTfDBRfAm2/CccfBsGHQp09sZ0II\nIVKnw1YRLWBmO5vZXWb2craNNbPOXDxgBrCAMFS1mNWAtzvRhxBCCCGEEKIK3n8/zKvWvz98+9sw\ncCA89xxcfbWKa0IIIboeFRfYzOwowlxnTcCIbJsN3G9mR+RrrzzuPg9oBHYv8mXZ/X93hgchhBBC\nCCFE23n/fRgzBs44A7bcElZeGQYPhi98AR5+GP78Z9hoo9guhRBCiOqopgfb+cA57j7Y3Udk22Dg\nXELCpJkAACAASURBVOCCvIyZWS8z28LMtsya1svur5Xd/xVwnJkdbWYbA9cAPYEb8/LQXoYMGVKT\nWnnrpeJNOePrpeJNOePrpeJNOePrpeJNOePrxfA2cybceSeceSZstVUoqB14YGjbaiv43e9g8mRY\nf/0hfO1rnecrll6tauWtl4o35Yyvl4q3Ws4pmrN0Fc9ZD7irTPtY4OL22WnGQOABwmRyDlyWtf8e\nONbdR5vZysBFhKGhTwF7ufv0HD20iz333LMmtfLWS8WbcsbXS8WbcsbXS8WbcsbXS8WbcsbX6wxv\nM2fC+PHw4INhe+opcIe114Zdd4XTT4dddoEvf7njvKVyDVLJmbderWrlrVerWnnrpeKtlnOK5lS8\nyIGZvQz8wt2vLWk/Efi+u2+Qo79ORYscCCGEEEII0TY++KB5Qe3JJ5sX1HbZpXxBTQghhOgqVLLI\nQTU92C4DRmRDNwvznW0PHAOcXoWeEEIIIYQQosaZPx8mTIC//Q0eeCAU1BYuhLXWCgW1U09dVFAz\ni+1WCCGE6FwqLrC5+9Vm9jbwfeDQrPkFYLC735mnOSGEEEIIIUQ8PvoIxo0L86bdfXcYBrr66rD7\n7nDKKSqoCSGEEAWqWeQAd7/D3Xdw997ZtoOKay2ZMGFCTWrlrZeKN+WMr5eKN+WMr5eKN+WMr5eK\nN+WMr1eJ1uuvw1VXwd57h4UJDj0Unn0Wvvc9eOIJeOMNOP74CQwZAuuu2/7imq5BXK289VLxppzx\n9VLxVss5RQnu3qYNOB
ZYrq37d8UN6A94Y2Oj50FdXV0uOnlr5a2XijfljK+XijfljK+XijfljK+X\nijfljK+3JK2FC92ffNK9ocG9f393cF96afc99nAfMcL9tdfieYuplbderWrlrZeKN+WMr5eKt1rO\nmQKNjY2FhTf7eys1pTYvcmBmC4C+7v5udv8t4Gvu/lr+Zb845L3IQVNTEz179my/sZy18tZLxZty\nxtdLxZtyxtdLxZtyxtdLxZtyxtcr1Zo7F/71rzD0c+zY0GvtC1+AffeF/fcPvde++MU43mpFK2+9\nWtXKWy8Vb8oZXy8Vb7WcMwUqWeSgkgLbQqBPUYHtY2ALd3+1nX5rBq0iKoQQQgghuiszZ8Lf/x6K\navfcE+ZXW2cdqK8PRbUdd4Rll43tUgghhKgdOnoVUSGEEEIIIUQNM2cOvPgivPBC2MaPh4ceCiuB\nDhwIQ4eGwtrmm2uBAiGEECIPKimwFcadLu6+EEIIIYQQohP58MNFRbTibfJkWLgw7LPaatC/P1xx\nBey3H6y5ZlzPQgghRHekklVEDXjRzN43s/eBFYAnC/eL2kXG2WefXZNaeeul4k054+ul4k054+ul\n4k054+ul4k0526fnDm+/DQ88EFb3/N73YI89YI01wjxpgwbBscfCbbfBggVhuOe118KECfDee+G5\nm256NieemF9xrVbPm15r8fVS8aac8fVS8VbLOUVzKunBNqTDXHRT1l577ZrUylsvFW/KGV8vFW/K\nGV8vFW/KGV8vFW/KWRkffAC33goTJqzN9tuHHmkzZ4bHll4aNtgANtkEhgwJPzfZBDbaCHr16nhv\nHaFXq1p569WqVt56qXhTzvh6qXir5ZyiOW1e5CAFtMiBEEIIIYSIxeOPwzXXwKhRMG9emB+tUEDb\nZBP4ylegXz9YZpnYToUQQog00CIHQgghhBBCdAFmzQq91a6+GhobYe214fzz4TvfgT59YrsTQggh\nRFtpU4HNzGbSxgUN3P1L7XIkhBBCCCFEN+f550NvtZtugo8+gn32gbvuCj979IjtTgghhBCV0tZF\nDs4Azsy2n2Rt44CGbBuXtf04R29dnkmTJtWkVt56qXhTzvh6qXhTzvh6qXhTzvh6qXhTzsDcuaG3\n2s47w6abwp/+BCefDK+8An/7W1jhs7i4Vqs589arVa289WpVK2+9VLwpZ3y9VLzVck5RgrtXtAF/\nBk4t034qMKZSvVragP6ANzY2eh7U1dXlopO3Vt56qXhTzvh6qXhTzvh6qXhTzvh6qXhLPeerr7qf\ne677qqu6g/vOO7vfeqv7p5/G91YLerWqlbderWrlrZeKN+WMr5eKt1rOmQKNjY1OGNHZ31urKbW2\nQ4snwCfA+mXa1wc+qVSvlra8C2xTpkzJRSdvrbz1UvGmnPH1UvGmnPH1UvGmnPH1UvGWYs75893H\njnXfZx93M/cVV3Q/7TT3556L763W9GpVK2+9WtXKWy8Vb8oZXy8Vb7WcMwUqKbBVvIqomU0BRrj7\nZSXt3wdOc/d1qupKVwNoFVEhhBBCCNEe3n4brrsORo6E11+HAQPgpJPgsMOgV6/Y7oQQQghRCR29\niugw4Doz2wV4NGvbFtgbOK4KPSGEEEIIIWqeefNg5sywvf9+2IpvP/ss3HknLLMMHH54KKwNHBjb\ntRBCCCE6g4oLbO5+o5m9AJwGHJQ1vwDs4O6PLv6ZQgghhBBC1A7vvw9PP12+WFZ8u3D/44/L6/w/\ne/ceJ3VZ////cYmQoWgJBmTigTS1TARF1yOhkpo7eSYPaWCCSZ5SyjJlMbVWw4+JUZLr6ZPiT1M3\nvyIq5XFVNBeP6XpIAz+ICpGnXTzAvn5/vGd0Z1nYOVyz13v3et5vt7nFvPe9z3m+HHYbL98z1zrr\nwIYbwsYbw7Rp8P3vwxe/2LWziIiISFiF7iKax8weM7OjzWx49na0FtdWVVtbm8os33mx
dNOc4fNi\n6aY5w+fF0k1zhs+LpVsa57zlFthySxg9upbDDoOJE+E3v4G//AWefBLeew++9CWoqkoWzGpq4Kqr\noL4eHnwQnn0WFi2ClhZYvjz586GH1nLKKf4W13r6c5D2LN95ac3ynRdLN80ZPi+WbmmeU/KV8hbR\nTznn1gH6tD1mZu+V1agHaWlpSWWW77xYumnO8HmxdNOc4fNi6aY5w+fF0i1Nc777LpxyClx3HRxy\nCAwc2MIFF8D660OvXuF6VTovlm6aM3xeLN00Z/i8WLqleU7JV8omB32Bi4AjgP7tv25mZb40CUeb\nHIiIiIj0XPffD8cdl7zd8/LLkyvTnAvdSkRERNKqmE0OSnmL6MXAaOBHwEfAD0k2PngDOLaEvIpx\nzn3FOXefc+6fzrmnnHOHhe4kIiIiIl3rww/hzDNh9GjYbLPk7Z3HHqvFNREREfGnlLeIVgPHmtn9\nzrmrgYfM7BXn3ALgaOB6rw3LswI41cyecc4NBBqdc7PNbHnoYiIiIiJSec88A8ccAy++CBddBKef\nXv5bQUVERETaK+UKtg2BV7N/fi97H6AB2NNHKV/M7E0zeyb757eApXzWt+KWLl2ayizfebF005zh\n82LppjnD58XSTXOGz4ulW4g5V66Eiy+GnXZK7v/jH8lVbO0X17r7nCGyfOelNct3XlqzfOfF0k1z\nhs+LpVua55R8pSywvQpsnv1zE8lnsUFyZds7PkpVgnNuBLCWmS3qqsccP358KrN858XSTXOGz4ul\nm+YMnxdLN80ZPi+Wbl0957//Dd/6FvzsZ3Dqqcni2je/WflusTyfvvPSmuU7L61ZvvNi6aY5w+fF\n0i3Nc0o7ZlbUDTgdOCX7532A5cCHwEqSt2MWnbmax9kDuB1YBLQCmQ7OmQS8lu0wD9hpNVkbAs8B\nO3fymMMBa2xsNB985fjO8p0XSzfNGT4vlm6aM3xeLN00Z/i8WLp11ZytrWZXX23Wr5/ZppuaPfBA\n13aL5fn0nZfWLN95ac3ynRdLN80ZPi+WbmmeMwaNjY0GGDDcOlnHKnoX0facc5sCI4BXLPt2TB+c\nc/sBuwKNwK3AwWZ2e5uvjwWuBSYAj5Ms/B0ObGVmS9uc1weYC1xhZjd08pjaRVRERESkG1qyBCZO\nhNtuS3YKvewyWH/90K1ERESkOytmF9FSNjn4lHNuHTNbACwoJ6cjZnYXcFf2cTra4+l0kkWz67Ln\nnAh8BxgPXNTmvGuBv3e2uCYiIiIi3dPs2XD88bBiBfzlL3DooaEbiYiISGyK/gw251wv59w5zrlF\nwAfOuS2yx3/lnDvee8OOO/QmuWru77ljllyK9zegqs15u5Fc1XaQc+5J59x859zXu6KjiIiIiFRW\nczP86Edw4IEwYgQ8+6wW10RERCSMUjY5OBv4AfBT4OM2x58DfuihUyEGAL2At9odfwsYlLtjZg+b\n2dpmNtzMdsj+7z87C6+qqmLQoEGMGDGCTCZDJpOhqqqK+vr6vPPuueceMpnMKt8/adIk6urqqKur\n+/TY/PnzyWQyq+zYMWXKFGpra/OOLVy4kEwmQ1NT06fH6urqmD59OpMnT847t6WlhUwmQ0NDQ97x\nWbNmMW7cuFW6jR07lvr6+rxunc3RVkdz1NXVFTwH0OkcbR+zszna6miOurq6gueANT8f559/flFz\ntNXRHHV1dQXPAWt+Pi699NKC5+js+airqyv571VHc7TvXM7zUVdXV/Lfq47maN+jnOejrq6u7J/z\n3Bz77bdfUXO0136Ourq6sn/Oc3McdthhBc/R2fORe9xyf85zc/zgBz8oeI7Ono9cN1+/d3/84x8X\nPEdnz0fu6z5+7w4bNszLz3lOrpuP37vDhg3z8nPetlehc+Ssbo5hw4aV/XOemyOX7+v37rBhwwqe\no7PnI/c9Pn7vTpgwwcvPea5TXV0d8+bB1lvP509/
yvDb3y7ljjtg8OBV58hZ3RxHHXVUKl9fAWQy\nmVS+vgIYPXq0l9dXTU1NeTlpen21dOnSvK+V+3t3xowZqXx9BXDRRRel8vUVwLnnnuvl9dXkyZPz\nstP0+qq2tjbv/HJ/79bV1aXy9VWuWxpfX+W6+fr32rPOOqvgObry9VUl/r220DlyfM3h63ViW1VV\nVYwcOZJMJsOIESMYNGgQo0aNWuW81ersQ9ra34BXgL2zf34f2CL7562B/xabV+Bj5m1yAAzOHtu5\n3Xm1wKNlPI7XTQ5OOukkLzm+s3znxdJNc4bPi6Wb5gyfF0s3zRk+L5ZuPrM+/thsxx1PsrXWMtt5\nZ7OXXiovL61z+s6LpZvmDJ8XSzfNGT4vlm5pnjMGFd3kwDm3HNjazBY4594HtjezV51z2wKPm9l6\nRQUW9pitwEGW3eQg+xbRFuBQy9/44BpgAzM7uMTH0SYHIiIiIinx7rvwr3/l3x55BF58EaZMgZ//\nHNYu6xOFRURERFav0pscPA/swaobGxwGPFlCXtHM7BPnXCOwN5BbdHPZ+5d1RQcRERERKU9rKyxe\nvOoiWu62bNln526wAQwdCt/4BlxzDey0U7DaIiIiIqsoZYHtPOBa59zGJJ/hdohz7mvAscCBvoo5\n59YFvgrkdhDdwjm3PbDMzF4HLgGuyS60PU6yq2hf4BpfHURERESkPK2t8PLLHS+gvfYafPjhZ+du\nvPFni2iZTPLn3G3DDaHDfeVFREREUqDoBTYz+6tzrho4F2gmWXCbD1Sb2VyP3XYE7iN5r6sB07LH\nrwXGm9lNzrkB2ccfCDwFfNvMlnjsICIiIiIlePvt5EqzmTOTxTSAPn1g882TBbN99oEttvhsAW3z\nzeHznw9aWURERKRkpewiipk9ZGb7mtmXzKyvme1uZvf4LGZmD5jZWmbWq91tfJtzZpjZZmb2eTOr\nMrMnfHYoV0c7ZqQhy3deLN00Z/i8WLppzvB5sXTTnOHzelq31la4914YOxa+8hU491zYdVfYeecM\nCxZASws0NcHs2XDZZXDaaVBdDdtuW9ziWug5uyLLd14s3TRn+LxYumnO8HmxdEvznJKvV01NTUEn\nOufGT5069fmampqVla0UztSpUwcDEydOnMjg3D7vZejfvz9Dhw4tv5jnLN95sXTTnOHzYummOcPn\nxdJNc4bP6yndliyBGTNg3Di49FJYuTLZgOC66+Coo2DIkP4MHz6UtUr6T7vldeuuWb7zYummOcPn\nxdJNc4bPi6VbmueMweLFi5k5cybAzJqamsVrOrfgXUSdcyuBwWb2dvb+G8CuZvbv8uqmh3YRFRER\nESmMGdx/P1xxBdx6K6y1Fhx2GEycCLvvrs9LExERke6vUruItn+Z1I8S32IqIiIiIt3T0qWffbba\nyy/D174GtbVw7LHQv3/odiIiIiJhlLKLqIiIiIhExAweeOCzq9UguVrtT3+CPffU1WoiIiIixVyB\nltvNc3X3pZ36+vpUZvnOi6Wb5gyfF0s3zRk+L5ZumjN8Xtq7LV0Kl1wC22wD3/oWzJ8PF14IixbB\n9dfDXnsVtriW9jnTmOU7L5ZumjN8XizdNGf4vFi6pXlOyVfMApsDXnLOLXPOLQPWA57M3W9zXLJm\nzZqVyizfebF005zh82LppjnD58XSTXOGz0tjt48/hr/9DU49dRYbbwxnnQU77AD33ZfsAHrGGTBg\nQNf3qlReWrN858XSTXOGz4ulm+YMnxdLtzTPKfmK2eTguELOM7Nry2oUkDY5EBERkRi99RbMmQN3\n3AH33APvvw9bbgkTJsBxx8FGG4VuKCIiItL1KrLJQbELZ865I4Hbzay5mO8TERERkcoygyefTBbU\nZs+Gf/wjOT5yJEyeDAceCMOG6bPVRERERApVyU0OrgAeA16t4GOIiIiISAE++CB56+fs2clt8WJY\nf30YMwZOOgn2
3x++9KXQLUVERES6p0ousOm/eYqIiIgE9OqryWLaHXfA/fcnn6/2ta/BkUcmV6nt\nthv06RO6pYiIiEj3V8wmB1KkcePGpTLLd14s3TRn+LxYumnO8HmxdNOc4fN8dzvuuHE88EDyNs9t\nt4WhQ5NNCVpb4aKL4OWXk40Kpk1LdgVd0+JamudMa7dY5vSdl9Ys33lpzfKdF0s3zRk+L5ZuaZ5T\n8lXyCrbojRkzJpVZvvNi6aY5w+fF0k1zhs+LpZvmDJ/nK6uxMVk0q68fw3XXwaBBcMABcMEFsM8+\n0K9fuG6+s3znpTXLd14s3TRn+LxYumnO8HmxdEvznJKv4F1Eiw527n1gezPrNp/Bpl1ERUREpDt5\n/nk45xy49VbYais46ij4zndg+HBYS+9TEBERESlLRXYRFREREZF0ePVVqKmBP/8ZNt0UrrkGjj4a\n1tYrOxEREZEgKvnfNhcAn1QwX0RERCQqixbBj36UbFTwt7/B5ZfDiy/CccdpcU1EREQkpIIX2Jxz\nX3TOneycW7+Dr22Q/doXc8fM7Btm9rqvot1RQ0NDKrN858XSTXOGz4ulm+YMnxdLN80ZPq/QrCVL\nks0KvvpVuOkmuPBCeOUVOOmk/I0K0vrPrSc8B12d5Tsvlm6aM3xeLN00Z/i8WLqleU5px8wKugHn\nADev4es3ARcWmpfGGzAcsMbGRvOhurraS47vLN95sXTTnOHzYummOcPnxdJNc4bP6yzrnXfMzjnH\nbL31zPr1M6upMXv33XR0C5XlOy+tWb7zYummOcPnxdJNc4bPi6VbmueMQWNjowEGDLdO1pQK3uTA\nOfcUcIaZ/X01X98bmGZmw8pd9AvF9yYHLS0t9O3bt/xinrN858XSTXOGz4ulm+YMnxdLN80ZPm91\nWc3NMH06XHQRLF8OJ58MP/sZ9O8fvlvoLN95ac3ynRdLN80ZPi+WbpozfF4s3dI8ZwyK2eSgmAW2\n94Gvm9nC1Xx9CPCcma3yFtLuQruIioiISEgffQQzZ8IFF8CyZXDCCXD22fDlL4duJiIiIhKfYhbY\nitnkYCWwppd3XwZai8gTEREREWDFCrjqKthqKzjtNNh/f3jpJfj977W4JiIiItIdFLPA9iRw0Bq+\nfnD2HBEREREpQGsr3HgjfP3rcPzxsPPO8NxzcPXVsNlmoduJiIiISKGKWWC7HDjDOfdj51yv3EHn\nXC/n3MnA6cDvfRfsziZPnpzKLN95sXTTnOHzYummOcPnxdJNc4bNu+8+GDhwMkcemewOOn9+skPo\nNtuE75bmLN95ac3ynRdLN80ZPi+WbpozfF4s3dI8p+QreIHNzG4BLgIuA5Y55550zj0JLAMuBS4x\ns79UpmbpnHO3OueWOedu6urHHjJkSCqzfOfF0k1zhs+LpZvmDJ8XSzfNGSbvww/hJz+B0aOhX78h\nNDTA7Nmwww7hu3WHLN95ac3ynRdLN80ZPi+WbpozfF4s3dI8p+QreJODT7/BuZHA0cBXAQe8BNxg\nZo/7r1c+59yeQD/gODM7opNztcmBiIiIVMxTT8Exx8Arr8Cvfw2nngprFfN+AhERERHpMsVscrB2\nseHZhbRULqZ1xMwedM7tFbqHiIiIxGvlSvjtb+Gcc2DbbeGJJ+Ab3wjdSkRERER8Kfq/mbb9/LXs\n/Z2dc3s653r7qyUiIiLSM7z2GowaBT//efLW0Mce0+KaiIiISE9T8AKbc26wc64B+Mg594Bz7ovO\nuTuAR4H7geecc4N9FXPO7eGcu905t8g51+qcy3RwziTn3GvOueXOuXnOuZ18Pb4PTU1NqczynRdL\nN80ZPi+WbpozfF4s3TRnZfPM4JprYPvt4fXX4YEH4De/gc99Lny37pzlOy+tWb7zYummOcPnxdJN\nc4bPi6VbmueUdsysoBtwHfAwUA3cmP3zg8DGwBCgAbi80LwCHm8/4Dzgu8BKIN
Pu62OBD4Fjga2B\nK0g2XBjQQdZewE0FPOZwwBobG82H6upqLzm+s3znxdJNc4bPi6Wb5gyfF0s3zVm5vLffNjv4YDMw\n+8EPzN59Nz3dunuW77y0ZvnOi6Wb5gyfF0s3zRk+L5ZuaZ4zBo2NjQYYMNw6W1Pq7AT7bPHpDWCX\n7J83BFqBvdt8fTTwr0LzirllH6v9Ats84Hdt7jvg/4CfdvD9o4CbC3gcrwtsCxYs8JLjO8t3Xizd\nNGf4vFi6ac7webF005yVyZs922zgQLP+/c1uuaW8rGKl9Z+b5gyfF0s3zRk+L5ZumjN8Xizd0jxn\nDIpZYCt4F1Hn3HJgKzN7PXv/A2CYmb2SvT8EaDKzvgUFFsE51wocZGa3Z+/3BlqAQ3PHssevATYw\ns4PbHJsLfBNYl+QKt8PN7LHVPI52ERUREZGSNDfDmWfCH/8I++8PV10FgwaFbiUiIiIipSpmF9Fi\nNjl4G2j7GWuXkyxY5XwRaC4irxwDgF7AW+2OvwXkvZQ1s33NbKCZrWdmQ1a3uNZWVVUVgwYNYsSI\nEWQyGTKZDFVVVdTX1+edd88995DJrPLRcEyaNIm6urq8Y/PnzyeTybB06dK841OmTKG2tjbv2MKF\nC8lkMqu8N3r69OlMnjw571hLSwuZTIaGhoa847NmzWLcuHGrdBs7dqzm0ByaQ3NoDs2hOTzP8cMf\nTmHTTWu57jr4wx9g9mz4+OPuN0dPeT40h+bQHJpDc2gOzaE5ip2jqqqKkSNHkslkGDFiBIMGDWLU\nqFGrnLc6xVzB9lfgXjP73Wq+Pgk4xMz2LvjRC9TBFWyDgUVAVdsFM+dcLbCnmVWV+Di6gk1EREQK\n9skncMEFcP75MGIE/O//wlZbhW4lIiIiIj5U5Ao2M/vu6hbXsv4BnFpoXpmWkmx8MLDd8YHAm13U\noVPtV1rTkuU7L5ZumjN8XizdNGf4vFi6ac7y8l58EXbbLVlcO+ccaGgofnFNz0H4vLRm+c6LpZvm\nDJ8XSzfNGT4vlm5pnlPyFfMW0c48AWzmMW+1zOwToBH49Go555zL3n+kKzoUoqWlJZVZvvNi6aY5\nw+fF0k1zhs+LpZvmLE1zcwt/+APssAO88w488ghMmQK9e4fvltZ/bpozfF4s3TRn+LxYumnO8Hmx\ndEvznJKv4LeIrjbAua8C44EfABuZWQkvLzvMXRf4KsnuoPOBnwD3AcvM7HXn3BHANcCJwOPA6cBh\nwNZmtqTEx9RbREVERKRDZvDGG3DCCTBnDvzoR3DxxbDuuqGbiYiIiEglFPMW0bVLeQDn3OeBw4Ef\nArsBDwHnAbeVkrcaO5IsqOW2RJ2WPX4tMN7MbnLODcg+7kDgKeDbpS6uiYiISM/T2gqLFsG778IH\nHyS399//7M/F3l+5MtkZ9M47k51CRURERESgyAU259xOJItq3wP+BVwP7AqcZGbP+yxmZg/QyVtY\nzWwGMMPn44qIiEj39cEH8Pjj8OijyVs3H30U/vvfjs/t1Qv69YP11vvslrs/eHD+/dyf+/WD/faD\n/v27di4RERERSbeCP4PNOfcMcDPwH2BXMxtuZtNIri6TDrTfYjYtWb7zYummOcPnxdJNc4bPi6Vb\nd5/TDF57Da6/HiZNguHDYYMNYO+9k7dutrbCaafB9dcv5ZFH4JlnkvOXLIHly5MdQP/7X3j9dXjh\nBfjHP+Dee+H22+GGG2DmTJg2DaZOhcmT4cQT4eijwUzPQcgs33lpzfKdF0s3zRk+L5ZumjN8Xizd\n0jyn5Ctmk4OvAQ+SvG3T69VqPdX48eNTmeU7L5ZumjN8XizdNGf4vFi6dbc5P/wQHn4YfvtbOOSQ\n5CqzLbaAY46Bv/8dhg2DK66A556DZcuSz0k791y48cbxVFXBdtvBZpvBgAGwzjrgnL9upepuz0Ea\nsnznpTXLd14s3TRn+LxYumnO8HmxdEvznN
KOmRV0AzYGzgZeARYBvwV2AD4Gti00J803YDhgjY2N\n5oOvHN9ZvvNi6aY5w+fF0k1zhs+LpVva51y0yOzmm81+8hOzXXYx693bDMz69jUbNcrsF78wu+MO\ns6VLu66X77xYumnO8HmxdNOc4fNi6aY5w+fF0i3Nc8agsbExty/AcOtkTamkXUSdc6NJdg49BFgn\nu9h2pZm9VO6CX0jaRVRERCQMM2hqggcfTG4PPwwLFiRf22wz2HVXqKpK/veb34S1S9qmSURERESk\ncBXfRdTM7gXudc5tABxNsth2pnPuOTP7ZimZIiIiEo+VK+HZZz9bUHvwweSz0Xr1ghEj4NBDYbfd\nkkW1wYNDtxURERERWbOy/vuvmb1LsovnDOfcMJKFNhEREZE8n3wC8+cnC2kPPAANDfDuu9CnrKBn\njAAAIABJREFUD+y8M0ycCHvumSyorbde6LYiIiIiIsUpZpODNTKzp8zsFF95PUFdXV0qs3znxdJN\nc4bPi6Wb5gyfF0u3Ss754YfJQtqvfgX77gtf+ALssgvU1MBHH8GZZyZff/fdZNEtd15uca27zJmm\nLN95ac3ynZfWLN95sXTTnOHzYummOcPnxdItzXNKPm8LbLKq+fPX+PbcYFm+82LppjnD58XSTXOG\nz4ulm8+sDz6Av/51Pr/8ZXIl2gYbwKhRMG0afO5zycLavHnwzjswdy6fnrfOOpXvFsvz6TsvrVm+\n89Ka5Tsvlm6aM3xeLN00Z/i8WLqleU7JV9ImBz2VNjkQEREpzrJlUFsL06fD8uWw0UbJwlnutt12\nyeeqiYiIiIh0NxXf5EBERETi1twMl12WLK6tWAE/+QkcfTRsvTU4F7qdiIiIiEjX0gKbiIiIFOzj\nj+HKK5PPTPvPf+DEE+Hss2HgwNDNRERERETCKWiBzTlX8OYFZnZZ6XVEREQkjVpb4cYb4Zxz4LXX\n4PvfTz5XbfPNQzcTEREREQmv0E0OTm93uxC4FKjJ3i7NHjvNe8NuLJPJpDLLd14s3TRn+LxYumnO\n8HmxdCskywxmz4YddkjeAvqNb8Azz8C11666uNad5wyVF0s3zRk+L5ZumjN8XizdNGf4vFi6pXlO\nyderpqam05Nqamp+l7tNnTp1CbAJMMbMflpTU1M7derUG4GdgItramqerWzlypk6depgYOLEiRMZ\nPHhw2Xn9+/dn6NCh5RfznOU7L5ZumjN8XizdNGf4vFi6dZbV0ADHHAO/+Q187WswaxZMngxf+lL4\nbqGyfOfF0k1zhs+LpZvmDJ8XSzfNGT4vlm5pnjMGixcvZubMmQAza2pqFq/p3KJ3EXXO/Qs4zMye\nbHd8BPAXM+u2bxbRLqIiIiLJFWq/+MVnV65deCF8+9vavEBERERE4lLMLqKFvkW0rcF0/NltvQB9\nxLGIiEg39eqryRVrw4bBiy8mn7n2xBOw335aXBMRERERWZNSFtj+DlyRvdoL+PTqtT8Af/NVTERE\nRLrGm2/CpEnJ20Dvuw/++Ed4/nkYOxbWKuWVgoiIiIhIZEp52TweeBN4wjn3kXPuI+Bx4C3ghz7L\ndXf19fWpzPKdF0s3zRk+L5ZumjN8Xizdrr++nrPPhqFDk89Xu+ACePllmDABevcO2y2tWb7zYumm\nOcPnxdJNc4bPi6Wb5gyfF0u3NM8p+YpeYDOzJWZ2ALA1cHj2to2ZHWBmb/su2J3NmjUrlVm+82Lp\npjnD58XSTXOGz+vp3VauhN//HsaNm8Wll8JppyVvD/3pT6Fv37Dd0p7lOy+WbpozfF4s3TRn+LxY\numnO8HmxdEvznJKv6E0OejJtciAiIj3ds88mV6jNmwc//CGcdx542DhbRERERKTHKWaTg442K1gj\n51wv4AfA3sCXaHcVnJmNLjZTREREKuvDD+H886G2FrbcEh56CHbfPXQrEREREZGeoegFNuB3JAts\ns4HnAF
0CJyIikmL33QcTJ8KCBXDOOfCzn8HnPhe6lYiIiIhIz1HKAtv3gCPM7E7fZURERMSfZcvg\nzDPh6qthjz3gr3+FbbYJ3UpEREREpOcpZRfRj4FXfBfpicaNG5fKLN95sXTTnOHzYummOcPndfdu\nZsmuoNtsA7feCjNnwv33r7q41t3nDJHlOy+WbpozfF4s3TRn+LxYumnO8HmxdEvznJKvlAW2acCp\nzjnnu0xPM2bMmFRm+c6LpZvmDJ8XSzfNGT6vO3f797/hO9+Bo46CvfaCF16AE06AtTr4f/zuPGeo\nLN95sXTTnOHzYummOcPnxdJNc4bPi6VbmueUfEXvIuqcuw34FrAM+CfwSduvm9kh3tp1Me0iKiIi\n3dWKFXDZZclnrG24IcyYAdXVoVuJiIiIiHRfFd1FFHgHuK2UYiIiIuLf/PnJVWpPPgknn5zsFtqv\nX+hWIiIiIiLxKHqBzcz0hl0REZEUaG6Gmhr4n/+BbbeFefNg5MjQrURERERE4lPKZ7BJgRoaGlKZ\n5Tsvlm6aM3xeLN00Z/i87tDt7rvhG9+Ayy9PrlhrbCx+ca07zJm2LN95sXTTnOHzYummOcPnxdJN\nc4bPi6VbmueUdsys6BtwGHATMA+Y3/ZWSl5absBwwBobG82H6upqLzm+s3znxdJNc4bPi6Wb5gyf\nl+ZuY8ZU21FHmYHZ3nubvfxyOnr5zktrlu+8WLppzvB5sXTTnOHzYummOcPnxdItzXPGoLGx0QAD\nhlsna0qlbHJwCnABcA0wAbgaGArsBPzezM4uf9kvDN+bHLS0tNC3b9/yi3nO8p0XSzfNGT4vlm6a\nM3xeGru9+SbcdBPU1LTgXF8uuQSOPRbK2dM7jXOmPct3XizdNGf4vFi6ac7webF005zh82LpluY5\nY1DMJgelLLA1AVPNbJZz7n1gezN71Tl3HrChmf241OKV4Jw7EPgt4ICLzKxuDedqF1EREUmNf/0L\nbrstuT36KKy1Fhx5JFxyCWy0Ueh2IiIiIiI9W6V3ER0CPJL983Igt0/Z/5K8ZTQ1C2zOuV7ANGAv\n4ANgvnPuVjP7b9hmIiIiqzKDZ575bFHtmWdgnXVgzBi46iqorob+/UO3FBERERGR9kpZYHsT2BBY\nACwEdgGeBjYnuUosTUYCz5nZmwDOudnAGOD/C9pKREQka+XK5Oq03KLaa6/BBhvAgQfCuefCt78N\n660XuqWIiIiIiKxJKbuI3gtksn++Gvgf59xckkWr23wV8+TLwKI29xcBG3fVg0+ePDmVWb7zYumm\nOcPnxdJNc4bPq3S3jz+Gu+6CiRPhy1+GPfaAG25IrlS76y54+23485/h0ENXXVzrTnP2xCzfebF0\n05zh82LppjnD58XSTXOGz4ulW5rnlHylXME2gezCnJn93jn3H2BX4HbgCl/FnHN7AJOBEcBg4CAz\nu73dOZOAM4FBJFfRnWxm//DVoVxDhgxJZZbvvFi6ac7webF005zh8yrR7YMPYM6c5Cq12bPhvfdg\n6NBko4KDD4Zddkk+Y60ru8XyHMQyp++8tGb5zktrlu+8WLppzvB5sXTTnOHzYumW5jklX9GbHHQV\n59x+JAt3jcCtwMFtF9icc2OBa0kW/B4HTgcOB7Yys6XZc6qAyWZ2SPb+/wCPmdmNq3lMbXIgIiLe\nrFwJ118PN98Mc+fCRx/B9tsnC2oHHwzbbVfeLqAiIiIiIlI5ld7koEuY2V3AXQDOdfivH6cDV5jZ\nddlzTgS+A4wHLsqe8zjwdefcYOB9YD/gvApXFxER4ZNP4Ljj4MYbYddd4cIL4aCDYIstQjcTERER\nERHfUrvAtibOud4kbx29MHfMzMw59zegqs2xlc65M4D7STZgqNUOoiIiUmkffgjf+x7ceSfcdBMc\ndljoRiIiIiIiUkmlbHKQBgOAXsBb7Y6/RfJ5bJ8yszvM7GtmtpWZ1RUS
XlVVxaBBgxgxYgSZTIZM\nJkNVVRX19fV5591zzz1kMplVvn/SpEnU1dXR1NT06bH58+eTyWRYunRp3rlTpkyhtrY279jChQvJ\nZDJ539/U1MT06dNX+UDClpYWMpkMDQ0NecdnzZrFuHHjVuk2duxY6uvr87I7m6OtjuZoamoqeA6g\n0znant/ZHG11NEdTU1PBc8Can48777yzqDna6miOpqamgueANT8fjz76aMFzdPZ8NDU1lfz3qqM5\n2j9eOc9HU1NTyX+vOpqjfUY5z0dTU1PZP+e5OY4//vii5miv/RxNTU1l/5zn5jjjjDMKnqOz5yP3\nPeX+nOfmOP/88wueo7PnI/e/pfy9am6GTAbuvhvq6+Hmm8dy+eWXFzxHZ89HrpuP37ujR4/28nOe\nk3tcH793R48e7eXnvG2vQufIWd0co0ePLvvnPDdH7mu+fu+OHj264Dk6ez5y3Xz83p02bZqXn/Pc\nHLkuPn7v/vKXv0zl66vcuWl8fQVw1FFHeXl91dTUlNctTa+vli5dmtet3N+7Tz75ZCpfXwHce++9\nqXx9BXDLLbd4eX01efLkvONpen1VW1ub163c37tNTU2pfH2V6+br32t9vr7KdfP177XXX399wXN0\n5eurSvx7baFz5Piaw9frxLaqqqoYOXIkmUyGESNGMGjQIEaNGrXKeatlZqm/Aa1Aps39wdljO7c7\nrxZ4tIzHGQ5YY2Oj+VBdXe0lx3eW77xYumnO8HmxdNOc4fNKzXrnHbPddzdbbz2z++4rP68jac3y\nnZfWLN95sXTTnOHzYummOcPnxdJNc4bPi6VbmueMQWNjowEGDLfO1pQ6OyENtw4W2HoDn7Q9lj1+\nDXBbGY/jdYFtwYIFXnJ8Z/nOi6Wb5gyfF0s3zRk+r5SsJUvMRoww+8IXzObNKz9vddKa5TsvrVm+\n82LppjnD58XSTXOGz4ulm+YMnxdLtzTPGYNiFtgK3kXUOXdvgVfErfrehDI551qBgyx/F9F5JDuC\nnpq974CFwGVmdnGJj6NdREVEpGiLF8O++8Lbbye7hW6/fehGIiIiIiJSrkrtIjoKWADMJrl6rKKc\nc+sCXyXZnABgC+fc9sAyM3sduAS4xjnXSLJb6OlAX5Kr2ERERLrEggWwzz6wfDk8+CBsvXXoRiIi\nIiIi0tWKWWD7GTAOOBy4HrjKzJ6rSKvEjsB9JJfiGTAte/xaYLyZ3eScGwCcBwwEngK+bWZLKthJ\nRETkUy+9lCyurb02PPQQbL556EYiIiIiIhJCwbuImtnFZrYtcBDQD3jYOfe4c+5E59z6vouZ2QNm\ntpaZ9Wp3G9/mnBlmtpmZfd7MqszsCd89ytF+t4u0ZPnOi6Wb5gyfF0s3zRk+r5CsZ5+FPfeEddft\nfHEtrf/cuvtzECLLd14s3TRn+LxYumnO8HmxdNOc4fNi6ZbmOSVfwQtsOWb2qJmdQLKT5++B8cAb\nlVhk6+5aWlpSmeU7L5ZumjN8XizdNGf4vM6ynngCRo2CwYOTt4VuvHF6uoXK8p2X1izfebF005zh\n82LppjnD58XSTXOGz4ulW5rnlHwFb3Kwyjc6tzvJ4trhwD+Bb5nZco/dupw2ORARkc489BB85zvw\n9a/DnDnwhS+EbiQiIiIiIpVQzCYHRV3B5pz7snPuF865l4C/AMuAnc1sl+6+uCYiItKZu++Gb38b\ndtwx2S1Ui2siIiIiIgJFbHLgnLsT+BZwDzAZmG1mKypVTEREJE3q62HsWNh3X7j5Zvj850M3EhER\nERGRtCjmCrb9SK5YGwJMAR53zs1vf6tIy25q6dKlqczynRdLN80ZPi+WbpozfF77rOuvh8MOg+9+\nF269tfjFtbT+c+tOz0FasnznxdJNc4bPi6Wb5gyfF0s3zRk+L5ZuaZ5T8hWzwDYVmAnUA39dw02y\nxo8f3/lJAbJ858XSTXOGz4ulm+YM
n9c2a+ZM+P73k9usWdCnT3q6pSnLd15as3znxdJNc4bPi6Wb\n5gyfF0s3zRk+L5ZuaZ5T2jGzgm4kV66tVej53fEGDAessbHRfPCV4zvLd14s3TRn+LxYumnO8Hm5\nrGnTzMDsxz82W7my/Dwf0prlOy+tWb7zYummOcPnxdJNc4bPi6Wb5gyfF0u3NM8Zg8bGRgMMGG6d\nrCkVvIuoc24lMNjM3q7MUl942kVUREQAzOBXv4IpU+Css+DCC8G50K1ERERERKQrVWoX0W73rxbO\nuUnOudecc8udc/OcczuF7iQiIulmBj/9abK4dsEF8Otfa3FNRERERETWrOBdRLMKu9wtBZxzY4Fp\nwATgceB04G7n3FZmpk/1ExERAFauhBdegMceg8cfh0cegeeeg9/9Dk45JXQ7ERERERHpDoq5gg3g\nV865S9Z0q0jL0pwOXGFm15lZE3Ai0AJ02Sf61dXVpTLLd14s3TRn+LxYumnOyuWZwcKF8Je/JFep\njRoFG2wA220HJ5yQLK6NHAknn1zndXEtrf/c9HctfF4s3TRn+LxYumnO8HmxdNOc4fNi6ZbmOSVf\nsQts2wE7rOE2zGu7EjnnegMjgL/njlnyYXN/A6q6qsf8+Wt8e26wLN95sXTTnOHzYummOf3l/fe/\nMHcunH8+ZDIweDBsuikcfjjceCMMGADnngv33QfvvgvPPgt1dbBypZ6D0HlpzfKdF0s3zRk+L5Zu\nmjN8XizdNGf4vFi6pXlOyVfMJgetwKDusMmBc24wsAioMrPH2hyvBfY0sw4X2bTJgYhI9/Xhh/D0\n08nbPHO3l15KvrbBBsmVabnbTjsli20iIiIiIiKrU8wmB8V8Blu3+fw1EREJb8UKePNNWLSo49uS\nJclbNn1YuRL+9S/45BPo0weGDYMxY+Ccc5IFta9+FdYq9pptERERERGRAvXUXUSXAiuBge2ODwTe\n7Oybq6qqGDRoECNGjCCTyZDJZKiqqqK+vj7vvHvuuYdMJrPK90+aNGmV9zXPnz+fTCbD0qX5+ytM\nmTKF2travGMLFy4kk8nQ1NSUd3z69OlMnjw571hLSwuZTIaGhoa847NmzWLcuHGrdBs7dqzm0Bya\nQ3OUPcf770NTE9xww0J22CHD6ac3MWkSHHRQcnXYBhtMp3fvyWyyCeyyCxx6KJxxRguXXZbh6acb\n2Gij5PPP9tkHNtlkFq2t49hnH/JuH388ls03r887tuWW99DSklnl3D59JnH44XU8/ji8916yYcG4\ncfO56aYMG264NG9xrSc+H5pDc2gOzaE5NIfm0ByaQ3NojvLmqKqqYuTIkWQyGUaMGMGgQYMYNWrU\nKuetTjFvET0OuNHMPio4PSDn3DzgMTM7NXvfAQuBy8zs4tV8j94iKiKyGtOnw4wZydVn77+f/7UN\nN4SNN17zbcAAcN3pP9WIiIiIiEjUinmLaDFXsD0KbN/2gHNub+fcfc65x51zvyi+akVdApzgnDvW\nObc18EegL3BNVxXoaLU2DVm+82LppjnD58XSLY1zmsFFF8Hbb2c491y44QZ44AF45RVoaYH//Aee\neQbmzIErr4SpU2HCBPjOd5K3a2600aqLa2mcsxJZvvPSmuU7L61ZvvNi6aY5w+fF0k1zhs+LpZvm\nDJ8XS7c0zyn5etXU1BR04tSpU68EvlBTU3MfgHNuc6ABeA14Cjhl6tSpy2tqauZVqGtRampq/jl1\n6tT/Ar8EziD5DLmjzOyV1X3P1KlTBwMTJ06cyGAPn37dv39/hg4dWnaO7yzfebF005zh82LplsY5\n//lPqK2Fc87pz89+NpTttkt249xwQ+jdO1yvSuXF0k1zhs+LpZvmDJ8XSzfNGT4vlm6aM3xeLN3S\nPGcMFi9ezMyZMwFm1tTULF7TucW8RfR14AgzezR7/5fAYWY2LHv/eODk3P3uSG8RFRHp2MUXQ01N\n
cqXaOuuEbiMiIiIiIlJ5lXqL6ADg/9rc/xbw/9rcvx/YrIg8ERHpJu68E0aP1uKaiIiIiIhIR4pZ\nYFsGDAZwzq0F7Ai0fTtoH7rXTqMiIlKA996DhgbYf//QTURERERERNKpmAW2+4FznHObAKdlv/f+\nNl/fFvi3r2I9QfttatOS5Tsvlm6aM3xeLN3SNuff/w4rViQLbD15zkpl+c5La5bvvLRm+c6LpZvm\nDJ8XSzfNGT4vlm6aM3xeLN3SPKfkK2aB7Wxga2ABUAv81Mya23z9+8C9Hrt1e7NmzUpllu+8WLpp\nzvB5sXRL25xz5sDWW8Pmm/fsOSuV5TsvrVm+89Ka5Tsvlm6aM3xeLN00Z/i8WLppzvB5sXRL85yS\nr+BNDgCcc2sDXweWmNkb7b62PfC6mS3zW7HraJMDEZF8ZrDJJnDEEXDJJaHbiIiIiIiIdJ1KbXKA\nma0ws6fbL65lvQ/cWEyeiIik23PPwaJF+vw1ERERERGRNSlqga0T/YC9PeaJiEhgc+ZA376w556h\nm4iIiIiIiKSXzwW2VHLOHeica3LOveicOz50HxGR7mTOHNh7b/jc50I3ERERERERSa8evcDmnOsF\nTANGASOAnznnvthVjz9u3LhUZvnOi6Wb5gyfF0u3tMz53nvQ0JD/9tCeOGels3znpTXLd15as3zn\nxdJNc4bPi6Wb5gyfF0s3zRk+L5ZuaZ5T8vXoBTZgJPCcmb1pZh8As4ExXfXgY8b4eyifWb7zYumm\nOcPnxdItLXP+7W+wYkX+AltPnLPSWb7z0prlOy+tWb7zYummOcPnxdJNc4bPi6Wb5gyfF0u3NM8p\n+QreRdQ59ySwppP7AluaWS8fxXxwzh0K7GVmp2Tvnwm0mlmHe+FpF1ERkc+ccAI8/DA8/3zoJiIi\nIiIiIl2vmF1E1y4it76sVkVyzu0BTCZ5a+dg4CAzu73dOZOAM4FBwNPAyWb2j67sKSLSE5kln782\ndmzoJiIiIiIiIulX8AKbmU0tJtg5txvwhJl9VHSrxLrAU0AdcGsH+WNJPl9tAvA4cDpwt3NuKzNb\nmj3tDeArbb5tY+CxEvuIiETj2Wdh0aL8t4eKiIiIiIhIxyr5GWxzSBa0SmJmd5nZuWb2V8B1cMrp\nwBVmdp2ZNQEnAi3A+DbnPA583Tk32Dm3HrAfcHepnYrV0NCQyizfebF005zh82LploY558yBddeF\nPfYoP2t10jBnV2T5zktrlu+8tGb5zoulm+YMnxdLN80ZPi+WbpozfF4s3dI8p7RjZhW5Ae8DW3jK\nagUybe73Bj5peyx7/BrgtnbHDgReBF4Cju/kcYYD1tjYaD5UV1d7yfGd5Tsvlm6aM3xeLN3SMOde\ne5llMn6yVicNc3ZFlu+8tGb5zktrlu+8WLppzvB5sXTTnOHzYummOcPnxdItzXPGoLGx0Uj2Ixhu\nna1ddXZCqbcKL7ANzh7bud15tcCjZTzOcMD69OljAwcOtOHDh1t1dbVVV1fbLrvsYrfddlveP+i7\n7767w7+cJ510kl155ZXW3Nyc96RUV1fbkiVL8s4999xz7Te/+U3esQULFlh1dbW98MILnx5rbm62\nyy67zM4888y8c5ubm626utoeeuihvOM33HCD/eAHP1il2xFHHGG33XZbXrfO5mirozmam5sLnsPM\nOp2jbbfO5mirozmam5sLnsNszc/H/Pnzi5qjrY7maG5uLngOszU/HwsWLCh4js6ej+bm5pL/XnU0\nR9vns7M5Ons+mpubS/571dEc7buV83w0NzeX/XOem+PUU08tao722s/R3Nxc1N+r++9vNKi2iy9e\ndY7zzjuv4Dk6ez5y//zL/TnPzTFjxoy8Y+U8H7luvn7v3njjjQXP0dnPR66bj9+7BxxwgJef87aZ\nhc6Rs7o5DjjgAC8/5217FTpHzurmOOCAA8r+Oc/Nkevm6/fuAQ
ccUPAcnT0fuW4+fu9effXVXn7O\nc3Pkuvn4vfvb3/42la+vzMx+/vOfp/L1lZnZhAkTvLy+euGFF/K6pen11ZIlS/K6lft7d8mSJal8\nfWVm1tTUlMrXV2ZmDQ0NXl5fnXnmmXnduvr1VWfPR9tu5f7ebW5uTuXrq1y3NL6+ynXw9e+1c+fO\nLXiOrnx9VYl/ry10jhxfc/h6ndjWLrvsYjvttJNVV1fb8OHDbeDAgdavX7+CF9gK3kW0WM6594Ht\nzezVTs47Crgie9eA/c3s4XbntNJmkwPn3GBgEVBlZo+1Oa8W2NPMqkrsrF1ERSR6t9wChx0G//43\nbLpp6DYiIiIiIiJhVGoX0Ur5KzCvzf1FBXzPUmAlMLDd8YHAm556iYhEac4c2GYbLa6JiIiIiIgU\nqpKbHBR0aZyZNZvZq21une46amafAI3A3rljzjmXvf9IqYVFRGJnliywHXBA6CYiIiIiIiLdRyUX\n2Dra+bPwb3ZuXefc9s65YdlDW2Tvb5K9fwlwgnPuWOfc1sAfgb4kGx2kwuTJk1OZ5Tsvlm6aM3xe\nLN1CzvnMM/DGG7D//uVndSaW59N3XlqzfOelNct3XizdNGf4vFi6ac7webF005zh82LpluY5JV/B\nbxF1zn0e2Be4z8zeb/e19YFRwD1m9iGAmfUrs9uOwH0kV8IZMC17/FpgvJnd5JwbAJxH8tbQp4Bv\nm9mSMh/XmyFDhqQyy3deLN00Z/i8WLqFnHPOHFh3Xdh99/KzOhPL8+k7L61ZvvPSmuU7L5ZumjN8\nXizdNGf4vFi6ac7webF0S/Ockq/gTQ6cc6eS7OS592q+/jdgrpnVeuzXpbTJgYjEbq+94ItfhPr6\n0E1ERERERETCKmaTg2LeIno0cOkavn4pcGgReSIikiLvvgsPP7z6t4eKiIiIiIhIx4pZYNsSeHoN\nX38me46IiHRDc+fCypVaYBMRERERESlWMQtsawMbreHrG1HEZ7rFoKmpKZVZvvNi6aY5w+fF0i3U\nnHPmwLbbwpo+lqEnzNnVWb7z0prlOy+tWb7zYummOcPnxdJNc4bPi6Wb5gyfF0u3NM8p7ZhZQTdg\nHvCzNXz958C8QvPSeAOGA9bY2Gg+VFdXe8nxneU7L5ZumjN8XizdQszZ2mo2eLDZmWeWn1WoWJ5P\n33lpzfKdl9Ys33mxdNOc4fNi6aY5w+fF0k1zhs+LpVua54xBY2NjbuPN4dbZmlJnJ9hni08TgA+A\nAzv4WnX2axMKzUvjzfcC24IFC7zk+M7ynRdLN80ZPi+WbiHmfPLJ5P8R/v738rMKFcvz6TsvrVm+\n89Ka5Tsvlm6aM3xeLN00Z/i8WLppzvB5sXRL85wxKGaBreBdRAGcc38GjgKagBezh7cGtgJuMrMj\nS76ULgW0i6iIxOrXv4YLL4T//Af69AndRkREREREJLxK7SKKmR0DfA94iWRR7WskC21HdvfFNRGR\nmM2ZA3vvrcU1ERERERGRUhS9KYGZ3QTcVIEuIiISwDvvwCOPwIwZoZuIiIiIiIh0T0VdwdYR59xA\n59wa9pyLV21tbSqzfOfF0k1zhs+LpVtXzzl3LqxcCfvvX35WMWJ5Pn3npTXLd15as3z65Q2hAAAg\nAElEQVTnxdJNc4bPi6Wb5gyfF0s3zRk+L5ZuaZ5T8hW8wOac6+ec+7NzboFz7lrnXB/n3O+BxcBr\nzrkHnHPrV65q99PS0pLKLN95sXTTnOHzYunW1XPOmQNf/zpsskn5WcWI5fn0nZfWLN95ac3ynRdL\nN80ZPi+WbpozfF4s3TRn+LxYuqV5TslX8CYHzrnpwD7ADOAQ4F1gKHAi0Av4A1BvZmdXpmrlaZMD\nEYmNGWy8MRxzDFx0Ueg2IiIiIiIi6VHMJgfFfAbbd4HjzOw+59wtwP8BGTN7GMA591NgGtBtF9hE\nRGLz9NOweHFhbw8VERERER
GRjhXzGWxfAl4BMLM3gOUku4nmPAcU8AYjERFJizvvhPXWg912C91E\nRERERESk+ypmge0/wEZt7v8VeKfN/fWAj3yU6imWLl2ayizfebF005zh82Lp1pVzzpkD++wDffqU\nn1WsWJ5P33lpzfKdl9Ys33mxdNOc4fNi6aY5w+fF0k1zhs+LpVua55R8xSywPQPslLtjZkeZ2dtt\nvr4T8IKvYj3B+PHjU5nlOy+WbpozfF4s3bpqznfegUcfhQMOKD+rFLE8n77z0prlOy+tWb7zYumm\nOcPnxdJNc4bPi6Wb5gyfF0u3NM8p7ZhZQTdgQ+ALa/j6/sCoQvPSeAOGA9bY2Gg++MrxneU7L5Zu\nmjN8XizdumrOm24yA7PXXy8/qxSxPJ++89Ka5TsvrVm+82LppjnD58XSTXOGz4ulm+YMnxdLtzTP\nGYPGxkYDDBhunawpFbyLaCGcc98ws+e8BXYx7SIqIjEZNw6eeAKefTZ0ExERERERkfQpZhfRYt4i\n2iHnXD/n3ATn3OPA0+XmiYhI5bW2wl13afdQERERERERH0peYHPO7emcuxZYDJwJ3Avs4quYiIhU\nztNPw5tvFvf5ayIiIiIiItKxohbYnHODnHNnOedeBm4G3gM+BxxkZmeZ2T8qUbK7qqurS2WW77xY\numnO8HmxdOuKOe+8E/r1g912Kz+rVLE8n77z0prlOy+tWb7zYummOcPnxdJNc4bPi6Wb5gyfF0u3\nNM8p+QpeYHPO/T/gReCbwGnAl83s5EoV6wnmz1/j23ODZfnOi6Wb5gyfF0u3rphzzhzYZx/o3bv8\nrFLF8nz6zktrlu+8tGb5zoulm+YMnxdLN80ZPi+WbpozfF4s3dI8p+QreJMD59wK4DLgD2b2cpvj\nnwDbm9nzlanYdbTJgYjE4L//hQED4I9/hBNOCN1GREREREQknSq1ycHuQD+g0Tn3mHPux865AWX0\nFBGRAObOTTY50AYHIiIiIiIifhS8wGZm88zsBGAwcAXwPeCNbMa+zrl+lakoIiI+3XknbLcdfOUr\noZuIiIiIiIj0DEXvImpmzWZ2lZntDmwHTAPOAt52zt3uu6CIiPjT2gp33aWr10RERERERHwqeoGt\nLTN70cx+CnwFONJPpZ4jk8mkMst3XizdNGf4vFi6VXLOp56Ct96CAw4oP6tcsTyfvvPSmuU7L61Z\nvvNi6aY5w+fF0k1zhs+LpZvmDJ8XS7c0zyn5etXU1JQdUlNTYzU1NU01NTWzyq8UztSpUwcDEydO\nnMjgwYPLzuvfvz9Dhw4tv5jnLN95sXTTnOHzYulWyTmvugqeeAKmT4devcJ2i+X59J2X1izfeWnN\n8p0XSzfNGT4vlm6aM3xeLN00Z/i8WLqlec4YLF68mJkzZwLMrKmpWbymcwveRTQG2kVURHq63XaD\nQYPglltCNxEREREREUm3Su0iKiIi3diyZTBvnj5/TURERERExDctsImIRGLu3GSTAy2wiYiIiIiI\n+KUFtgqqr69PZZbvvFi6ac7webF0q9Scc+bAN78JG29cfpYPsTyfvvPSmuU7L61ZvvNi6aY5w+fF\n0k1zhs+LpZvmDJ8XS7c0zyn5Slpgc85t6Zyb4Jz7pXPu3LY33wXL5Zy71Tm3zDl3U1c/9qxZ/vZ8\n8JnlOy+WbpozfF4s3SoxZ2trssBW7tVraZ8zjVm+89Ka5TsvrVm+82LppjnD58XSTXOGz4ulm+YM\nnxdLtzTPKfmK3uTAOXcC8AdgKfAm0DbAzCxVuwM45/YE+gHHmdkRnZyrTQ5EpEdqbIQdd4T774e9\n9grdRkREREREJP2K2eRg7RLyfwmcbWa1pZTramb2oHNO/zopIlGbMwfWXx923TV0ExERERERkZ6n\nlLeIfhG42XcRERGpnDvvhH33hd69QzcRERERERHpeUpZYLsZGOO7SHvOuT2cc7c75xY551qd
c5kO\nzpnknHvNObfcOTfPObdTpXuJiHQ3y5bBY49p91AREREREZFKKWWB7RXgV865a5xzZzjnTml789ht\nXeAp4CTyP+cNAOfcWGAaMAXYAXgauNs5N8Bjh7KMGzculVm+82LppjnD58XSzfechxwyjtZW2G+/\n8rPSPKe6hc3ynZfWLN95sXTTnOHzYummOcPnxdJNc4bPi6VbmueUfKV8BtsE4ANgr+ytLQMuK7cU\ngJndBdwF4JxzHZxyOnCFmV2XPedE4DvAeOCidue67K1LjRnj70I/n1m+82LppjnD58XSzfecra1j\n2H572Hjj8rPSPKe6hc3ynZfWLN95sXTTnOHzYummOcPnxdJNc4bPi6VbmueUfEXvIhqCc64VOMjM\nbs/e7w20AIfmjmWPXwNsYGYHtzk2F/gmyRVxy4DDzeyx1TyOdhEVkR6ltRUGDYLjj4df/zp0GxER\nERERke6j0ruIAuCc6wNsDvzLzFaUmlOiAUAv4K12x98Cvtb2gJnt21WlRETSZv58WLJEn78mIiIi\nIiJSSUV/Bptzrq9zro7kCrJ/AkOyx6c7584qIe8o59z72dt7zrndis3wraqqikGDBjFixAgymQyZ\nTIaqqirq6+vzzrvnnnvIZFbZe4FJkyZRV1eXd2z+/PlkMhmWLl2ad3zKlCnU1tbmHVu4cCGZTIam\npqa849OnT2fy5Ml5x1paWshkMjQ0NOQdnzVrVofvrR47dqzm0ByaI6I55syBDTaADz7o3nPkdPfn\nQ3NoDs2hOTSH5tAcmkNzaA7Nkc45qqqqGDlyJJlMhhEjRjBo0CBGjRq1ynmrZWZF3YDfAU8Au5N8\nFtsW2ePfBZ4sIW9dYIs2t891cE4rkGlzvzfwSdtj2ePXALcV26HN9w8HrLGx0Xx46KGHvOT4zvKd\nF0s3zRk+L5ZuPrOqqsxGjUpnt1ieT995ac3ynZfWLN95sXTTnOHzYummOcPnxdJNc4bPi6VbmueM\nQWNjo5HsNzDcOltT6uyEVb4BFgC7ZP/8fpsFtq8C7xWbV+BjtnawmDYP+F2b+w54HZhcxuN4XWCr\nrq72kuM7y3deLN00Z/i8WLqVm7Vihdns2Wbf/W7yW37YsPR0q1SW77xYumnO8HmxdNOc4fNi6aY5\nw+fF0k1zhs+LpVua54xBMQtsRW9y4JxrAb5hZq86594Hts/+eXvgQTPboKjA1T/OuiSLdg6YD/wE\nuA9YZmavO+eOILli7UTgcZJdRQ8DtjazJSU+ptdNDlpaWujbt2/ZOb6zfOfF0k1zhs+LpVupWf/3\nf3DVVXDllfD667D99jBxInz/+y2st17PmbMr8mLppjnD58XSTXOGz4ulm+YMnxdLN80ZPi+Wbmme\nMwbFbHJQygLbg8DNZjY9u8D2TTN7zTk3HdjSzPYrtXi7x9mLZEGtfcFrzWx89pyTgJ8CA4GngJPN\n7IkyHlO7iIpIt7JiBdx1F8ycCbNnw+c/D0ceCRMmwI47gnOhG4qIiIiIiHRPld5F9BfAHOfcttnv\nPzX7512BvUrI65CZPUAnmzCY2Qxghq/HFBHpLhYuTK5Wq6tLrlwbPhxmzEgW19ZfP3Q7ERERERGR\nuBS9wGZmDc65YcBZwLPAGJK3cFaZ2bOe+4mISNaKFclVan/6U7I7aN++cNRRydVqyX9UERERERER\nkRDWeIXY6pjZv8zsBDMbaWbbmtkxWlxbVfstYtOS5Tsvlm6aM3xeLN3aZ/3733DOObDppnDQQfDW\nW/DHP8Ibb8AVV3S+uNZd5kxTXizdNGf4vFi6ac7webF005zh82LppjnD58XSLc1zSr5S3iKKc24o\nMA7YAjjNzN52zu0PLDSzf/os2J0NGTIklVm+82LppjnD58XSbciQIXzyCdxxR/LZanffDeutB0cf\nDSeckLwdNGS3NGb5zoulm+YMnxdLN80ZPi+WbpozfF4s
3TRn+LxYuqV5TslXyiYHewFzgIeBPYFt\nsruIngXsaGaH+a/ZNbTJgYiE8u67yeeqvf46PPxw8vlqb74JI0cmbwEdOzZZZBMREREREZGuUelN\nDn4D/NLMLsnuIppzL/DjEvJERHq05cuThbPcLbeQ1vb2fpvfpuuvD8cck1ytNmxYuN4iIiIiIiJS\nmFIW2LYDjurg+NvAgPLqiIh0LytWJJ+FtnBhxwtnCxfCf/6T/z1f+hJsskly23vvz/48ZEjyv4MH\nw9olvYFfREREREREQihlk4N3gMEdHN8BWFRenZ6lqakplVm+82LppjnD54Xo9sEH8Pzzya6dV1wB\nv/hFcnXZHnv8/+ydeZgcVdm+75cQtkT2JWEnAVR2EkBGZFFZP8yAfiooyhJRUMJuUFFJAipEILK4\nEQmbH0RcIKIiIIhI+AWUCSCIQRBCSAghI3vCmry/P04109Ppmenqqcmp5Dz3ddU13dXVzzxPL1XV\nb50lFMRWXjlMPrDnnjM44gg491y46y54++3QvfPUU+Hqq+HPf4bHHw+t2ebNg/vvhxtvhEsugdGj\n4fDD4YMfDAW2J55I4z3QZ2350ipar6xaReul4k054+ul4k054+ul4k054+ul4q3MOUUN7p5rAS4A\n7gYGAa8AWwJ7AP8BxuTVK9MCDAO8ra3Ni2DEiBGF6BStVbReKt6UM75eX3h77jn3++5z/9Wv3C+4\nwP3EE90POcR9p53c117bHTqWfv3cN93Ufc893Y84wv3MM90vu8z9j39032efEf7KK8X5KpKyvgep\n5Cxar6xaReuVVatovVS8KWd8vVS8KWd8vVS8KWd8vVS8lTlnCrS1tTngwDDvoabUzCQHKwE/Ao4G\n+gHvZH+vA45290W9rvpFouhJDmbNmlXYDB1FahWtl4o35Yyvl1dr0SKYOxdmzgzL00/X/p3FW291\n6A0YEFqjbbZZaJ1W+3fDDbvuuqn3IK5W0XqpeFPO+HqpeFPO+HqpeFPO+HqpeFPO+HqpeCtzzhTI\nM8lB7gLbu0802xTYDhgIPODujzclVCI0i6gQyxbvvANz5nQumlXfnjUrbFNhnXVg883DUimkVRfR\n1loLzGIkEUIIIYQQQghRNvp0FlEzG+LuT7r7LGBWkx6FEKIh3noLZsyARx6Bxx7rXEibPTu0Uquw\n/vodBbThw5cspg0cGCOBEEIIIYQQQojlnWbmqXvCzGYDdwF/Ae5y9ycKdSWESI7Fi0PR7JFH4OGH\nw1IpqlVaoQ0a1FE0a2kJRbPK/U03hdVWi2ZfCCGEEEIIIUTCNDOL6CbAN4DXgTOAf5vZbDO71syO\nLdTdMs748eNLqVW0XirelLM4veefD7NqXnwxHHss7L47rL46DB0KhxwCF1wAzz4Le+0VZtm8+254\n4QU45ZTxTJsGkyeH2TqPPx4OPBDe977mimtlfd30WYuvl4o35Yyvl4o35Yyvl4o35Yyvl4o35Yyv\nl4q3MucUncndgs3d5wDXZgtmthXwTeAI4HDg8iINLsssXLiwlFpF66XiTTnzs2ABPP74QiZN6miR\n9vDDocAGsPLKsM02sP328L//G/5utx1stFH9sdD0HsTXK6tW0XqpeFPO+HqpeFPO+HqpeFPO+Hqp\neFPO+HqpeCtzTtGZZmYRXQ34ELBPtuwMzCB0F/2Lu/+2UIdLEU1yIETvefNNuPlmuO46+N3vwn0z\n2HLLjgLa9tuHZejQrmflFEIIIYQQQgghYtKnkxwALwEvElqwnQfc7e4vNqEjhFhOWLQI7rorFNV+\n/Wt4+WXYeWc45xz48IdDKzWNjyaEEEIIIYQQYnmlmQLbzYQWbIcDg4BBZvYXd/93oc6EEKXGHR54\nAK69Fn7xizBm2pAhcOKJ8NnPwvvfH9uhEEIIIYQQQgixdMg9yYG7H+ru6wIHAtOA/YG7zWyOmV1b\ntMFlmfb29lJqFa2X
ijflDDzxBJx9diigDR8O//d/Yfy0adPCY+ec07m4VtacReuVVatovbJqFa2X\nijfljK+XijfljK+XijfljK+XijfljK+Xircy5xSdaWYW0QoPA/cQimx/B9YHDivC1PLCyJEjS6lV\ntF4q3lLO+dxzYcbPD3wAttoKzj8/3L7lFpgzJ8z0ufvu9ScmKGvOovXKqlW0Xlm1itZLxZtyxtdL\nxZtyxtdLxZtyxtdLxZtyxtdLxVuZc4oa3D3XApwG3AS8ALwN3A9MAFqBtfLqlWkBhgHe1tbmRVCU\nTtFaReul4i21nC+/7H7lle777ee+wgru/fu7t7a6X3+9+4IFcb2VUa+sWkXrlVWraL1UvClnfL1U\nvClnfL1UvClnfL1UvClnfL1UvJU5Zwq0tbU54MAw76Gm1Mwson8H7iLMGnq3u79cSKWvBGgWUZE6\nb70VZgC99tqOGUD33huOOCJ0A1177dgOhRBCCCGEEEKIpUNfzyL6v8Bsd19cvdLMDNjE3Wc1oSmE\niMjDD8MVV4Tx1NrbYaedwlhqhx8Om2wS250QQgghhBBCCFFumimwPQUMBp6vWb929li/3poqCjPb\nGPg5YXy4t4HvuPuv47oSohy89BJMnhwKa/ffD+utB0cdBUcfDdttF9udEEIIIYQQQgix7NDMJAd1\nhjEHYCDwRi+89AXvACe7+7bAAcBFZrbq0vrnkyZNKqVW0XqpeFseci5eDHfcEbp8Dh4MJ54Y/t54\nY5is4IIL4L77lv2cMfTKqlW0Xlm1itZLxZtyxtdLxZtyxtdLxZtyxtdLxZtyxtdLxVuZc4rONFxg\nM7MJZjaBMLjb2ZX72XIxcD3wYF8ZbQZ3f87d/5Hdnge0E1raLRWmT++2e240raL1UvG2LOecORPG\njoUhQ2DffaGtDcaNg2eegZtugkMPhf79i/dVtF4q3pQzvl4q3pQzvl4q3pQzvl4q3pQzvl4q3pQz\nvl4q3sqcU3Sm4UkOzOzO7ObewDTgraqH3wJmAhe4++NFGiwKMxsOXOnuO3SzjSY5EMsNr78eWqZd\ncUVotTZwIBx2GIwcCS0tYF21RRVCCCGEEEIIIUTfTHLg7h8GMLMrCd0uX+mVyx4wsz2B0cBwwphv\nh7r7TTXbnAB8FRgEPASc6O5/r6O1NnA18IW+9CxEbNxD67QrroDrroOXX4Y994Qrr4RPfjIU2YQQ\nQgghhBBCCFEsuSc5cPdjAMxsS2Ao8Fd3f93MzBttDtcYAwhdTicBN9Q+aGaHARcCXwL+BpwK3Gpm\nW7t7e9V2KwE3At9z9/sK9CdEaZg/H669NhTWHn4YNtwQTjghTFiw1Vax3QkhhBBCCCGEEMs3uQts\nWWuwXwEfJozHthXwJDDJzF5099OLMObutwC3ZP+zXme2U4HL3P2abJvjgYOBkcD3q7a7GrjD3a8r\nwpcQZeO88+Css8LtQw6B8eNhv/1gxWbmCBZCCCGEEEIIIURumplF9CLgbWBTYGHV+uuBA4sw1RNm\n1p/QdfSOyrqs9dztQEvVdnsAnwIONbMHzGy6mW27NDwCtLa2llKraL1UvJUx5zPPwJgxsOGGrTz7\nLPzqV3DQQb0rrpUxZ19oFa1XVq2i9cqqVbReKt6UM75eKt6UM75eKt6UM75eKt6UM75eKt7KnFN0\nppkC2/7A19x9ds36x4HNem+pIdYF+gHzatbPI4zHBoC73+PuK7r7MHffOfv7z57EW1paGDRoEMOH\nD6e1tZXW1lZaWlqYMmVKp+1uu+22uh/OE044gUmTJjFq1Kh3102fPp3W1lba29s7bTtmzBjGjx/f\nad2sWbNobW1lxowZ764bNWoUl156KaNHj+607cKFC2ltbWXq1Kmd1k+ePJljjjlmCW+HHXYYU6ZM\n6eStpxzV1MsxatSohnMAPeao9tZTjmrq5Rg1alTDOaD796NWu7fvx6hRoxrOAfXfj9
NPn457K2ef\n/XnWXbexHD29H6NGjWr6c1UvR/X72VWORt+PUaNGNf25qpej1ltv3o9Ro0b1+nteybHyyivnylFL\nbY5Ro0b1+nteybHeeus1nKOn96Py+vf2e17JseWWWzaco6f3o+KtqP1u7cQ5vXk/Kt6K2O/Onz+/\nkO95hYq3Iva78+fPL+R7Xu2r0RwVusoxf/78Xn/PKzkq3ora786fP7/hHD29HxVvRex3t91220K+\n55UcFW9F7Hc32WSTUp5fAay++uqlPL+qUMT51YwZMzp5K8P5VXWOam+93e8ee+yxpTy/Avj0pz9d\nyvMrgAMOOKCQ86vRo0d38lam86vx48d38tbb/e6oUaNKeX5V8VbG86uKt6J+1+6zzz4N51ia51d9\n8bu20RwVispR1HliNS0tLey22260trYyfPhwBg0atMR72R0NzyL67hPMXgWGufvj2e0d3f1JM9sF\nuNXd18mp91ngsuyuAwe5+z012yymapIDMxsMzAFaqsdVM7PxwF7u3kITaBZRsSzxzDOw5ZahBduZ\nZ8Z2I4QQQgghhBBCLF/kmUW0mRZsdwNHVt13M1sBOAO4swm93wI7ZstOwP0NPKcdWARsULN+A+C5\nJjwIscxx3nlhVtCai4RCCCGEEEIIIYRYyjQzUtMZwB1Zi7WVCBMKbAusDeyRV8zdFxAmScjznLfN\nrA34KFBp1WbZ/UvyehBiWWP2bLj88tB6bfXVY7sRQgghhBBCCCHSJncLNnd/BNgamEpofTYAuAHY\n2d3/U5QxMxtgZjua2U7ZqiHZ/U2y+xOAL5rZkWb2PuCnwGrAVUV56C21fZDLolW0XireypTzvPNg\nwICO1mvLa86+1Cpar6xaReuVVatovVS8KWd8vVS8KWd8vVS8KWd8vVS8KWd8vVS8lTmn6EwzXURx\n95fd/bvu/ml3/x93/5a7zy3Y2y7AA0AbYWy2C4HpwLjMwy+BrwJnZ9vtABzg7kuO7huJyZMnl1Kr\naL1UvJUl5+zZ8LOfwemnd7ReWx5z9rVW0Xpl1Spar6xaReul4k054+ul4k054+ul4k054+ul4k05\n4+ul4q3MOUVnck9yAGBmawFfAN6frXoUuNLdXyjQ21JHkxyIZYETT4Rrr4WZM9U9VAghhBBCCCGE\n6Cv6dJIDM9sLmAmcBKyVLScBT2WPCSH6iDlzYOLEzq3XhBBCCCGEEEIIEZdmJjn4EXA98GV3XwRg\nZv2AH2ePbV+cPSFENZWx1048MbYTIYQQQgghhBBCVGhmDLYtgQsrxTWA7PaE7DEhRB9Qab122mlq\nvSaEEEIIIYQQQpSJZgps0+kYe62a9wMP9c7O8sUxxxxTSq2i9VLxFjvn+PFdt15bnnIuLa2i9cqq\nVbReWbWK1kvFm3LG10vFm3LG10vFm3LG10vFm3LG10vFW5lzis401EXUzHaounsJcLGZbQncm63b\nHTgB+Hqx9pZt9t9//1JqFa2XireYOSut1771LVhjjd5pFelraeul4k054+ul4k054+ul4k054+ul\n4k054+ul4k054+ul4q3MOUVnGppF1MwWAw5YD5u6u/crwlgMNIuoKCsnnQQ//3mYObRegU0IIYQQ\nQgghhBDFkmcW0UYnOdii166EEE3x7LOh9do3v6nimhBCCCGEEEIIUUYaKrC5+9N9bUQIUZ/x42HV\nVUMrNiGEEEIIIYQQQpSPZiY5EA0yderUUmoVrZeKtxg5n30WLrsMTj21+9Zry3rOGFpF65VVq2i9\nsmoVrZeKN+WMr5eKN+WMr5eKN+WMr5eKN+WMr5eKtzLnFDW4u5ZsAYYB3tbW5kUwYsSIQnSK1ipa\nLxVvMXKedJL7mmu6v/hi77UaJZX3s2i9smoVrVdWraL1UvGmnPH1UvGmnPH1UvGmnPH1UvGmnPH1\nUvFW5pwp0NbW5oQ5CYZ5DzWlhiY5SIWiJzlYuH
Ahq622Wu+NFaxVtF4q3pZ2zrlzYcgQ+MY34Kyz\nlp63VN7PovXKqlW0Xlm1itZLxZtyxtdLxZtyxtdLxZtyxtdLxZtyxtdLxVuZc6ZAnkkOVGCrQrOI\nijJxyilw9dXw1FOw5pqx3QghhBBCCCGEEGmRp8DW1BhsZrammR1rZuea2drZumFmtlEzekKIzsyd\nG8ZeO+UUFdeEEEIIIYQQQoiy09AsotWY2Q7A7cDLwObAz4AXgE8AmwJHFuhPiCQZPx5WXhlOPjm2\nEyGEEEIIIYQQQvREMy3YJgBXuftWwBtV628G9irE1XLC6NGjS6lVtF4q3pZWzkrrtVNPbbz12rKY\nM7ZW0Xpl1Spar6xaReul4k054+ul4k054+ul4k054+ul4k054+ul4q3MOUVnmimw7QpcVmf9HGBQ\n7+wsX2y66aal1CpaLxVvSyvn97+fv/XaspgztlbRemXVKlqvrFpF66XiTTnj66XiTTnj66XiTTnj\n66XiTTnj66Xircw5RWdyT3JgZs8DB7j7A2b2KrCjuz9pZvsBV7j7Jn1hdGmgSQ5EbCozh37tazB2\nbGw3QgghhBBCCCFEuvT1JAc3AWeZWf/svpvZpsB44DdN6AkhMiqt1045JbYTIYQQQgghhBBCNEoz\nBbbTgYHA88CqwF3AE8CrwDeLsyZEWjz3HPz0p5o5VAghhBBCCCGEWNbIXWBz95fdfT/gY8BJwA+B\n/3H3vd19QdEGl2VmzJhRSq2i9VLx1tc5v/99WGml5mYOXZZylkWraL2yahWtV1atovVS8aac8fVS\n8aac8fVS8aac8fVS8aac8fVS8VbmnKIGd9eSLcAwwNva2rwIRowYUYhO0VpF6z3elw4AACAASURB\nVKXirS9zzp3rvsoq7med1Xut3pLK+1m0Xlm1itYrq1bReql4U874eql4U874eql4U874eql4U874\neql4K3POFGhra3PAgWHeU02ppw2WeAJcAoyqs34UcFFevTItRRfYnn766UJ0itYqWi8Vb32Z87TT\n3Fdf3f2FF3qv1VtSeT+L1iurVtF6ZdUqWi8Vb8oZXy8Vb8oZXy8Vb8oZXy8Vb8oZXy8Vb2XOmQJ5\nCmzNzCI6BzjY3R+sWT8MuMndN+5Fg7pCMbM1gNuBfsCKwCXufnk322sWUbHUee65MHPo6NEwblxs\nN0IIIYQQQgghhIB8s4iu2IT+OoQJDWp5BVi3Cb2+5BVgT3d/w8xWBf5pZr9x9xdjGxOiwvnnQ//+\nmjlUCCGEEEIIIYRYVmlmFtEngIPqrD8IeLJ3doola9H3RnZ31eyvxfIjRC3z5sFPfhImNlhrrdhu\nhBBCCCGEEEII0QzNFNgmAN83s3Fmtne2nA2cB/ygWHu9x8zWMLMHgVnA+e7+wtL63+PHjy+lVtF6\nqXjri5xFtV4re84yahWtV1atovXKqlW0XirelDO+XirelDO+XirelDO+XirelDO+XireypxTdCZ3\ngc3drwBOB74A3JktnwO+7O4/K8qYme1pZjeZ2RwzW2xmrXW2OcHMnjKz183sXjPbtY7fl919J2AL\n4AgzW68ojz2xcOHCUmoVrZeKt6JzPv/8Qn7849B6be21e6dV5pzyFleraL2yahWtl4o35Yyvl4o3\n5Yyvl4o35Yyvl4o35Yyvl4q3MucUnck9yUGnJ4di1evu/lpxlt7VPhD4INAG3AB83N1vqnr8MOBq\n4EvA34BTgU8BW7t7exeaPwLucPcbunhckxyIpcZXvwoTJ8LMmb0vsAkhhBBCCCGEEKJY8kxy0EwX\n0Xdx9/l9UVzLtG9x97Pc/bfUHzftVOAyd7/G3WcAxwMLgZGVDcxsfTMbmN1eA9gLeKwv/AqRh3nz\nKKz1mhBCCCGEEEIIIeKSu8BmZhuY2c/N7Fkze8fMFlUvfWGyjof+wHDgjso6D03xbgdaqjbdDLjb\nzB4A7gIudv
d/Lg2PQnTHBRfAiivCqafGdiKEEEIIIYQQQoje0kwLtquAYcA5wCeBT9QsS4N1gX7A\nvJr184BBlTvu/nd33zlbdnL3yxsRb2lpYdCgQQwfPpzW1lZaW1tpaWlhypQpnba77bbbaG1dYmg4\nTjjhBCZNmkR7e0dP1enTp9Pa2tppHcCYMWOWGGRw1qxZtLa2MmPGjHfXtbe3c+mllzJ69OhO2y5c\nuJDW1lamTp3aaf3kyZM55phjlvB22GGHMWXKlE4+espRTb0c7e3tDecAesxRrd1Tjmrq5Whvb284\nB3T/fkybNg2ARYvgpZdg3LhLOfro0UydCn/4A0yeDJdcspBttmnlyCOnMmoUHHkkHHIIbLPNZNZZ\n5xiGDoV114ULL4QvfrGdL3+5sRzQ/fvx2GOdG2b25v1ob29v+nNVL0fta9yb96O9vb3pz1W9HLX/\nr9HPVb0c7e3tvf6eV3KMGjUqV45aanO0t7f3+nteyXHWWWc1nKOn96Oi39vveSXHRRdd1HCOnt6P\nynOK2u9ec801Defo6f2o/C1iv3vggQcW8j2vUPFWxH73wAMPLOR7Xu2r0RwVuspx4IEH9vp7XslR\n0Slqv3vggQc2nKOn96PyWBH73YkTJxbyPa/kqHgrYr977rnnlvL8CmD06NGlPL8CGDlyZCHnVzNm\nzOi0fW/3u+3t7YV8zys5qr31dr9b2bZs51cADz74YCnPrwDuuOOOQs6vRo8e3UmjTOdX48eP77S+\nt/vd9vb2Up5fVbyV8fyq4q2o37W///3vG86xNM+vqn/XNpKjyPPECkXlKOo8sZqWlhZ22203Wltb\nGT58OIMGDWKfffZZYrsucfdcC/AqsFPe53Wj99lM81XgFWCPOtssBlqr7g/O1n2gZrvxwLReeBkG\neFtbmxfBiBEjCtEpWqtoveXZ2+OPu//wh+4jRrivssoIHzjQHbpeVljBfc013TfbzH377d0/9CH3\ngw92/8xn3I8/3v2MM9y/+133yy93P/jg8uTsK62i9VLxppzx9VLxppzx9VLxppzx9VLxppzx9VLx\nppzx9VLxVuacKdDW1uaAA8O8p5pSTxss8QR4FNg57/O60RsADKlaVq6zTW2BrT/wdvW6bP1VwI29\n8FJoga0onaK1itZbnry9/LL7lCnuX/6y+5Ah4Ruy4orue+3lPnJkm194ofvPfub+y1+633qr+7Rp\n7o8+6j57tvurr7ovXtx33pZFraL1UvGmnPH1UvGmnPH1UvGmnPH1UvGmnPH1UvGmnPH1UvFW5pwp\nkKfAlnsWUTPbHzgdOM7dZ+Z6cpOY2WLgUO88i+i9wH3ufnJ234BZwCXufn6T/0eziCbG4sUwfTrc\nemtYpk2Dd96BoUPhgAPC8uEPw3veE9upEEIIIYQQQgghliZ5ZhFdsQn964HVgP+Y2UJCS7J3cfdC\n5kQ0swHAlnTMIDrEzHYEXnD3Z4AJwFVm1gb8jTCr6GqEVmxCdMncuXDbbaGg9qc/QXt7KKB95CNw\nySWw//6hwCaEEEIIIYQQQgjRCM0U2E4p3EV9dgHuJDTFc+DCbP3VwEh3/6WZrQucDWwAPAgc4O7z\nl5I/sYzwxhswdWpHK7WHHwYzGD4cvvSl0EqtpQX694/tVAghhBBCCCGEEMsiuWcRdferu1uKMubu\nd7n7Cu7er2YZWbXNj919c3df1d1b3P3+ov5/EdTOjFEWraL1yuJtwQL497/hz3+Ga66B734Xtt9+\nEmuvDfvtB9deC8OGwXXXwbx58Pe/h2322qux4lpZci5LWkXrpeJNOePrpeJNOePrpeJNOePrpeJN\nOePrpeJNOePrpeKtzDlFZ3IX2ADMbKiZfcfMJpvZ+tm6g8xs22LtLdtMn95t99xoWkXr9bU3d3jp\nJXjkEbjlFrj8chg7Fr74RTjoINh+e1hrLRg4EN77XvjoR+Goo2DCBHjppemc
fTY89BDMmQNXXQWf\n+Qyst17vffWWsr4HqeQsWq+sWkXrlVWraL1UvClnfL1UvClnfL1UvClnfL1UvClnfL1UvJU5p+hM\nM5Mc7A38EbgH2At4v7s/aWZfB3Zx908Wb3PpoEkO4rJgAUyeDE89BbNnd14WLuzYzgwGDYKNN4aN\nNgp/K0vl/kYbwaqrxssihBBCCCGEEEKIZZu+nuTgPOBb7j7BzF6tWv9nYFQTeiJx3OH66+GrX4Xn\nnutcLNtppyWLZ4MHa7w0IYQQQgghhBBClIdmCmzbA5+ts/55YN3e2RGp8eCDcNJJcPfd8PGPw4UX\nwhZbxHYlhBBCCCGEEEII0TjNjMH2EjC4zvqdgTm9syNS4b//ha98Jczk2d4Of/oT3HCDimtCCCGE\nEEIIIYRY9mimwPYLYLyZDQIcWMHM9gAuAK4p0tyyTmtraym1itbLo/XOO/CjH8FWW4XZPCdMCBMQ\n7LtvfG9LU6tovbJqFa2XijfljK+XijfljK+XijfljK+XijfljK+XijfljK+Xircy5xSd6Td27Nhc\nTxg3btxfgA8QimmrAF8GvgDcDJw2duzYfLMmlIhx48YNBo477rjjGDy4XiO9fKyzzjoMHTq098YK\n1ipar1Gtv/wFDj0Urr4ajjgCpkwJM3726xff29LWKlqvrFpF66XiTTnj66XiTTnj66XiTTnj66Xi\nTTnj66XiTTnj66Xircw5U2Du3LlMnDgRYOLYsWPndrdt7llE332i2SaE8dgGAg+4++NNCZUIzSLa\nN8yaBaNHwy9/CbvvDpdeCrvsEtuVEEIIIYQQQgghRNf02SyiZtYfmAF8zN3/BTzTtEux3PP663DB\nBXDuubDGGnDNNaHl2grNdEwWQgghhBBCCCGEKCm5Cmzu/raZrdJXZsTygXvo/nnaaTBnDpx6Knzr\nW/Ce98R2JoQQQgghhBBCCFE8zbQl+hHwNTPLVZxLkSlTppRSq2i9aq1HH4X994dPfAK22QYeeQTG\nj89XXCvr67asvAdl0ipaLxVvyhlfLxVvyhlfLxVvyhlfLxVvyhlfLxVvyhlfLxVvZc4pOtNMgW1X\n4BPALDO71cxuqF4K9rdMM3ny5FJqFa03efJkXnoptFTbYQeYORN+/3v4wx9g663jeyujVtF6ZdUq\nWi8Vb8oZXy8Vb8oZXy8Vb8oZXy8Vb8oZXy8Vb8oZXy8Vb2XOKTqTe5IDM7uyu8fd/ZheOYqIJjnI\nz9tvh7HVvvGNMObat78NJ58MK68c25kQQgghhBBCCCFE8/TZJAewbBfQRM+8/Ta0t8P8+WF5/vn6\ntyv3X3wxPO/zn4fzzoMNN4zrXwghhBBCCCGEEGJp09Q4atn4a/sAQ4Hr3P1VM9sQeMXdXyvQnyiQ\nl16Chx8O46LNmVO/cFYpmFWzyiqw3nphWX992GIL2G23jvs77QRq8CeEEEIIIYQQQohUyV1gM7PN\ngFuATYGVgT8BrwJfy+4fX6RBkZ+33oIZM0Ix7eGH4R//CH9nzw6P9+8Pgwd3FMiGDoXdd++4Xymm\nVe4PGABmcTMJIYQQQgghhBBClJVmJjm4GLgfWAt4vWr9jcBHizC1vHDMMcX1pq2n5Q6zZoUJBc49\nFz77Wdh++1AQ23FH+NznYPJk6NcvdOG87rpQaFuwAD7ykWO4/364+Wa4+mq44AL42tfgmGPgYx+D\nD3wAhgyBgQN7Lq4VmbNovbJqFa1XVq2i9VLxppzx9VLxppzx9VLxppzx9VLxppzx9VLxppzx9VLx\nVuacojPNdBHdE/igu79lnSsvM4GNijC1vLD//vsXprXHHvszdWrnFmmPPAIvvxweX2ONUFzbc0/4\nylfCbJ7bbRfW97W3IrWK1iurVtF6ZdUqWi8Vb8oZXy8Vb8oZXy8Vb8oZXy8Vb8oZXy8Vb8oZXy8V\nb2XOKTrTzCyiLwJ7uPujZvYqsKO7P2lm
HwJ+4+4b9IXRpUGsWURffz1033zmmbBU367cr4yNtuKK\n8L73hWLaDjuEv9tvD5tsom6cQgghhBBCCCGEEEXRp7OIArcBpwBfyu67mQ0ExgE3N6G3XPPmm2FC\nge6KZ+3tnZ+z7rqhYLbxxqFF2iabwGabwbbbhuLaSivFySKEEEIIIYQQQgghlqSZAtvpwK1m9iiw\nCnAdsBXQDnymQG/LNGPGwGWXwbx5ndevtVZH8ewDH4BPfrLjfuXvqqvG8SyEEEIIIYQQQggh8pN7\nkgN3nw3sCHwX+AHwAPB1YGd3f75Ye8smt98OZ58Nu+46lSuugNtug3/9C159FV54AR56CP7wB/jp\nT+Gb34Qjj4SPfAS22qrr4trUqVML9VikXirelDO+XirelDO+XirelDO+XirelDO+XirelDO+Xire\nlDO+XireypxT1ODuPS7AdGCt7PZZwGqNPK8sC7AqYRKG7/ew3TDA29ravFlefdV9883d99nH/WMf\nG9G0Ti0jRhSnVbReKt6UM75eKt6UM75eKt6UM75eKt6UM75eKt6UM75eKt6UM75eKt7KnDMF2tra\nHHBgmPdQe2pokgMzex3Yyt1nm9kiYLAvQ63VzOw7wFDgGXc/o5vtej3JwYknwhVXhJk+Bw9eyGqr\nrdac6RoWLixOq2i9VLwpZ3y9VLwpZ3y9VLwpZ3y9VLwpZ3y9VLwpZ3y9VLwpZ3y9VLyVOWcK5Jnk\noNEC2zTgNWAqMAa4ILu/BO5+dl7DfYmZbQmcC/wO2K4vC2x//SvsvTdcdBGcfHLTloUQQgghhBBC\nCCFEZPpiFtGjCbOEfozQNO4g4J062zlQqgIboRj4VWCPvvwnCxfCyJGwxx6hFZsQQgghhBBCCCGE\nSIOGJjlw98fc/XB33xUw4KPuvnOdpbl+lXUwsz3N7CYzm2Nmi82stc42J5jZU2b2upnda2a71jze\nCjzm7k9UVhXlr5ZvfxvmzAndQ1fIPXWEEEIIIYQQQgghhFhWaagUZGbTzWyt7O44uugeWjADgAeB\nrxBaxtV6Ogy4kNBldWfgIeBWM1u3arPdgcPN7ElCS7ZjzexbRRudNg1+8IMwc+jWW3esHz16dGH/\no0itovVS8aac8fVS8aac8fVS8aac8fVS8aac8fVS8aac8fVS8aac8fVS8VbmnKIzjba1ej+h4AVh\nFtGBfWOnA3e/xd3PcvffUr/l2anAZe5+jbvPAI4HFgIjqzTOdPfN3H0IoZvoz9z9O0X6fOON0DV0\n113htNM6P7bpppsW9n+K1CpaLxVvyhlfLxVvyhlfLxVvyhlfLxVvyhlfLxVvyhlfLxVvyhlfLxVv\nZc4pOrNMTHJgZouBQ939pux+f0Ix7X8r67L1VwFruPvH62gcBWxb9CQH3/gGTJgA06fDttvmSSWE\nEEIIIYQQQgghykqeSQ4abcF2NPBfOk9y8PE6y6HNWc7NukA/YF7N+nnAoHpPcPeruyuuVdPS0sKg\nQYMYPnw4ra2ttLa20tLSwpQpUzpt98Mf3sZ557Vy1lmdi2snnHACkyZN6rTt9OnTaW1tpb29vdP6\nMWPGMH78+E7rZs2aRWtrKzNmzOi0/tJLL12iOefChQtpbW1l6tSpndZPnjyZY445Zolshx122BI5\nbrvtNlpblxjiTjmUQzmUQzmUQzmUQzmUQzmUQzmUQzmUI4kcLS0t7LbbbrS2tjJ8+HAGDRrEPvvs\ns8R2XdFQC7ZOTwityQa5+/O5nti13meBy7K7Dhzk7vfU+Z/VLdgGA3OAFne/r2q78cBe7t7SpJeG\nW7C99RYMHw79+8N994W/QgghhBBCCCGEEGL5oC9asL2Lu69QVHEt47fAjtmyE3B/A89pBxYBG9Ss\n3wB4rkBvXfLd78KMGXDllV0X12orqr2hSK2i9VLxppzx9VLxppzx9VLxppzx9VLxppzx9VLxppzx\n9VLx
ppzx9VLxVuacogZ373EBWoH+Vbe7XBrRy7sAi2u1gXuBi6vuG/AMMLoX/2cY4G1tbd4dDz7o\nvuKK7med1e1mPmLEiO43yEGRWkXrpeJNOePrpeJNOePrpeJNOePrpeJNOePrpeJNOePrpeJNOePr\npeKtzDlToK2tzQm9LYd5TzWlnjbwjgLX+lW3u1oWNaLX4P8cQEertsXAKdn9TbLHP02Y6OBI4H2E\nbqb/Bdbrxf/sscD21lvuO+/svt127m++2f0b8fTTT3e/QQ6K1CpaLxVvyhlfLxVvyhlfLxVvyhlf\nLxVvyhlfLxVvyhlfLxVvyhlfLxVvZc6ZAnkKbLnHYFtamNnewJ2EINVc7e4js22+ApxB6Br6IHCi\nuzfSxbSr/9njGGzf+x6cdRbcey/sskuz/0kIIYQQQgghhBBClJk8Y7CtuHQs5cfd76KHMeLc/cfA\nj5eOI/jnP2HcOBg9WsU1IYQQQgghhBBCCBHIVWAzsxWAo4FPAJsTWpc9Bfwa+LmXtTlcAbzzDowc\nCUOGwJgxsd0IIYQQQgghhBBCiLLQ8CyiZmbATcDlwEbAw8A/gc2Aq4Ab+8BfabjoIvj73+GKK2CV\nVRp7zvjx4wv7/0VqFa2XijfljK+XijfljK+XijfljK+XijfljK+XijfljK+XijfljK+Xircy5xSd\nydOC7WhgL+Cj7n5n9QNm9hFgipkd6e7XFOivFPz73/Dtb8Mpp0BLS+PPW7hwYWEeitQqWi8Vb8oZ\nXy8Vb8oZXy8Vb8oZXy8Vb8oZXy8Vb8oZXy8Vb8oZXy8Vb2XOKTrT8CQHZnYb8Gd3P6+Lx88E9nb3\nAwr0t1SpN8nB4sWw114wbx489BCstlpcj0IIIYQQQgghhBCi78kzyUHDXUSBHYBbunn8j8COOfSW\nCX74Q7jnHpg0ScU1IYQQQgghhBBCCLEkeQpsawPzunl8HrBW7+yUi//8B77xDTjhhNCKTQghhBBC\nCCGEEEKIWvIU2PoB73Tz+CJyzkpaZhYvhi9+EdZfH86r2ym2Z9rb2wvzU6RW0XqpeFPO+HqpeFPO\n+HqpeFPO+HqpeFPO+HqpeFPO+HqpeFPO+HqpeCtzTtGZPAU2A64ysxvqLcAVfeQxChMnwp13wuWX\nw8CBzWmMHDmyMD9FahWtl4o35Yyvl4o35Yyvl4o35Yyvl4o35Yyvl4o35Yyvl4o35Yyvl4q3MucU\nnek3duzYhjYcN27c5sAC4M0ulgXAQ2PHjv1tH/hcKowbN24wcFxr63Ecd9xgjjoKTj65eb33vve9\nDB48uBBvRWoVrZeKN+WMr5eKN+WMr5eKN+WMr5eKN+WMr5eKN+WMr5eKN+WMr5eKtzLnTIG5c+cy\nceJEgIljx46d2922Dc8imgKVWUR3372N2bOH8cgjsMYasV0JIYQQQgghhBBCiKVNnllEl5sx04rk\n3nvh5ptVXBNCCCGEEEIIIYQQPZNnDLZk+NjH4KCDYrsQQgghhBBCCCGEEMsCKrDV4fTTi9GZNGlS\nMUIFaxWtl4o35Yyvl4o35Yyvl4o35Yyvl4o35Yyvl4o35Yyvl4o35Yyvl4q3MucUnVGBrQ6rr16M\nzvTp3XbPjaZVtF4q3pQzvl4q3pQzvl4q3pQzvl4q3pQzvl4q3pQzvl4q3pQzvl4q3sqcU3RGkxxU\nUZnkoK2tjWHDhsW2I4QQQgghhBBCCCEikWeSA7VgE0IIIYQQQgghhBCiF6jAJoQQQgghhBBCCCFE\nL1CBTQghhBBCCCGEEEKIXqACWx/S2tpaSq2i9VLxppzx9VLxppzx9VLxppzx9VLxppzx9VLxppzx\n9VLxppzx9VLxVuacojP9xo4dG9tDaRg3btxg4LjjjjuOwYMH91pvnXXWYejQob03VrBW0XqpeFPO\n+HqpeFPO+HqpeFPO+HqpeFPO+HqpeFPO+HqpeFPO+HqpeCtzzhSYO3
cuEydOBJg4duzYud1tq1lE\nq9AsokIIIYQQQgghhBACNIuoEEIIIYQQQgghhBBLjRVjG+hrzGwm8BLgwAvu/tG4joQQQgghhBBC\nCCHE8kQKLdgWAy3uvvPSLq5NmTKllFpF66XiTTnj66XiTTnj66XiTTnj66XiTTnj66XiTTnj66Xi\nTTnj66Xircw5RQ3uvlwvwFPAgAa3HQZ4W1ubF8Huu+9eiE7RWkXrpeJNOePrpeJNOePrpeJNOePr\npeJNOePrpeJNOePrpeJNOePrpeKtzDlToK2tzQk9Iod5DzWlFFqwOfBXM7vPzD67NP/xeuutV0qt\novVS8aac8fVS8aac8fVS8aac8fVS8aac8fVS8aac8fVS8aac8fVS8VbmnKIzpS2wmdmeZnaTmc0x\ns8Vm1lpnmxPM7Ckze93M7jWzXetI7eHuw4FDgDPNbLs+Ny+EEEIIIYQQQgghkqG0BTZgAPAg8BVC\nK7ROmNlhwIXAGGBn4CHgVjNbt3o7d5+b/X0OuJnQDVQIIYQQQgghhBBCiEIobYHN3W9x97Pc/beA\n1dnkVOAyd7/G3WcAxwMLgZGVDcxsNTMbmN0eCHwE+GffuxdCCCGEEEIIIYQQqbBibAPNYGb9geHA\n9yrr3N3N7HagpWrTDYAbzcyBfsBEd2/rRnoVgH/961+F+Pzb3/7G9OnTS6dVtF4q3pQzvl4q3pQz\nvl4q3pQzvl4q3pQzvl4q3pQzvl4q3pQzvl4q3sqcMwWq6kOr9LStuS/R+7J0mNli4FB3vym7PxiY\nA7S4+31V240H9nL3lvpKPf6fzwLXFmBZCCGEEEIIIYQQQiwfHOHu13W3QfQWbFlR67LsrgMHufs9\nkezcChwBzATeiORBCCGEEEIIIYQQQsRnFWBzQr2oW6IX2IDfAvdW3Z/TwHPagUWELqDVbAA816wR\nd/8v0G1FUgghhBBCCCGEEEIkw/9rZKPokxy4+wJ3f7JqebOB57wNtAEfrawzM8vuNxRcCCGEEEII\nIYQQQogiKEMLtrqY2QBgSzpmEB1iZjsCL7j7M8AE4CozawP+RphVdDXgqgh2hRBCCCGEEEIIIUSi\nlHaSAzPbG7iTMC5bNVe7+8hsm68AZxC6hj4InOju9y9Vo0IIIYQQQgghhBAiaUpbYBOit5jZCsA2\nwCx3fyW2HyGEECJVzGxNd38ptg8hhBBCiL4i+hhsZcPM1oztQTSHmV1oZsdkt1cgtID8BzDHzPaK\nag4ws73MbIlu2Wa2YqP+LLCpma1SvEMhOmNmHzGzlqr7x5rZvWZ2hZmtHtObEMsiqezDzexrZnZY\n1f1fAv81sznZcB9iOUDnzAKS2q9tmo35XbvezGzTGJ5qfAw1s++Y2WQzWz9bd5CZbVsCb0MK0umf\nnYNuUYSeEH1B0gW2sp4AmtnYrEBUu34NM5scw1NfkhWY9jWz48zsPdm6Dc1sYE6pTwOPZLdHAFsD\n2wGXAt9rwlc/M/uCmV1nZreb2Z+rl7x6hILf2nXWr5E91pAt4Algkyb+f9eiZkea2cp11q9kZkcW\n+b/yYmb9u3ls3aXppa8wswPN7ENV908wswezz95aObXWzIrN081stpk9W73ktPYDYJ1Mdxvgh4SJ\nZHYgjIOZGzP7/PJ+El4EZjah0SWn7upmdrSZnVP5bJnZjmY2uG+SiBr6ZB9eFN0Vzs1syxxSxwPP\nZM/bD9gPOAj4I3B+bzyWmeyY+d56F9NikB0P9jezz2XH+XeXJrRKec6cCllh4R0z265AzSfNbJ06\n69c0syfzSFHi/RoUVnx6Clivzvq1s8eiYWFopYeBDwCfACq/oXYExsXyVcUTZnZnti9q+hwwm+jw\nfwv0VWrMbHj2mn3OzIb1QkeFyaVI0l1Ezewp4Ah3/3/ZCeAvgcMIhZpN3X3/HFr9gdeBndz9kZ62\n70HrGcKJ6efc/cls3T7ANcBz7r
5bE5ovsuR4dmTr3iAcGK9y9yub9d0MZrYZcAuwKbAysLW7P2lm\nFwMru/vxObTeALZ099lmdhnwhrufnO1MHnT3NXJ6+yFwNPAHYC41r5+70RkZKgAAIABJREFUn5pT\nbzGwgbvPr1m/NXC/uzfUIsjM/gl8wd3vzfP/e9BcBAx29+dr1q8DPO/u/XJofcbd6xaCzex8dx+d\n09tvgE96zc7KzDYA7nD3hk82zexA4DV3n5rdPwH4IvAocIK7v5hDq6vCRvV36rfu/kIDWg8DX3P3\nm81se+DvhALWh4EZ7n5MDl83EQrLVwPzWPJze1kOrdeA7d39KTM7Cxjm7oea2S7A79w9d1HGzNoJ\nE+xcD0xy97/l1ajR6wccCrw/W/VP4CZ3X9Sk3prAJ4GhwPnu/kJ2UjPP3efk1FoR2CfTus7dXzWz\nDYFX3P21Hp5bW3QfRnjdHsvubw0sAtrc/SMN+tkOuB1YSPgh9N5sf/s9YCN3P6qxZH2Dme3QxUOV\n79SsRmYar9Ir7Dua6X2GUDjaAtjT3Z82s5OAp9z9dzl89cU+fChwCh3fg0eBi939Pzl17gb2rX2d\nzey9hP3txg3qvE44nj+THc9XcffjsuPdfe6e68JBplnYcapozGw1wsW8yneoci5zKTDH3c+L4GkE\ncC3hh/YrdD4WuLvXu+DXnV6vzpm7OQ9dgrzeav7PloR97l/d/XUzs9rzh1hYuHDc6QJ6niFMsqLX\nx939oYL8LAYG1flObUDY3y5x4bUbrb7Yr32ejn1uS7bPPYWwz/1tDp29CcX9e4C9gPdn38+vA7u4\n+ycb1OnqPH4z4FF3H9Cop6Ixs2nAr9x9gpm9CuyYZdwNuKHRfXeN5nq1Wase297dH86htRNwDPAZ\nYCV6cQ5oZlcTftf9IO9zG9DeBCCbULHR57Q2uq2739Sg5vrALwjnkJWhFdYkNMg4vKv3pQfNlwl1\niqjF4CRw92QXQkFsk+z2xcBl2e2tgReb0HuSsEPrra+1CCcurxB+/J8PvAV8F1ixSc1TgPnAz4ET\ns+Xn2bozgZ8RfnB8sZfevwdckWP7KZmPlYBXgSHZ+n2Ax3P+76eBfYF+2e0R2fptmnw/24H/KeD9\nvCFbFhGKdTdULb8lXPW6JYfeCOBuYLveeqvSXAysV2d9ZebePFovAQfVWf8DYG4T3v5OOAhXrxsE\n/Av4dU6thyvvKbB99pn/HjANuDKn1p1Z1teAtmx5NVt3L/Ai8AKwTQNarwGbZ7fHVnIRiirP5fT1\nKrBzQZ+LFwknogB/BY7Pbm8OLGxScyXgU9l34W1Cq9PT633+GtDaklBwWgBMz5YFwAxgaBN6OwDP\nA49n3ir7o+8A1+TU2iz7jC4A3qnSuhj4aU6t04CbgLWq1q1F2H+enkPnT4TCrdF5f7sH4QdLM+/n\n/Ow1q13mEfbDdwCfb1BrMWE/2dXyBqFwvEqDekV+R78E/BcYQyhQVl67kcCfc75mhe7DgQOAN4H7\nsvd3Qnb7DWC/nFp/BG6m6lyDULSbSyjYNarzLPDB7PZjwKey2+8lFJibybkYWL/O+g2B1wt6Lf8F\nLGrieRcD9wMfyj5vlc/HIcADDTy/8lnsccnh6d/ARcBqBb02vTpnJhQfK8tpWZ7JwEnZMjlbd2qT\n/tYhXECo7Ecq78EVwIU5tQYA5xBabD9BOL9/d8mptQXheLeAzvuzxXk/a8AXMq21e/letmbLYuDz\nVfdbgY8TWqs/llOz6P3alwnHl2/SeZ97NHBnTq1pwGnZ7epj327A7AaeX9mvLgJ+WnV/QvZduBe4\np8mcdwJ/7mrJofMasEWdjJsTGhw04+054OA6679Kk/tcwoXCTxDOad4inAOeRo5zQOBbhH3mr4Fv\nVO1DTgJOatLTOcDLVd/Plwnnfv0beP7immVRnfuL8nzfCQXIv5Odg2frtsnWTW7ytb+aJvevWnK+
\n1rENRA1f8AkgBR34qvS+l30x3wI+2kutX5L9OK5Zfxzwm+z2icDDvfw/V+c8IPyX0Iqi3gEh1w/4\nbOf4YrazfobQAg7CwfjeJj8fWxfwPl6ZLYsJVyOurFouyw4O6+bQe5HwY2oR4YS3qZPvTOsBQlFi\nEWG8uulVy0OEIu8vc2oeTPgB+6GqdZcCc4D3NfH6rUf40TMhu79h9n39JbBCTq0iC1knAr8BVq9a\ntwbwK+BkYDVCAeTWBrTe/ZEPTAW+1IvvwXRg195+bjOtPxBOgkZn+6HKj6t9gScK0B8MfI3Q2uZN\nQtH5YLLW1Q08/2ZCQWDtqnXrZOv+0ISf24HvZ7er90cfBGbm1Cry4sEcYNs667cDns2h8xKhlW9t\nvs1p/gT8dMLFiMnAqdkyOVv3bcJ+7k1Cy4aetEZk3/UvEArg22e3HyW0lDmCsG+/oEFvRX5H/0lo\nOVL72m0PzM/5mhW2D8/0HgDOq7P+PGB6Tq1VCa08ricUYrcjFEsn5NT5ITCTUNRtBwZm6w9vwlPl\nh9MiwgXB6h9TpwI30kARq8H/9XHgqCae9zSwe53Px5Y0cD5J5+JTt0sOTwsqPgp6bQo7Z86+l6Pq\nrB8FTGnS3zWEHhEb17wHBwD/zKk1Ocs7nnCB+uTqJafWPYRC3WGE/f/e1UtOrQeybG9k70H1OVvD\n3yuWLAhUL29m2h/L6a3o/dqjwKHZ7er3czugPadWr4pPhCLYndnrc0/V/TuBWwnn8ls1+bn9Qc3y\nQ8J54Evku6gxu+r7WZ3x48B/mvR2RvZZ+wnh2LAR4aLZ82THw2YXQq+lUzP9xdnfawitlHt67lPd\nLLkK4JneTwjHueMIF1p3yG7PBX6SU2tfwsW8A4DVs+UAQmGs4QtehALfEufyhKLwS02+5oUWJrV0\nvZRijIiI3ABcZ2aP0/GjDGBnwhWrvIwinEw9a2ZPE05u3sXdG+47bWYnEg7kk4HhwCVm9llvvln4\n/xBOTGu5A7gwu30z4YS8aTx/F6MVCC3OaqmcIOX53982s0cI3U2v944uLgZ8P6cvCK/LyWY2yrM9\nUzN41r3PzGYSfhgu6P4ZPXJKL59fzZTs706Ek4TqbmtvEX4g/SaPoLv/wcy+AtyUdSP5AuEq/ofd\n/d95Dbr7fDPbH5hqYWzZjxFOJo9w98U55d4i/KiGcBC8Jrv9AuEgmIczgAO8qnuHu79sZmOB29z9\nYjM7G7itAa2pwAQzu4dw8KyMc7M14aQpD6OA88zsTEKx+e3qB939rRxaJwITgWMJPyoqTeYPIRSj\neoW7zzWz2wnf2SHALoT35XkzO8bd7+5BYm/Cj9p3u/i5+3+zbh/3NGFpV8JJVS1zCK0m87An4WT3\nLes8JvJMwklqHlan/rgv6wHvyaHzNh3jslSzJaEI0gy7Ame5+4+rV2b7gA+7+6fM7CHC8WxSD1rf\nBE5x91ur1j1sZrOBc9x9NzNbQNg3f7UBb0V+R4cQ9ju1vEH917Q7ityHQ2hh9uk666/I+788dKk7\nGPgL4SLGXoTWm7m69hN+OM0kdEU+wzu6RA8GftzVk7rRgnAsP57wA75C5TjV8HAS3eHuNzb51PUI\nPzprGUAD3SLd/eom/2933ErYp+YZS6s7ijxnPoBwcaWWW2j+PHR/wvd9ds0+93FCi+I8HERoudPM\ncaSWHYHh7v5Yj1v2zJSeN+kZd18B3u32u4u7/7cA2aL3a1sQCoq1vEn4XuXhJcK+56ma9TsTju/d\n4u4fBjCzKwnnQg13621Au+5wM9lxKs+x5RfAeDP7FGGfs4KZ7QFcQMe5bl5v3zezPxEuFv6DMNbc\nfcAO7v5cM5rZECMjCRdbFmT+JhF+940h9Ozpdigkdy96HLHPErpd/rFq3T+yIZsmE1pTNspFhAYt\nU6vW3WpmCwnn0++v/7QlWIGa8/eMt2l+DP0vEL4Lw7OlGgcu
aVJX1JB6ga3IE0Ao6MBnZrcQToqO\ncvdfm9mqhGbI95rZGHdvplj0AqF1QG1/9RHZYxAOWLmKWgVwG+Gg/KXsvmdjVIwjFPwaIhsD7/eE\nK6LXVz/mzY8r9yHCGFgHZWNL1BYqPpFT7/uEHwjAu2M2fJwwbkMjP/Aq/7ewE3F3H5d5mUkoSr5R\nkO512VhW9xCa+O/t7s0UrSt6z2TFursJLSI+32TRs8hC1lrA+oSrrNWsR0ex7iVCC6aeGEXY53wS\n+LJ3jPV1EOEHRx6eJXyX/18Xjzc8TpGHMSD3rbP+xJyeOmFhcorPEcbjeC/wO8I4arcSTijHEE4I\nezqJepP6BaaBhB/eeXmT+oXWrQmf4zwUdvGA0ELnSjM7HaiMV/IBwvABN+TQ+R3wbesYqNzNbCPC\nD9o8OtUcTLgqWsttdFzY+D1hiIOe2JHQEqiWpwktxQAeJByjG6HI7+jMLvztT2h11zB9UEyZT7hI\n8njN+p2oX/TphC05scFiwr7xT4QLLOdUtmn0h6WHgagvqLM+95g5lR9T2biEn/AcY2V2RXZeZe6+\nMLvf1PG4ivsJ34VLK7azv8cSuqf15KfhCzw5ftz/ATjfwgQ1D7PkOUxDYwFVUeQ5838JF2ourFl/\nSPZYMwwgdCWsZW3Cvj0PlS67RfB3wmvW6wJb5ZytCLLz5icJr0+vC2x9sF97irAPq93nHkjOfS4F\nFZ88x3i4BfB/hON9IxeTIDSi+BGhlXc/wnGvH3AdoatjszxBuFhbmVjg+maKa2Z2Gh3nfDcDRwI3\nV10of8rMjibsY5Y2b3bxf58i/7nkUDrGTKvmZUKLyUb5M3CxhXGtnwXIztd+QGgck5s+KEyKrojd\nhE7LkgvhpHbDOusPpokxrLLnfpEwDtBNhB9D3yJcJXibrOsOoavP9Q1orUjoMjad0OLptez2V2mg\nr3qN1saErjePZl6mEVpSzKDOWCs9aLWTdX8q6H24srulCb3b6BjDak1Cc+RnCE3pv9ykx1XoaIK8\nOlVdoZbyZ3ZCF8sz2efs3XUN6nU1Hs0bhINUs11iNyX82H+Iqi5rhAPWJTm1riWcnH48+xxvnN3+\nD/DzbJvDCRNYLM33YhrhhP4owonoAdVLA89fqfp2d0uT/m4knMzMyPYZ9cb+Wx9Y3IDWNYQTvw8Q\nitcG7E74MXlVE94uz/z1JxTBtsg+M9OBi3JqXQ9MzG5XtAYSToyuzKm1GuEH7Bt0jOXxZrZuQA6d\ntQhdWtoJx4OnMp2pZF34mnjNnqFOlylCl4Nnsts70EAXbEJrhatqPoP9s3UPZPcbHi+uyO8ooWXj\nLMKPjNcIxfCvZe/tEc28dplur/fhwFmEfebXCC0n9wS+nq37dgPP72rsu9rxZPKOF/X57LP1LLBZ\ntu4U4JBmX6+iFpY8Hj9HL47HhAtyrxK6Gr1OaMVwW/ZZGd6L96DpMbvovhtg7nHmCn79j872Qb+j\n45z0d4TzwKOb1LyZ0NIVOva5KxBaYuYdr/VzhK7kvR6/jvCD+0+EY/JwOrqg7UBoCZRXb01C4fZc\nsuERCMNcbNSE1nya7NrYTdbvEFr9rJ+tO4g6Qxw0oHUs4cLnYdn36HBCK+fXCK2N8mitRBhr+m06\nht5ZRGiZ1a+H597Q6FLU65j938+TYwiIqudtSui19Onevrdkx1tCl8f3Z+/JK4Tzm7Vyaj1O6JbY\nZRfQ7H06qkG9jYGvEC4Qdvrt0UTOswiFyJWr1q1MKHKOyan1V8K+f4OqdRsQLiDflUNnE8I50VuE\nc5b/ZLenAxv38n1diVDobGpcdy09L8nNIprN9PFHd3+7p1k/PP/VvUJnoOtCf113b6orT3a1ZhTh\nSwXhatql7t5VS5d6GqsSThRaCF3EKleR3k9o6XIPsL/naAmVzbR3OOFkYyBh53Gtu7/eqEamczFh\nhshv5nne0sLC7Il7u/s/
zexYQve7nQk/2M5294aaDZvZAMK4IJ8mdNPohDc4k5qZvUAYY669p9m9\nvIcZvWzJGQ+7kep5xkMza7irsfdN15oeyVpa/oBwFa7SGvgdOgYRXZDNmoS7P1jn+at71hKhp9YL\nnm+WsYWEH3R5r/BWnv/uTH3ZjFndfS5yz9qXzf50uXfT/dNC/54h3sMMiNn+9mpCS9xK64wVCRcS\njnb3l3N6W4MwNsUuhJZxzxK6hk4jTI7RcPduM9uYcEJlwFaEFi5bEYpbe3nNrG0Nag4gHFsgjKnS\nVHfzbEa1HenY397qTZ4MmNnxhDFjbqKjdd2uhPdklLtPNLOvErrydjtTm5l9MNNZTOiOAqHlWj/C\neED3ZrPKDXL38xvw1qvvaB29owhjN1a6m80DxnqOmXkznUL24VV6RihcnU4YoxLCZ/d8woWDbt/b\n7PPQEO5+V4OevgycTSg0fZMw8PmTWQuFozzrcpUHM7uiB28jc2gVcjyu0RxKKGxWf7fGewMz7fXF\ne1AEfTE7XpX2BwiF+Mpr/S/C5/W+PDpVetsRLmBMBz5C2JdsS2ihtUdPx5MarQcI+1ojtGypbf2X\nZ8iX3Qk/3jevlsi0Pc/33cJMy7fT0RqmMhP0dwgzuR7ZqFam9wPgTXf/ep7ndaG1NwXM1FmjeQRh\nn1s57j1LKHj0NNxAV3qbEI4pAwkXbWpb/dZ7TsO9YLyJFm5mVtt63AitQnchFIwLa7WYFzN7k3AM\n/baHVsmV/dz/EcbkzT0zaUG+Pkr4fj8JvI9woXVzwms3vZHfGTV6NwIfJVxwrAzFtCOhENWptZj3\n0HvJwizGNxJ6PlSGVdmEUGA81HP05smO7fsSMgL8y92bHp7FSjjb9fJKigW2d6ekzm53Ra6DXqZd\n6IGvjJjZOMKVxxHu/o+ax3Yk7PCudPexEbxdRGh+PIPwY7Z2DLwzmtRdj6qipDcxNXKms5AwyP8s\nM/slYdDdcdkB/zF3X60HiYrOjwhdV79NuPp2AmFMp+OAr7v7tQ3qHAX8wt3f7KmgFauItSyQ/Ygf\nkt190ju6zfT0vEYKWc2cgN8DfMvdGy161j7/AOAOd38nu90l3nmsrGhkJzTv/kjLcwLThd6HqCr4\nN3tCk108OIzOP7hzXzwogppu9D3+qMipvTf1L978tQmt9xAmM9i6Sus6d296+IJmv6Pd6K1OaPH3\nbJPPL2Qf3oX2ewCaeb2yz+uZhJnA83aZr9V6FDjT3aeY2auEGdafzIogf3H3dZvQrB0frT9hwPM1\nCZMrNTxsQ1HH42UFM1slz4XPqufVnidXCkPV98ONxi/urUgY8+hWd5+X11MP2msQ9kXV+9wfufvc\nnDpjuns8T9Ej+y78i9Blfh41x3p3r9ctviut2wnHpDNqvlcfJOwnN29UK9O7lHAB4nFCK6Xa8+bT\ncmhNA37l7hNqvO1GaN3VdDEmKwwMbObCVNmpU8BbTGhZ+GfP0V09K8Z8knBsWZ+acbry7B+rNPeu\nV9A3sxWAb7r7OTm0ngTuIrQcfrNq/brA39x9SJdPXlLrb4TGMmMqnzXCcAjXAre4+08a1cr0Ci2i\nZu/FflQVxoDbe3EhcxVCIbxXRRsLDVH2IFyQu4XQgvZJMzuEcLFw597oiw6SK7D1Jb058FmBrYm6\n+R8rEAazrrfjbeiHkJk9RjhprjvwvYXxDb7r7lvXezzbpk+uiJpZdwOiu7vv1ahWpjeAUOk/ko7X\naxGhW9qJno3dkkPvH3R0QXsEONDdp5nZcMKMhw0Nom5ms4Aj3f0vZvYKMMzdn8hadnzG3f8nh6d+\nhFYPh9BxpWZcjAJAd/T2s1tVwMpdsOorsqLEPVkhq9vWC3laLJjZxwlXfM+l/rg7uSeaKBoLLWH3\nJHRl6DT+ldcMlt+NRn9CMf1j3mRrvWWFbF/0dcIV1nrfgYZOTLNWO7v3tgCZImZ2G2HWxJ
dr1r+H\nMBP3/jm0CtuH1+hWXwya4U20ds/OXbZ395nNeKjSeZ1QwHq65nxoK+Af7r5qb/Sr/s8KhG6Z//Ec\n49MWdTyu0nv3gknN+nWA53NeJOn2XCXH+Vo/QsH0eEIXpUprhXMIsyLnagVkZvsSWl6eSce4ci2E\nLoFnuvufcmgtJLRwari4tKxiYWKWHYvY75rZy4T9xX9qvlebEQrDq+TU6+5CnHuOlkBm9hph3/FU\njbfNCfujXN6KxMx+A9zrNS2fzewMwkyNn4rjrDiy4slxhGEg6hVyl+b4cUuQnYc/QRifrNWzcdzM\nbANCV9g8+8hXgZ2y78GL/5+98w6Xoyzb+O9JQEpAAogg0gIGEekEBOlVQOkooFKEACJFyifSBAKi\ntFCMCIggKF06SE8ICkjvLYTeSyChl5A83x/3O9k5c2Z3552dzR5yzn1de53d2Z3nvNPe8pT7BlZ1\nZSMvDVwV62juiUicmFTUfwebLwJbu6oB0s/ot5D/IlbsrQ910KtFDsxse7qqTSbbv4Lq+2NVV1pR\noNuXGvF11Uo86RT1BekafQR1wkU7tgWplQHl4S60aG6ErBhENiKabCOiXbj7akV/WxAnIpXCjakp\nEq6KVFaGE6cqAyqXuQClW49y92SCuj75Skn1MAc1VbD3w2cQ101U1AZNlA9HmZefIqW/ryOFn1Ko\nyhmQslfFvVs5sWerx5k4zUI0fw0qyBoJSJzfF9B1gmXEPeuE9s2C+F3yjvGS2MaFCdB1wGyIf+p9\nlIHyCSJaLuRgc5X5tzxhN7O9i/7W3QsrLJnZQYh37O+Z7Tsh3rlji7eSv6F75J9INr5sZOx8lOVb\naRl9iNYOIv8eKUxBEGwNpn4E/shIW1X2ResiTpYsZgztjUGVfXjdYJCZlQkGjUL32gux7cjgeaoj\nKK8Ld59sZici1dMYAaiqxuME2bEpwQzEk2SPztmWfuaL9uGHoFKgAxD/VILH0DwzdoFWlToeaB65\nLPmiJqURxoSlyH/ey9C+LE/tuB539zL3xiiUYVNFYKNKIR68RKl2A7Sk1AlgKs0tNL55RJkuKlk9\nLGf79SjIXAgm1dVGCRAxWVg7oYz2WAGOetgOicAUFojLQ5gT/dXdP20yP3J3H9Hg+26/R/3/CcD9\nZraZu99bspkfUQvOvo5KiB8PnwtnRzdIaHkPeBo4ISZwkLI7AI2jeUHkovPIQ6m2/4YW1a77UBy9\n2sGGyOpvoPvNNmv4LtbBVnrg81CCFxbbTvWp86dTU7lqZYH2Ppq4vFzn+3loopDnQR4cmkdES7ax\nKmwJbOXuo1PbrguR+UuIdLC5FGFvRxOQh1NfjURR9KJ4Di1mX0IZPD9Bk9WNyVeuaYTtgV+5+19h\nyvX4t5kN9ZqyTyyqcgYkaPnebVOUvJLjDBlsv6GkjHoOormD6sHMNkCL0IFokZg+RkfPQSxOQv3u\nruh+HRJsnYuc2jE4FfhtuF+/KNEWUHCjCJw4CfPdqKnUpvE4QdEswtaGwA/d/Y6mv2wMB/YMz3kl\nZfSh/Od8VILZSvAGM9sFOZjGIdL57P0W5WCjgmfUpMCYYNFQzpKgP1owxJaKVtmHQ7XBoOuBY8xs\nSfJLxoo6KU4ETg0ODwNWNLNtEcn10Ij2FMEiRM5nM+Nxmn9vJBGKuqkFqANDQxZPgv5oYf9UTNuQ\nGEka0yMnxVHEOce3B3Z195Fmdnpq+8PUSpdiUJU6HiiQMtzEVZl3nz2Su1cDhPHqH+QvsGP7oq+j\nfnpNasc8MGR8beNxVCHXACeFZ6pVNdergcPM7CfJ7ma2ABpPcitLiiBksCwC/MfdPzEzK1GOVoVS\nZzYAXxVmQfybWUwkf91WDydnPifP5gaI8zIGZyLahrcAzOw14PstZA+/Ry1w0wr2RWP6pzSeHzk1\n1eQiMMSTvYWZ/RG4zcx2RbzesbgLjXFPooDt8PB8bR
G+K4p6CS0DkSDJtWa2lbtfU9SgmS0b2jQz\ncly9i/qkj9G1LjqPrLr/hhbVrvsQAe8BSgudeqE69zwFu6WJVCcM+1WiQIcewgUrPtaPqEBhEynH\nXNbg+8uASyLsPYZSe7PbV0NcSrHtWxb4AyLgvCT9KmErKWHIbv8u8FEL5/BbSNFxpvDZIvffF9g7\nvF8XZf4k6oLd1Pya2PoMEZWmt31KCwo1aDK6Sqv3WspeJfduxubMaIAqreZV5XEipdUdqjzGitr1\nFHJ4RKlFFThvi6Xefye8Xzn2mQ/97fvIwXEjbVT0KnGcnwKDcrYvDHwaaev5vL6oRJv+2+D1n5I2\nHwz9/pJoEjln+hVp60XgtxXfay09o3RXd8wqa34C7BJps7I+PNgYB6yZs30t4O0Sx1uJ8iTi0hub\n2v8VUurNJY4zq1J9ElrUfwD8uaCNs4u8Itr0fHhNRg7T51OvMaFf+l5F9/MawP0Rv/+EmnrrB0g4\nBmBxtNCN/f+VqOM1uM9KqdWmbI5FQZe5y+yfsXUxUuP+Tmrb4mHbhRUca9lnajbkkBiPHEYvoeDX\nbUQoSqfszYmcysn5T+6Rs4HhkbZKK3W2+4UCGIflbD8i5plqYH8P4tXBJxOUVsPnKc9oyTbsgNRb\nZ+rkuW7QvkmZ4/156KPOLvEcLEyYsyMn1ulIGOkyKlw/A/sBd0buMxpl9PZLrikSObgNZRgWtVNp\n/x32bUntuu9V/NUrM9hSKcgOjDSzdFSjP3KM3VDC9P5Ige4tYCb0MCUKdDFRx3akzt+NHDutpqgP\nA+42s7vQBPcpFJX4Dlo0LA6sFGGvsohoiJqdjyYLa6O0/MFo0RddGoCu2zAz294DOXDgjjqcEp7+\nwMVyCVr0eGjbc8BZZjbe3Qulqbv7San3t5jZYijS8ozHR32nQwu7NCYiJ3FZjEcRm6pQ1b2bcBT9\nHWUE5SGmfLLK46wqa2QKzGxh8tPTC5PmoknB8e4+Pvb/N8AX1CLJb6E2PonO5YKRtibQQtS+EULZ\nIx5mJSXwMjWJ+zRWIT7j6XfAkWa2g0dyP6bh1ZfRgzK0f+zV8LrNDvyrAjsJqnhGB6Mx7mnkBE7z\nmn2OyoAn5u1YDxX34aCAQV7G+1vhu5i29Wv+q8K2zgfOt+oIyrMEzAkZ+P5okVYEO6K51YPUL+ss\nDHcfBFO4rLaouK/M4k1qHHtF8AQKVmbnkltRrgx2JxTUeMnMuqnjRdqqnLYBOftO9GqqPzYA1vUU\nv6e7P2Fme6AFaWFU/Ey9B6xnFQnxICf1RGrjcIKL0Ry/cPmku38R+MMFAAAgAElEQVQO7BI4opYg\nQqkzDyZS/BXc/Z3M9oHomGNK/I8CLjcpYI4K29YBtgWq4F+7HvHedpLn7BJ0PG+Z2Qu0oHzbJnTp\nb939PDN7lrgKnmTf51LvP0I8Ze3AtahUMwbLALu56AsmATO4eM4OQJUaRTOkq+6/cffbTcrpB6Js\n2vVREtDKXkDtug/F0SsdbNRSkJdBkbd0Sv/niHsketFW4cBXeeo8SgcdbmbzkJ+iXshmmGCsh2q/\nL6KWXmrI2ba+uz9eb/8c3AucaGbbJZOiQHh5PI253vJwKLC/u48I5I17oGt5JuWclb9G98crZpaW\nbf4UZaDForKJTBqu8seyzlgDzjFJcSeYETjdRMyb/I8Y9aFKnAEpVHLvBpyMUr+/h6JMm6NJ+aHE\nn/8qjzPhHctT7IotbVkQOfqXJyXskPpJjBNxFOonqyg7SPAg4qt8BmVEHBEmzNujjNbC8DaQ9prZ\nzihYMDh8HosykP8WaepM4GSTGEN6Qn8cKtuLwf4oGPFmD5w434sitFU42P6FJnynN/thQbT8jLr7\nsyBRDXefVEWjsvyvSR9uZl8JAZ3YcvFKg0GpdpZSnswinPuWxwKvhi/qNLQIHYSCLee5e8uBkora\nBoBJkb7LJlTKei
Bdy1mb4UjgXDP7Jsqm2MLMvo362h/FtsslxLEUFajjeXtoGy5FJZ3PVmCrH5l+\nNmAiGW63TsDFg3d70x82x/rAD9z9lRBTSjCW+IBX0raXUGZdq1iI/PnKDECUIqm7X2NmmyHqma1Q\n9s4jyIlaWESqAbYiPpiTJHrU+xyLc9G87zxyRA7KwCSUsiP1eUwLi2DkOZpdwjJLE1nyWLHztRHK\ncGgmGZzQNYj8HgpIFEWl/XeCMKfZpez+fSiGXq0iamY7oEluyxPIYG9+d6/HTRZjJ4/7aspC2Uso\nILbJ5jIoewHgaXePmfglNr6FoheLUuN1mxIRjcmKCA6hJVwKRu+gkplHA4fOLe4+b4n2zYzKXNKT\nyfO9hMqmmb2BJjIPW1f1loWRqtosDfZtFxH73wvaLOzICBmii6B76wVadAZUee+a2evApu5+j0m9\nb4i7P21Stj3A3VeNsFXpcVYFM7sSZSDuhu7X1VEZyLHIAT06wtb2qITir+Q7N6Mi+cHmisCsLl6J\nedBk8Pvomd/Ry5FIVwIzOxI5OUfQlRNyT+Akd88jSa5ny4BjgL1RFqGhSf2xwFExC1IzO7zR9+4+\nrKCdm2lM0FxYCTNlc1PEmXks+ffIExG2DkLn/991bMVw4LX8jJrZRsDNLkGNhsqeHkEsbRUqTob9\nlkRZ9zNQ4/ecEgyKCXpZRcqT4ViOpL5gRSk19GA7rZY6xuP4sDCzGRBXz06o7/k3Chre1ELGKiEo\nugn5mcN5wZN6dtKq12ncBezk7oU53cxsNUTuvjQh6AscWabvbgfC/CzvfJXJ2p4ZOenfpsX+w8yu\nQsG4bd39tbDtm6hKYry7bx7ZtipIzxNb66AgUMK3+iQKAkVnsYW56HLuPjYzLx2CuKDnjLBViTMm\nzMdAyRA7IMdEgv7B/nruHpPNWQmsuwiDoUqluUjxGRe0NRkdW2JvIKK86DLnLdpXhjXQD7yrCElL\nMLM/o2v6b3J4TN29KIdtpQjnbp6cMXRu4CV3zxMkKvN/TkaUJhtE7HMTcI67X2BmZ6KEmz8hEYrZ\n3f17EbZa7r/NrDDHoLu/X/S3fWiMXu1gS2BSDc0bDKIiMGHSfDtaMF7qJUsFQgZKXZSJ/LXDZlUI\nC9GWI6Jm9gqwYXCqPQIc7e4Xm1Qob/IK5YfN4glgW5nImJSL0pgLlf5MId8lEGhWGLkphaqcASl7\nld27wam2lLu/YJKr/qm732Fmg5BCWOFyqqqPsyqY2dto8vlQxom4HvBHdx8SYauR0EUpx3yVsAoV\nvYK9txE31oWZ7dsCI9y9sDpVat9Z0ELoE2CsV6cYFg0zyxIST48yFBdDmTx7lrBZpQM82891sVni\nerb0jKYn8VU+C8HW3FnHUIjk31rG+VRVMMjMDkOL2sNQFuYSYZzaGtjH3VcuaOc6VNp/FjnZFB6E\nnSLblquWigjUY9VSE5sLogXk9qiq47vu/mHDnfLtrIOoKJ5D1+AxlIFjKKOicKZHzpg3GXHpVRIM\njoEVVxSMdWAtjIKrS9LVmejBVpmg784o+/VTpEjdJTsopv8ws/nR9fwuXYO/jwGbeITitzUhPY9s\n16+AU1C2XhIEWgllUO3r7qcWtRXsXYc4yH4X5qVLoYqIi4B+7r5VhK1KnDFN+tqJKFiyv7tfW7Rt\nVSFnTEnK1EfHOL6DrR2K/K5oX2lmTwE/8XJVTvVsjgO2jwkgNbG3FRL1yXM0Nw1KV+18NSlQ52E2\nYDmU/LG6u99fxF6wOQQFkW81iaX8g1oQeSd3f7ihgYqRCtg0Rafn9NMSerWDzcwGI96O72e/osTi\nMQyiPwW2QQ6QG5Cz7ZpOLqqmJkI2w2weX+KCSW3ss7IRZDO7ELjb3U8Og+DuaAL3A5QhFsURYmbn\nAHu46vvT2xcC/umRfEZVTWTM7KfArxBZ9Jiw7dtoMXSGi/emDzkws3uBQ939RjO7
GjkoD0JZRlu5\n+yIdbFslEW4zmwAsE5yIzwG/cPfbwoLmsUgnYsMoYJl+zZQVd6tXk+3768ymLope7n5MpL0JqOxg\nbGb7osA97j6wgI1C/BoeV3bdVoTspK+4+29L7NvwmfFQYvllhJn191AWGrIz6sILlI+msh+WRmqy\nufyv7v6TnN3r2ZweOANlRTZyUBa19wzijxmZCQQtBvzP3bMKl/XsfIAEjCpbTJjZGUgUYk+6q6Xe\n7O5Ryt7B5vyIN2lH1O8uVtLBdg9wvbsfnpw3VB50PrqmpxW0Mz2aO/4y2w91AsHpPcTd36nSAW5m\n1yDn6FDEU7kiyrQeDvyfu/+3RFvfQPfCMV5eBT1tz9D9NsVp7eWyxEYjDsdfIofA0shRdB5wirvH\nqNa+go7vz5ntewAHu/s3I9u2BOItfgBxFydOxTmQQEzh/rsNzpjn0Xg8rumPm9vqj7L+6jl3SmfU\n9hSY2Q+BvVDf8UJFNl9D1UBPV2Brb+Bo4BykIv93lGG+AnCquzflKk85X/MyfKOdrybuzDy8j0Rq\nTqtiXO0kzGyN1MeFUGXFOXSt0tgBOKhM4KsP+ejtDrY70AT3GPKjLaUmhmFQXhM527ZEkdbL3X2n\nCBvbocF4ECIffNHM9gGed/erCtrYBE34Jqa8/rnwEun4df7nU8Dgos5JM+uHBCBKl6OY2bzu/pqZ\nfQ2p57wcBtODqEUNjvRMrX4Buw8i+e6fu/v/wrYd0ARulMeXCFQykTGRgm7lmVI6M1seZU62gzy4\n47AKSknM7OfAdO5+TjhfN6Dz/zkqT7y4wiYXRsUR7jvQBPwaM7sYCa4MQ8/YKu6+eNXtj0GYNC+A\nnMu3hdfoqiaE4X/sgRaFURxtIcNromfKuczsBNS37FHARjvKrhtGIFuNOppK9e/2iHKgdiOMo62I\nTPQ4pLIfDkeOhFz+VxdZeIzd95BTvQoH2yfIyfRixsG2OHIy16UyyNi5F2WV3dVqm1I2x6Gxb3Rm\n+1pIKXyugnbSJaKrIiLrvyNHWCnHTDhXy7j7s2Y2HjkXHzdlJV7l7gtF2Hob+H4ZB5uZvYvmUeNC\nOxr1Gx1zKoRruba7PxLu3xXdfYyZrY3UK7OCFkVsvoscMj3KqR8CN98LxzcBzemfNLPvAee6e2H+\nKTP7EN1nz2S2D0aCAoWez8y+syGndboM7VR3fz3STmXOmGCvC1dlavtXgG1iAvkm+oehqN/9PXL0\nLITEOY6MCWKmbM5I9/loSyV2ZvYXpHYa7VQMz/vMKBP3Y7qXSJfJjN4fcazu2epYHNaHw9z9wszY\nciQwh0dk0FfpfG0XrCuVwVNF2tqsz06j5PUcCfzNu1dp/BTY1d3XjLXZh3z0VpGDBMsgWdqotN5m\nCJ3QrcCtZnYaKpHYAU3mmsLMdkfcJScj51OyeJoA7AMUcrChFNp5UBT1yga/iyJRb4SYiULAoejc\nHIAysBI8ho61CN/L42a2h7tfkGrHJDSItoIVgT8Ao81sOCp32RDYz93PbLhnDtz9sZAJsyeSSZ4F\nqcnETmS+Qf6z2x85KTuKqiOF1qSUhIh7193PS72/31SKsxjibCgy+LVr8XIScA21CPdKpCLcEXZA\nSlYzhfeHIXWre4PdbSJtYWarUJ9T5Vex9tx9UDjvayJ+uEORku7LyNG2Y6zNHBRW9LKuJQIODDWz\n9RHfEUgQYwGU5t8UsU69gsg685NMvR2Qs6ZVrEgEka+1iZss2N4e+A01kYmnUTbiPwvu3zYHQ4gE\n53EfjS6yv4eSVJNQRWX8r2h83wz1I62iKuWyXyFl5CPReJ5d7JVZiLaslhoWsNugsr+zEcdWFYu0\nj6iNda+jzIyE+y62tPw8YGckahCLfdH8AjSHagsqcID3p9bOccC8KGPkReKUUtM4F9gazdtahpmt\nQH0OwcKcelRHeg4KzG6OhMDS2BQ5iqPhEmg7
usy+GQwHfm1mLTtjAv6OgqBZ9eFZw3cxlTI/A3Zx\n93+b2RHAhcEZ/giabxVysJkqDY5F89u8gFSra6mfAyfQVa26KNrxvK+KnoENzexxuvfjMZn4CwB3\nhvefoOsI8E8032rqYDOzlYE504kEYc4wDAWnr0SBnU5ScSRUBttRux8mmVkRKoP0NZwTzY9vpGu2\n2Q+QKm4ZrEy+4up9QKyQVx8aoLc72J4gfuLTFCai25+G1xLowWia+ZDCXmgguNLM0hOs+1DHWwie\nUmzxCmXCK8b2yGs+0szSynEPU1xV5hDgDDPbHJW2tKwIBuDuE4HfmNnHSI3uC2CNJJutpM0qJjIj\n0fEOdfcHYEr22mlAWan2KnE4DSKFJeydgkpI1iGnlKSVhoaB7oGIXdKLl32pQKUpoCpZbzyVGu8q\nIV7YzOZFHD55qmh1EfqfPyDC6G5ZvmXh4s07N2TYrYImIj9Dk8sdK/gXMYpe2UyJhGsjKX0cF17f\nraBdpeD5WcuXhgnv1hQLRGBml2Q3IYf9SsQtSq+lFrxptKiLVcDdD00c/0zXEsDTzexr7l7EgdQW\nB4OZ7Qacipz9Z4TNKwE3mdne7l5Y+dSrL8MYCxwWnOF5yuMx2RlVKZdNQBngozLbE1XjMgvRKtRS\nf4kUDp9DJflrWFf1RKBUCfdd6F59EmUjDzeJT2xBzVlfFNMBO5nZuuRfz7rOnfS91Yb7rGUHeAqP\noYyp54G7gQPM7HNUOlZWtbp/sPMDpBCZdQbECE0cjOYvY+jOIRg7DibK2WNRxvaRpoqL7YhUzkbr\nlkPMbE26crCtgu65KTx5sVlZwTGwNQrQ3VQkg9K6UyKsTTXOGKCbAnqC+ejKvVUEiQo9KHN4tvD+\nWuKcFcchh9PuyDG0B/BNJCpVxiGeRffOqCDa8byjfvyKimy9gSpGXkR98EporTeI4sd9OEpguRYg\n9LFnoZLHJ1Hf9BoS5+oUTkRjyyZ0pzIYju6dXKSvoZldhrIZ0+XgfzKzPVHpepmA2stIQfSAzPah\n1Lgm+1ABenuJ6NpoAD2YfMWhqAhrmID/FA10TyHujQs8UkCgQYnGYMQlNlMTEx2DmW0BHOHuWZn5\ner+vqhxlEOpkF0fOyWtKHkLa5vSofHgP1Cmuiggvd47Nygj2Vm/0vbv/p6CduZDTZQNq9+x0KMqx\no2dUdaY2TCWse4dIYbpsZm9gJXf/aaS9ykpJQtR9K+pHpTvCi2WpkqCwYNnLxRO3GOLtG9Chdr2K\nBvhCDpyCNtdG2WtrAkOAZwhlosBtMdkkVqGi15cRVkCBOPP78+h6vhKC5lExfZpVzE2Wsvs8cHi2\n9MdUmn+Ed7D8PWRYnuDup2S2740UiOdrsn87M+uqFoeoQrnsHhSUOoV8kYPbYtoUbC6BxrnSaqkm\nbtWmE9/YTNTwLM4SxqkBaM6QUFTsFzMPtPq8QCDy7LpCNdZGxbgGDvA9ELdp4QVfcIINcPfLTSXq\n16L51TvA1u6edcwWsdnovLnHCU28CfzW3c+JbUeOrcpIz5s862k0fO7NbAHkIFoOOYB3Bm4mOE5R\nhtGGzealVpASITSo0DNl7eGqHIP44e42s9uBa939GJN4ywh3/3pBOy8FO6NNIlLLufszJlqfbd29\nYUZ3AftT1kEFf//V5Dlu9uyXzBquDGb2N+Bldx9movE4HvUjQxCN0s4FbLwObOzu94XPR6PEh1XD\n5x+jMtSOUaFYdVQG9crBvwU8VHTel9l3I+AyNO++O2xeET33W5ZZ2/YhH709gy3J9hmZ2V42wnoo\ncCFyLrRC7Ps8ymjJTsg2QB76QrD2qT/thlQ/P0cErXcnzg40QYpJ266kHMXFPbN28OxfbmZP0nVQ\nLqRQk8F9qOxkTXe/KzhnDgj2z/b48rjReU1PvS90v7mU5zYylZsmWX5PeUW8FxWgqkhhgipLSU5G\nkcZbyVn0
xcDMdsybfJvZdIhw/KAIc5VFuENGx37UL+uMmXjMTP592wpuQU6d4cDmHsmNmEG29L20\noteXDeE67w28WnQfd/95Ff877TSLcaAVwDeolZCkcWf4rina6GCYHanjZZGUIzdDu7Jfqdrx6CKY\nX69FM0sAy4Ys2krgolkYTFe11AuJUEv1akrQ8+w+l3r/EfllOA1hZvu6+0nuvlad72dFJXONMIHm\n91bZOe5ewO4ZB/jVIVvpCCIyKtz9xtT7Z4DFzGwOYLyXjPzXO28lMZmaE7ElJM6A8P4tNJcva6uq\nZ/0EVNL8S1TueCMSYlgN3RunoWva0CkZ64guiGRcXya0K5erMtLmFWhOdDcq3zvPpDq7AHGZQHNQ\ny7B8P3wGuB2ds5bg7rM2/1UXjDezb4T7qt6z30rWcDKnXRNl9V/g7h+YqiLe9zhBmF0J81F3P9XM\n3kGO5qupZYU3w+x0pQlYA43BCe4lvuy6arRMZRDwDir9Hp7Zvmn4Lhrufl0YQ39FbQy9BjjdKxAe\n60MNvd3BVuVgDLBA2YlBBicCp5oINA1Y0cy2RaT9QyPs7Iuy6D4N7+vBKc4/cCAqIXkY8dBsGiII\ne6FI9RnuPj6ijVWVo2DiddoCGI946r5ovEdT3IecpR/BFK6RY83sJhT5i0VWfS3hUToKlblGITjU\neopTLY1X0GL4JeBZYH2UAbECUIYXocpSku2ALSqK0vzJpNq0a3LPh3v3AlTCGuNgO5gaH8UhyEl9\nGiHCHdmuM9AE/kJaL+v8O3J2H9uCjSwOQNxrBwF7mFmSvTa6aNQ2gQdOq6oQMiAaZRYVzoCoEjkZ\nT4bul49RWW1RO0+jLNJ3M9sHoozhRUu2bw3qO3R3jTD1DFrsZctVt0bPQhG0y8FwLfmT3Y3Jd7x1\ngXct3Tsn4v9OVZjZ2SiT9NzM9q8ivrmi/dF9aKFTmYMtla3RjQPVzL6VjfRPTYTsjPOyWQuR+IOZ\nvZPN4Az2B6CFZDMhkqLz2iVjG0cFDvAEplLT+9z9iWSbu79rZjOa2U/yzsFUxkkoM6+SMvMKnRRp\nm63w4K0ObOLu95jZ9Sh4uVNw1GASGssmHxRt19epBT/HeGRVhbeBq9LdD0y9vzhkoq0MjPW4ipfn\nUAbdS6hS6SfAPWgcmFCmbQ2CQg585o1Fb9amRodR9Zo2WVfdgByRM6Asxw+A34bPhQIJ4f4/GPFe\nvgLg7hcBF0U26U10/l82iV0sR1ce2lnJVKN1AFVQGRB+/zdTOXiSbfY9NL/fpWzj3P0VdC360Eb0\n6hLRKmBmSwGPubiTGpZFuvsjEXZ/hqJHCQ/Qa6h0prJSrTIIadZ/cPdzQxnJbYhvZOvEEVXCZhXl\nKLughc8tiMvq7TJtifh/M3hFJJphcXqiuy/f4DcnAr9z94+sKyl7N3gc+W7lMLNj0KTxDyH9/jwU\ncVwAOCk90Slor1EpyTbuXngSGMorNqwiu8nMFkHHNj8i018U8XNcicoTYzlCKkFwxmzmJUqwcmwd\nj5z695DPaVN6kA6Lg2VQBHItJCDymjdR22tzCVQ2kj19aOMSSO3t1zH2qoKpRDKNJFPv7piAhkmN\ndJ7sgsfM5kZCHzOUaNshKEjwIPlq3BtH2NoSuBj140n2SCKy8RN3b8oFY10l6Rui2TNiZuks5YGI\n3+U2apxaK6H79zh3L5LFlti9BfUdl8feow1szoc4X/KEZWK4pyaj8rCzgH08qGqGe+Q1L64Q/mM0\nhzmefAqOwvOhlM3/AutlF9whsDHSm5TpthNmdhUin34bLRrPi61kMLOtUPBua0+pY5vZLGiR+3WU\nUf9ayTbOCmyL+vTli17L1P6PIefQHzLbDw1tLuy0C/fZR4jW4rLU9qj7LMfuEOoLLBWmgDAp3P8b\njetP0AKfWI6TYlEXFcopiG81KtsxZF7tS62ccyxyfhcmKQ/n/xvu/mb4/C
GwVBLkKnMdwth8KhIR\nmULujvr0PTo1J6oSZrYvMMnd/2TiSLwGBWymR6XgsaJUybVotBh/BXGMDfMclWMzOwzRFzQizy8F\nM7sSOdR2RnPuhMZnTeBMdx/caP+MrQ+BJbwFxXiTcODSyMG3GRJ6mjdxQoa18z7uvkLZ/9EqrD6V\nwWfA+l6AyiBl63uoUiEtrPQnd7+7/l5NbQ5EZaF5AdFOBzamGfQ52AAzm5n8wbjpBDC9YEl1knlk\njV5mwhDaNktsBCjHzhLunltqZmabuXsjldH0bz9Bk4OXw+fPEHfU/Y33bB/M7AbUWexTZedg1ZYA\nNvo/i6FIbt16+pBZs7m7T7AKeUamBsxsJQLfSGSksJHNUqUkwVGxAYrUFiopamKvHyo73QNNJHfw\njPz11IaZvQhs4O6Fy8kb2GoUbXN3/34LtpdCEf21UDR9IPBEs0VagclouoGVqCObVMdmcfeWRDU6\nBaspfV6LyuvSC53+iDB3A3ePVu8zs9eAg6vKyjKz5VCJc3pCOdzdYxQsK4GJd60I3N0XiLB7CnIC\nzIYW8ecB13mkCEnK3jqozOY5VPbxGBKWMeCBmDEhPF9rI0Wx55Fjc3wJB1u3hSC1+VHZ+dD1wcYm\n7v5F2PYdJKRwSacc4Kn2zQ78GHHxrkZXLt4XCtoYiqoBfujieRqAnDPzIK6haOeaif91Z2BLFKy9\nHLjM3e+NtNOyAzxlazISKfo9clAfEbaXdrCZ2TYo+/tGlDl/E3KQzQ1c4RHljGb2Z+SIzKWTiLRV\npZPiSNQ/jqCrsuCeKIB5WEE7XYItluH+KulguxhVZeyVadspiDMqSsXcqlek/3ZoW3psGeEtlLEH\n5+nywDNlggbBxnYoa/scFMwErWl2QCJhX0PPyvFZ53bYfxJyllbOv2yhjNPFe5zmyV4IzdcKlzyG\nIMTl3oIYg4k65XLE/fghmnNfkfp+JHCXu0dXBVWJsHZPUxk8icYCqlh7tNCujUM7ZkElzl3EW2Kf\nqT40gLv32hciwr4WLYy7vQraWJCao3LBBq/FI9o1ChiYs/2riIy6zLG+CgzK2b4l8FGEncnAXKnP\nH+TZLdG+Iah8bzsUWY3Z92ZgvjbcH+8D/wJmT237NlL1eqGEvaUyr6WRs2c0cHvV7e/UC0loJ+/n\nR2XAxwOrl7R3NiIIzm4fAJwdaWsmtFj5AGVVPJB+lWjbxohX4fbw9xYUTSuy74PZ/1/vFdmmX6Ay\n1Rk6fS/Uad8VqBxlEorunYJKu+csuP8aqdcOKGvqjyh7Z5Pw/jU08aqqzd8C3u3weRsI7I8cH39D\nC4/ZCu47Obwmpd4nr4molHuTku16F1ikxWPrh0qH70AcKscCM1Vwzv4T+p91gBk7ef3qHPP6aFH1\nXjiPf0VOlFhb96DsBkLftjCaQF+FOLNibE1Gke05w9g0Fi1I56bgvCjYaTQfWrDkOZsp3CMXI0fd\nEsj5cWKnr2dOW+dDGY9PAl9E7ntAuCfWDPfws0TOcZBD7sBw/d5EDpmJRMxF69hdHjmE7w+v8xDX\nXqyd5D5bKfThl4brG3WfZWw+gjKl0s+BhedqWKStD5CTs4p74R3g2+l2hfcLAR9H2nobkelnt28L\njIs8/6cjSpoTUXbNWanPp8deB5SRuGrO9tWIWGek9jsSjeX7o6zaQ9HYNw7Rt8TY2jLc//9LHeOd\nYduWLV7fbuu1yP1vQg7q7PafoMxc0NroqQbX8utV3Ks5tscnfUbm3l0VeDPS1i/Ds35CuF83Sb8i\nbc0G9M/ZPgfwlXacixbP4wzIMf5Gk999Nf2+0atkO55GSQEzd/qcTOuvjjegowcvL+7tyLnzISL1\n/TmKOlY1sBZ6qDL75HaWaDIysWQ7hqFJ2jypbVuHAfHHkW1rNCifSMRkF01C/xvsvhtek8N1qdxp\nFnnOFgmD8Svh3tgjnK/zKbiwzTl3eQ
vcO5GSaseOtaLztSQqBZ0UnqFlkCz3B8hZ+QUqXYy1O6nO\n8/A14hcul6AJakLge3j6FWnrDMRvuD9MUa+8Dk2mu02WcvY/vOirgK3/hfsoeb2HJkb3ZrbfWfLa\nzoccWi07KRC3zWbAHBXYGkn+YuOniNOtqnt7OxTNr8Reif8/JNxXr6Do7eVIUn0cUjFrtn9/xLn6\ncrhP+yevCtp2Aspga8XG70L/cAMqsf6ESOd5HbuHosXLh+FZvR1lzKxH5AQTcZZ0c/oBM1Zw/DOi\nzKeHKOFcCH3sIuH9eOC74f3SRAaD0v1tuGf+hriFdinTtqpfyNH8EAp+vYmyOjrappw2Th/6uEvD\nvfxqCRvHhGvxLDB/5L7XhDHgAuCHyXNOCw62cC9sD8xd0TlK32cLhGv6IOIYKutg+whYKLx/B1gy\nvP8O8HqkrRepaF5GtU6KCcDgnO2LAhMi7IxG2XkNX5Fteyk555ntSwGvlDhvzxLWYpk+bm+UFRpr\n68ic7cOAZyPs/BaVQyefLwn38qsou6vM/fFxnWs6mOCARWh5u2kAACAASURBVLxjuc5YMokPVb5Q\nMOOvqWswCAVvRgJ/j7SVXfukXx0fWyo4VzOgAO99aL69Wdj+C+QofhkpEzeyke4XkzVj9lX6fIU+\ncuFOn6ve8OrVJaImud9NXSSf7wND3P1pM9sEOMCD7G8BOzOgxXqirHmcu19pZr9A6b2TgD+7e0Oi\ncKtxuD1EV+JK0GJoA8QvtlDRY8zYH0GtHGsDNHHezlP8FwVsjKZ5eZZ7wZKUUN45EGWbjAnbvo3I\n1d9399JqS1WgyhLAkEqexmTgbY8kcDWJX+yFrmVeDX2sWmolCOU7X6CFwXZIpOJGamScI1B24koF\n7X0VOa7Go4lGmlevP8oeO8bd541o40fAD9z99qL7NLD1GPAzz/DsmOTHj/USEtottKUw/5NHlDYH\nrobzET+ao0ngc2Z2FoqW/za6sV3tT+8ly+LC/h+jSe3YzPZFUUlKjGITZnZ5dhMi7x6CysIrFVUo\nisA99Qywi9dK4xLnx8LuvvpUbs9xqY/9kRDHA+Tz9B1QwN5Y5Cj5a/i8LiqdnMlzOGdKtHc6JLKy\nBsoKWhuY7O4zRtjILcMxszmBt7w8Z9Q8iLPo54iw+Z6ifWTKxhvAWu7+pJk9ARzo7leb2dLAHTF9\nUbZ0LGzbD2UV9os9TjNbnPzyrqvz9+i2fx7n4jdQ5vq1KEsrsVkJn11ZmNlayLm/JRqXL0f95ygv\nMNnO6X82Qlm+XZSCvQn/l5l9gYSrTkv3jWY2EfWXT9TdubHdj4HvuHtW9b2MrWyJ4szoXK2DOFfL\nlIi+gjhWHzWzR4A/uvuFZrYycIO7z9bERNrWL9A8+RfeIrdVKJ18z913DWV2S6H5zFWI+zKm3HQE\nCrTvl9l+Auov92ilra3AzHZFgYLt3P2NsG0e4FxUGlhUKTKx9xG6314K67UfuvsDZrYw8GDk9fwY\nccw9k9k+GHi46FzBxOP7M3e/08zWQw62rQllrO6+ftE2pWw+jc7PgZntxyBqmG+buAWvcvdv5uw/\nGTnUG/YxXqL8z8TteSOaCw1GzqPBKLi3enY87M0ws2OB3dDYtAqqkvs7ytL9A/Avb6K6buKQfcel\nmL1Go996CZ7lMMZc5O6XxO7bhzj0dhXRAaisC7SInwulTz6KJrpFcSRdH6p/mVnyUO1HgYcq4CHU\nQToqE83iE+RYKQV338vMzkcEzd9EmR9XRdpYs+z/r4M1CPX9qf8xxsz2QpltncYP0eLnfyhCuLOZ\n3eYleFCqmJQGnIVKiy5FpUE9xUu+ArC2uz9iZg8jlc+/eI0kewQ1cvAiSNQAnXy1VKerelARvIyy\n6arA8p4jdOGSH7+ljEEzW54aP8jjXpB3KsZpFonhqGxnUZRdkOBSVPYb7WALTuvfonKBec3s28Fp\nNw
x43uO4vF5GDtysE2do+C4WWRLmyUgF8TCPEF1pA4aQcq4BuPsXwdF1X4whq0bxc+XM58eQA2VI\nZnvRvmkBpJCYtOMWM3NgXoLiWItYGGXYLo0Wtx+g0rsYJMqjWSxB12BYc0NyGm2JnDFrIu6081Fm\nxLOR7QL1q6uicsTrgOFmtiQqvY7pc0GBmy7H4+4nBofFKkWNhEXwFei8O0zhpk3OYVEHSj1VWEN9\nyG5QShW2UpjZq6g86QY09l2TNz40Qbb/KcvnuSri+7rfzJ5Ewgmxan15uAdxbFUxlxmGMksBCE6s\nzcM4UDZg8B8U6H4UZTieYmZrh22xiph7oyqGN01qltnAQcwaYX/gxuD8nhFlFg5GWXbbRrYLNA9d\nn9qz/T3Uh/7DUiJYWSdcFibRqHEubt/ZUBZUK0q8uyM6hZdMKp2Edn0GzGVmu6XaVuT8ValIPxqV\nqmaPb1Xi1hrzUJtb/AhxP94U7pGyxPP/h9aNG6KqA9BYuhhScQcd88UNbBxO9/6jZbj7KyFQsw0a\nO2dBa5DzvSCXWHgG/4wUzN/PfDcbyvbaz91vrLTxUx8/BrYPwa0lUMBxOhTUKDQXcvfbzGyymd2L\nAqgXufsHFbbx38DxIfCVJz5UKPDVh+bo7Q62MYhT6wUUJdwtdJJJnXhRtPxQBQxCE8XnEMFlOmPn\ncxQlL+KoAyBk4mVxORpkLgQ8+U0HH6qXUTlFFv1RSm3HYGZnII6nQ1Dp69yID+xRM9s9NgJgZnsX\n/a27/6nB1z8CNnL3Oxr8phOYA5WE4u4fhuhjWuFwPJLQLoq10PMwCi1G04u+z4EXSzg69weOM7Nf\negtKRgDJ4snM5qKrJP3bHkmaa5K1vwgtthOp94EmQYttPEIV18z2cvcROdsNpfTvGNG0DVHU+Bnt\nPgVjEH9MGRyMFn+HoVLdBE8Cv0acVEWxL3BZmJgmk9sV0eJly9iGxWQSTGW8jxYqWfXb+ZGzqBCs\nieJnUbj7amX2a4DpUAlnGhPJHxsKw8wuQEGcGdDi+zaUYftI0bHZzN6m5uh/Ijj+EvRHXDCF1fsC\n3kT94cXAQe4e5STNwX5o4QNaaM2CsirGhu8KwcymR2Pcj9DzOAXufgvimCyKU5BIwjrh74qI1204\nWlAWxVoRv+0kjkDB1AnNflgPVfU/7n4XcJeZ7YPug53QHKYfsJ6ZvVxy0fYX5LydD/GvdVGO9wiS\n93rZwO4eGzRLY0/kwAJVj0xEAkuXodLwGBQS/iqCKpwUKSyBnEwgByAom2hc+G7Kvy1ga3k059gd\nZdj8h+4OqBhUds4CrkD9x92oAuI8k4LqAohqIgZXA8eGIGZaBfrHwOHp9VKT9dB4NO6+jDIcDw3b\njZIO/rB2XAwFCxYNm69HJYYvhN+cVmf3BBe1K5ssBPbOa8HEPkjMo1tw293fC2utvVCm3JcZSb9I\nyED7DAmPxM6z1kBlpcOBk8zsUuAsd68i6eTM8DdPDKWjQappDb29RPTnwHTufk7odG9AToLPkXR4\no2hB2s7niOj/1fD5E2BFd3+0TU0vBMtX8cqDl0nHr/M/N0X8ZIXUPMPvD0bEtPeFbUPQYHqsF1Q3\nbQes4hLAkFo+FzAzKScK4l9IO1Dc3RduYOcJ5HQppVjULoT7be7EGZSUQrj78+FzKXUwU2nty15N\nqdh4dP6nQ+c9G70pnEJvUncbgcph05L0/wD28oiyklBCsjBy1D8Zti2OSiuecffCUW4zG4cUdc9L\nbTOUxfA9j5dVXyY42NIKUssDN5csORiLiNdvydj8DuKImz3S3nzAr+iq1nS6B6XjMiibSdgumNmf\ngM2RY+LOsHkVlEV4mbvvU9BOpYqfdf7HLGjRNsYzpbsN9pmMFhTprISNkXN9yiLem5TG1bE7DjmN\nRiExmahyr7CgS4jS96drBuzniOMsauIbSotGVtGnVY2QibWut6hC
HPqhJKP5PTQnGhOyGYa7+7JV\ntLcPxWCi3tgZjVcDUf+dF4RtZKMlZdjgxLje3SfWCQBPsekVKY73BJjZnO7+Tng/P8q6ngm4uqJF\ncyttuxhV36zrkSqfUxvWgiJ9Veshk7rsj1DwYlnE+fehScH2gMjMxkpgbVQRDfYXQU6yKXMi4E9F\ns62tibJ9cC7e5BFK3D0R4TrMU28NVMLeAFR6vCO17MuzgHM9lGD3oeeiVzvYsjBxQCyGOBHGRexX\n9UO1A0rb/nf4fBwqOXgClXVWVWpYOczsKcTTVMiJknF4JOVPyftsdHSqygeb2Qxep8TDVNYWm6X0\nU+QI2Nm78s2dCZzh7ucXtLMhKl/4ZU+6F3IWydkF8gxokC3DrTIQLQ7SA/zZ7h6VEh+erbrwCPnw\nEHVbF0XNk2zCVRH3zc3uvnuErffQBPfezPYV0cRjYISt76MysSSrtj8qP1se8TQVLrkzsxuRU+Ko\ndL9mZuchnpzNi9pK2fwEkUe/mONgu8/dB8TarAr1MgkR4XNUJmHF7foKcqb9klrm+USUAXhgvX4q\nx867wApFJ8YFbV6I7pFTTfyQD6Eyocmo5PGKAjb+XuR/xWb4mNnsaGK6JooKfye0bzQSwShc9mtm\n6wC3eapMd1qEmR2MsiiGtnKsYWxfLvQXzwZ7t4bF2qMeyY+YsT0z+bxuUzXoZOKz2dHd37fu/Gld\nEOscbhfCeLAxsFMJB1uWR7YLms1HLMW71sThERX0DbaKcANHVe2EecdWKFPseHd/18yWQ8IErzbe\nG0xl2tegjKexKIPtBkRP42juu1XZQHIILhEzpqf2vTW0YXaUVfcwgcfLC3IoN7BdSYCqgWPyGneP\nLfGvBCHL99fomp6THJuZ7Qt84O6x2cyJ3YEoyzePuqFhwoLl8GZWBTP7Acr+e4jaPHcVRLewsbvf\nXMDGp8ASXqf82FSq/Ki7z1RNqzuDAmsgoNxYEM7RL1CAZB7EKRnVf+fYnNEjOcD7UBx9DrYKUPVD\nZWZjUIbHKBM560gUPfgRUk0s/HCG/ed092tT27ZH/BcDUEr3XkUXaFWjmcMjjRjnR09EWGRslZ1s\nhMnIpe4+qKCduRCx6uq0mIVVJdq4SB6CUsc/QRwwID6KmYD13f2Bevu2EyFDYyt3H53Zvhbi5Zgr\nwtYHwGru/lBm+7JoUZ9H9N3I3gao/OwniI9sKeRciyqpNZW1jEKlIxshXpslUCr8KrFO5mDzAZTB\ncn7GwXYoIqguzPMU7JWemObYqiyTsB0IjoWkLOjZEtlYJwDvuvsfKmzTG0g45GEz2xaVoC6Doq47\ndSKiXw9hknoo8DMiCfvDs/CFuz8ePv8IHeMTSAAjSqzDzLYiEGPT3VEUdc6CMytvMueo9PYZtBhs\n2kebWVKW9SHiaCk7j/kves6vNJXqzo7K9HZF/JVLNDSQbzMhjd4w7/sywZtWEMa8vd39g2bjX+y4\nN63CzAaVDT43sLlpg69XRgHJfh4narIUKol+D9EhJFyhv0dk9tsXsFGp8FOw2Q/1YftTKwv/AJWT\nHe2RWbFmdgSiVBjr7kfE7Jtjq5IAVZWOyZ6+Bgrt2RgFQWdBGdLpvtw7NacHMLMHgRs9X4Bh/SJj\nVVj77F/vepnZFsAJ3qBy58uAdq2BUvYHoLnLH4GBJZMV+qOqsV8i2qNFQ792FMrGP6tM2/rQHb2O\ng81SBKDN4E0IQlPIOn5aqVUHDSqJp38z5Hz5q5ndgSLvMTgcDW7XwpSB6yzEc/Qk8BvEdXZEi20u\nhZ7uNKtyEYTIWvOeuf6ooyuKC5FIxcGIy6dHeMnbuIA4CUXQ8hQUTyaCENnMGqagu/tLjb7PYGZ0\n/rN4K3wXg1GIkHnbxAlmZt9Exx5Lzoy732BmuyBC06eBNbxESnlwmiyKHPz9Een8LcApXr4E8/fA\n30zqYv2ATUIm505AbEZFw4kp
KteNwQZkyuPc/QlTWXhHRA5CxPwTVKr7GHJ6lEU/4DchG6uU4mcO\nBlLjR9wAlax+aGZXo4Vlx2BS+EyUQ9cEFkcLv2sQH1sMzgSOAx43s4WQs/kaNOGdhTius70RR9Q5\nwKbIabQIChycGtku0GLxELQITYIQK6LrcSowCDjNzKZz9zPzTUzBBMRZ1Sp+jxawoHnINYhM/B20\nYC6Dk9H99j00F9ocjZ2Jw2GqIhnzzMzQMb7t8ZxaXyqY2XZocTYIWDlkIu+DBGqKiGY9ayoZG4Xm\nprcWyQZrhLz/G8aUY1DA+3zyOYca4UTklD4gBIISXIdECoqgauEnUL+xM1LPTWfOH4H45w4paigE\n8L7n7hua2XVmtkw2yBeJEYhn97s5Aao/UVzQ4Tg0zv0MOSavRXOZtGPyQIpxvh2G+opK1kDWngqj\n4YjG4ODYoNlUwHfQGiiLs9G8sAiuA44ysxuyGVNmNhMav67N3fNLhHatgcxsdTQ/3hJVBlyC7uEy\nOARxix9AjY8NJFS1Twt2+5CFu/eqF2FAL/LqYBvfApYN7x9EktegCfiHkbZeB4akPh+NynmSzz8G\nnoiwdwCSAk8+rwLMkPo8K5pANLLx1fT7Rq8O3yt7o8jgCJSdeDriqpiAIoWx9q5B5LTLpbYtj0gx\nr46w8zHK+unYuZnK1yEpKcxuXxz4ONLWZMSTlvuKtDUSDXQzprbNFLbdEmlr/vCsf47Usp5Fzo8H\ngPkK7H9BnderoZ1TthVsz2HAzG28pmuGfvad8GzdhbLXYu08jRbdlbQ1PO/L5GxfFni/XeejQLue\nq+KZRw6Oeq//lLQ5FpVRJQ7ndcL2pdBCpCPnLLRhUmjTpYhEeckWbL0HLBLeH4BKt0GL25cibT2F\nFmPJPbdweH8k8OcSbbsEUQZkt++GHJ6E43+0w9djDkLlRMn9X0dcbiCH+qLh/Sak5jUdOK5+oe8e\n3MnzOxWOc3fEF3sImock9+2OFJwzh77/COT0+CQ8o2OBM5Djde4W2zgvWjh+juZcS5S0k37e08/o\ngsCnBW1MBr6e+jzFTvg8N/HzjteATXK2bwq8Gmnrh4S5FXLI/6zFc/8eoiDIbl8RmBBhZxyiowAF\nLyajTL/k+8WK2qP6NdAY5DQFZUd+hBxsVwOXlzxvH6Xvi570QmIOP87Z/hMKjnvhPn8VqcEeEO7V\nTZGa/Evhu5ae+2ntFfqxg9EcdzJwOyoRHdCi3WeozdHS/dpiwPhOH/e09Op1GWzu/mVQpLoZZXg8\niLhQrgvbv0u8PPrsdM2yWQOVsya4Fy3ui+KPKPKTRGmvR+VAz4XPM6NJ/a8a2BhvZgkh5wTyM7CM\nziua/ArY1d0vNLMdgeNcqbRHooVCLHZCkbz7zCzJHJkOlQwMjbDzFHLk9BZUoqAYkCXWnj5s24+I\nyG/Ar9G1eyVEp0G8FJ8hOfnCcPeXA7fLutTI+p9w96LZa1Zn+3+afF8PhyOHcluiqa6y2tEVmPom\nItutqp2VZhJWiKOBP5jZdu7+btNf14FXr/4Jyky4AD2nb1C7rqujqGgnsZSHks4KYNSeo3Wpjcsv\nIfGaGCxATaziE2rqyv9EzuY9I+1thCbjWYxE2RGg9hbKKAwZwmuioN4FrhLIeZGT+cMm+zbkIgu/\n+QLdKzd7HFn5ABSABKn5zYUWII8CHStFdvfJJvGWOZGzaFrFXiiT/EozS5eM3QecUMRAuu83cTZ+\nn1qG6Q7A9Gb2lLt/N6ZhZjYbegb2QnxR63hrAgKfoUBvFovSVZSqGbLz21YrDuag+1yIsC12XnoH\ncLSZbY3WCpjZcJRRe1SJcbUfmazogIlk6BuaoEpF+qrXQNkKo8u8fIVRghuBIdTWUT0JZwJ/NbOF\n6Sqw9FuU5dkU7v6miRv4NLSGTMZRR8e+h7vnVYP0SoTS8nWRo/kfiG86mo6lDr5JvlJwP1pUbe
9D\nV/Q6B1saZnY28GvPyJWHOucR7r5TZ1rGHqi8Yn5gSw8knyjb6cJIW2+iVP6XTUTZy6HFc4JZyR8Q\n6yG7UI9duAOsTa2kqCc7PCtdBLn4JzYKJXeJE+Upd386sl0HAsPN7BC0sMiWeXWTwv6S42LgLDPL\nU1CMeh48owgbcJ9JXfE3QNPFYcrWY2Y2GJUxJNfzQuB8L1gmlOYHcYWRbg4L2WHAzGZWiB/Eq+cG\nK/NcFzdu9lVgC8R3dpK7jw8cV2+5++sRpqqemO6JItEvmFlSAjs/chT9vKL/UQZ7IuGA10J5VZYX\nK5azayHkPLnDWyS5dfcRZpYsUm5w90nhq5eA37Viu1V4jS9tLuDbYfMYLydWcT9wkJndgpwBSf+/\nEPml4o3wBlpEvojO00qIZHwQ5Z69d1Ep3EmZ7RtTG2sHUCAgYSKyvwGNfzOggN8HaEE1AyoPbIQi\nwjP9EO/TUDM7wd2Llu+NQdfxBXS+djOzF0KbYvqNduBA4Hgz291Vyj0tYhDKtM7iM2rlwIUR+p5R\nZnY7ymjeEAVoF2u4YwZmdgC6P99AmaFFSlWb4WrgMDNLyuM8UEwcS1wJ9TlmlozfMwKnB4cR6HmK\nxcOo79k7s33P8F0hmNkcaD41HyqhTWgRFkdOyvXMbFWUibySu/+pgNkqA1RVOSarXgN9iBzpL6FA\nauJk+pTyge9/o75jcfLn9FeXtFsFjkL9//7IOQa1ktoi9wQwRQBlI5Pw0LfQODfW3cc33rNXYiKq\nCrg2NZ+qCk8g4adsss5W5PftfSiJXu1gQ9GyA+k+6ZwJ2B5lHE11uPsEUs4bM5sVcRdsiJxsv48w\ndx1wjJn9FkVbPkblQAmWQuVoUw3unua+eR54OTgXpiBwmsREldqBqhdBAASHWqxTLY0bwt/shKUn\nZP21A/9HjU+rm4JiRf9jDOJLKQyrKVydGSbeQ9Hibwhdn7FGyOMHORNlOnaaI7Et3H5mtgTicfuY\noMSFItJbo+jaDhHmKp2YpjIJ16Gmgvaku98SY6cNKKUyl0VYVF0IrIeu72DgOTM7B5Vz/l8Zu+5+\nFxkuoQ4vCoBasAyN50kGxSQz+wdyXMdkaOyLzt3WwLGpwMiWwP8imzYKlTU+iLJFTgp8n0OIcPKn\ncBTiWFuLrkIwG1FziK1HMd65U1BG0tKohDvBFXTlbMmFR/DQmIQi/kJxfqxTEJcpKAhxAwpwfI7K\nFDuJf6AM/ofN7HNqWf5A58SHKsbzqGIhuzjbgJqDpimCo2MlFGBdE3HqvYwyrvcknh/xGHS+nwF2\nsDriWR6n3rc/Ki1/G60JbkPqff+jeLZ7EX7mWJ7QA4B/m9m61PqdldFYulGEncPQeLlINnvIzA5D\nnKP/RE6krDOvHvICVAugsTk2QFWVY7LqNVCVFUYJkn41rx/s6Jw+rM1OQmPUrGFbbOVI2t54lDXY\nhzrwFtVBm+BI4Nzg+O4HbBH4KrdHIix9qAi9UkU0ZE8YWtQNpmu6dyJhfoy7z9uB5k1BIDbcGU3g\nX0MT78vcvXDnZGZfC/utiiIvO7j7FanvRwJ3uXuhCYNl5KAtpQIYPs8NvOYF1U3MbBKQlIumt8+J\nslk6NrCY2d+Q82+YieT8eJRSPwS4okyGo0lWfRPyRRMKkWSb2RoNvl7S3f8c264vA6xFBcVgI1vy\nYWjBdgTiIlmmgI0qFa5eR1Ln94XPRyNBglXD5x8Dw9x98UIHyJRn5xjkJMpT1mwqwBCe8/do4mQr\ns2g0s5vRhHt/VFaYqIiuApznBdV0U+1s0Lwolch+aJG+BcpKcrSgvBT4ZzYIMLVgUn1aBXgkBF9a\nsXUOcmLugq5Bcu43QIqPhcqyArHzMHf/KLyvCy8nnFAJzOwMVGqxJ10Jwf+EyhN3r+B/DEDqooVV\n6MK91s9roi3boFK5scAZ7v55iXasgo5zSqYeysS/s/5euX
beAb7v7mOsq8rvQqhsPVbApdH/GojK\nX2IcH+n9Z0bZTi+5+7iq2lWyLQ0DA97DBZ2KwMyGorFyf0SGPRSNyQcBQ939ogI2RiGH2vPIafVf\npJRdOgMx9GtN++cY52/K9irI2TwL8EAPCLYQstz3oJbp9yTiPi6sEh4yP3dz9xvrfL8Bch4Nc/dh\nEXaNFgNUVqEaYxvWQAOpVRid5u43hO3DgInuHpMA0Yc+THWY2WrImTulXwOOdPeOCHlNq+itDrbJ\nNB+MD3P3o6dGe9IwKevtiBxrXyWQF6NJ7hMt2J0NCSRMymyfI2wvNKEP5+5QNFCB0uWPR7XioHTr\nIyMcbJMRueXbme0Losl8dNlBVWiyCDrd3WPSyjEp912NytkWQ2VnCyEnzwPuvnbJdiYZjkMREey0\nlsE2BWY2PyjTqOT+ec++oej5Nu7eNBPFxI/wBXJibYeiPjfSVeFqeXdfqYCtTxEx9svh8+3A9Unf\nExa1j7p7Ub4RzOwatMg+DZVNdTled7+4gI3JSFGoYblXmUWjmU1ApMPP5Czgn3L3GWNttoqwKLgG\nZQA8jPhsDC0SlkQiJJtN7Xal2vcp8B13f75FO68jMYmHMud+YeTAm6Wgnf8ix/CE8L4e3N0Lq/xW\nDTMbh5zdozPb1wIucfdY7jRMpczJ4vEJd38kcv/pEFfU2e7+Suz/bzfMbDywiks9N32PrIoCfDGK\n121ByIAahAItX3S4Lf1QlvWmKGg2Ejklpkk1UTP7GXKyJcGu14DD3b2Q+pyJf/Z1lJU7GjnX3mm4\n01RGTw22VImQHbZIvT4oBINfcPemlU4mJch13D3JxP8jXbPMvkBrqpboCFpBVWugHLvT9Pw7JE2c\nQC1g26V6Z1o73mkRJtXyv7r7p6Zqm25VY32oHr21RHQt1EmMQtlhadLoz1Ga71RP5w8L49VR2dM+\nBD4bM2vGedIU7p67UPZ4wuyXqDkSQGWU2+X8piHMLOEtcCTfnM5G6o8inK3IhZdGiGid6u6XI/UW\nAEJ09qIQERuD+KNi8EfgBHc/PCxctkSEzedTK/uMaWdehuMesXZ6OsKC9HBUpjBL2PYhcmQNi3R0\nrk1Xh9NklMH6TMRCbQWkIvWISdxgVxQ9nhzaNoJMuVwDVM0PAiLxXdPdH4jcL4uLspmlFWEi4Tpm\n8C1qjvqGMLPrENfOe+HzgcjpPSF8nhP4b0Tm346o713H3W/N/K+1gSvNbHt3jy3nqQqPof6mJQcb\nup/ySOpnR2NfIbj7ama2sJm95+0RTqgKibJpFm+F7woj9PsXoIy45BwOMHGy/bSok8DdvzBxRlV6\nLwWnwLfIz1r9T+5O+bgJzT92TXY3s1lQSeZ1dfeaCggZayOolZEvikqcRyAFxUIiDhXjENRn34J4\nmH6NrkGnOHzbCnc/Hzg/XItZSowRAxEH0JqIN+1CM3saZbONRg63MhyJlSAEW66mFmx5lFqw5Rzk\ndOtYsAWmZFGtSP6zXrRfGYech/Wc/IOoCYo0ww5IkfTa8HlP4HFqZdKLIadqliNyqqHCNRBQ7fzb\nVJJbF+5+ZBm7FeEcVHFzFDkB2z58KXAicBEan55HVTvtmNv3IQ3vAVKmnXohue1+qc+zoknl3URK\nZ1fUni/QgzA4s30isHinz1fFx3preE1GpTu3pl43Isn2jkjehzZ9gZw3ed9HS6uH/T6gJvs+Hvhu\neL80ihQWsTEP4h0bixaOI6bF+yNzzKeFY90N8WUsuKB8GwAAGwVJREFUFd6/jlL0O3F/fD1zXRdO\nfS58f4RjuxMtOIajSe9XUt//DLg3sn1PoayTVo5xUvoYKz5/ZyOS6OnCuRuEyhbvQ4qg0e1Dpaal\nrkH4/U3AgQ2+Pxi4cWrfa6n/vwHi6/oRmhx9Nf2KsHMDcER4n5x7Q9xil7VyjyAxkrk7dY7qtHEk\nygKfMbVtprDtlkhbF6
JSiiVT25ZC4gfnR9q6CpUqVXWcK6HM6Emhf0q/osYqRHr+OCJDnoh4nsah\noFJb+oSItiX8cEm518Jh+6bAgx1q01ikNp58XheR/vfrRHvafKyjgIE5278KjCppc1bEL3wc4g/8\nDHisg8f4izCerJXz3drhu+072L6NQxsmAxPQXDJ5vRth52zk1PxKznczIGfn2QVtJRnNyefsnOjn\nwP86dc4qPPdtmX+jsT39egwJGb2Hqls6ecwfAMt0+tz3vVq6hi8BuyOfx2QUyF8g79Xptk5Lr443\noCe8UObCuWHC9jQq/VqhA+1YCZFdvo+cfHsCX6uiA++pL0TwXHiBOJXaNBll6b2HiJ0HZL4v62B7\nA5V5gRYvm4T3S6MU9Wb7XxPadAGKFvYP26fZ+yMc33uorC27fSPgvUhbk5ATd46y1zTcH3OlPn8A\nDCpp62uI2HlyeO43z3w/Ejg68hh/iDJN5mnhnHdxIlZ8PWcP12AccmQ/jxZVt6OMiOj20YKTM/z+\njUaTSGBZ4I12nI+I453iMEm9ohwoqNz17dCXfIaimo+iaGZUQKPZNegJL2AJ4NVwr40Mr3Fh23cj\nbU0AVszZvhIwPtJWonp5Aiov2iT9KnGcDyGn4XdQhtBs6VcJe9OhRfFxSIRgKDBTD7ieLyJFwy73\nG8rce79DbfoMmD+z7VNgvk6frzYca+64gDKpJpa02Q9VLByIgqsfxfRpbTjGnh5seRo4GZi5RTvz\nhXHvRSScsAlyVB+IFuRvUnDBHfqyhVKf3858XpTIuVpPezGV59/IaX05sF2Hj/sJYNlOn/++V0vX\ncNcwTk1q8IoOxvW9Gr96a4loPa6zGYDNvAWus1bgQYnNzPZBSmU7oYy2fkgy+2VvQb1lasDMhqCB\nv1BJipcgnZ1KuAot+K9C12RTD0IOLeAuFHl/EjlAhpsI87egWEnhhoic+zR3H9tiW75M+Ax4IWf7\n80SUtQUYes7vM7ON3f3xzHdFUYnClYuYe/V6/CDAj8kv6WuEv6FF9qtm9i7dlTWbire4e79mvykL\nl4rUWiaxjjTJ6o0eZgMdwBzklxImeBM5BjuFtVrZ2cyWcPfH3P1Rk2LUXui+mANREoxw91craGeP\ngrs/ZmaDUSZoQgh+Ico4i+XImg71RVl8Sjzdxl/C3zxhGydeNW4w4pp7JnK/brCaOvJ5gfNyF+LV\nkduFucgvbUkEZjqB6dA9kMZEYPoOtKUtMLOlUh8XD/PnBP1Rhm2h/iOUMg9BJaJrIQGXAWH/W1GZ\n3a2tt7o0lkIOp3q4nuKqmu3AN1Gmd7TIUxru/oqZrYz6oj9Sm/84Usrc092b0r0EDCQ17/Hu3Jb9\niFP+7ImYqvNvd3/fzA5Hjr1/tvv/NcA+SIV1N3d/oYPt6ENJuPtfzexClMH2CMqy7lG8l9MieqWD\nrZ1cZ1XA3T9C6dtnh8XQziiqdIyZ3eztlfBtFf9E0aqiIgcD0LHVUzyM5TmrDO7+pJmtgBZk95rZ\n1t6agtR+1LinDg/vt0bp5kUURFdF98L9ZvYkOtdNVbumAfwZ+J2Z/cKDUp+ZzYC4b2IVUx1xZhwI\n/M/MtnP3q1LfFUGW2P+8nN9E8St5tfwgR5TYZ6rAzKZHPC17uvttqESlDJzu16uVBXZ/lE1XD5Po\n4HgZzlUreMTM7kXO14s8QhWuUbOo9hpUjpSz6MxA7juU8s6iUcBJZratu78Z7M+DSrtHxRhqgwP7\nbpTFVdrBZil1ZDPLU0fe18wKqSO3Efeh7JER4XNyvw1FpaydgNE14ALdgy54SaXUHoKHqD3veff6\nJ8hpXwQT0D31BnKk7QuMdvdnK2hnFejpwZYbUf/VasAXl2jOhmY2O3LSg/hoY+cdr6Bs4TF1vl+K\n+lxvXxZ0Yv6dZCFPVQShm/RYPgB4NnBlZwO2U52vvA/xCMk5j5nZL4A7PEL1vA/l0FtV
RL8gJxIR\n1I1aUutsF8ysP+Je2KknO9hM8uHTu/uLBX9/ISJl/yf5ioenVN7I5m2ajMrr3gqfDUX49kOkvBcA\nr3mH1HOCUzLJcFwROQf2Q3wZPTrDsSjM7PLMpoTX5uHweWmCYlvMwiV9bc1sV9QP/B45Hl7t1DXt\nTQjKjiu1km0TruP11DKKNkYLv3QW4QZFr2eOvSyi7FWNQKhcF80yhk2y7L8AtkJBjEuBs9y9dEZS\ngWuQtG2qOxbSziIUwMg6i2ZGGV+FnUUmZetEofeFsHkhxHm4cZFsj3ap7ZnZ5qgfOx6V/GYXQU2V\nTq1CdeR2ISiZXo+CGjsirtbFkbr3Gu5+fwfa9Pciv+vB2fpNEe59Q06dFVEJYILPgbdysq/r2doN\nuNXdn668oRXAzCahOUKu0EJQVZyq8z8zS8/55wIOQ/Qqec/61VOrXQnM7BQ0R1s+23eFPu8+xHn5\n66ndtqrRjvl3UHnssglxrW4H/Mfdty3f4lLt2aHob72EknwfOosgkrIVUoI+3t3fNbPlgDenxUqG\nTqG3OthWQpGIrVG5XhKJeJ0e6mDrKTCzrxb5nbu/X9DeBOCH7n5HSw2rEGGC9Q3PqGOZ2TbIEXMr\nsFGZCVbVHVsqw3E7lKbf0zMcC6HoogXiFi45ztO1gH+hEsV1pjUHWyjH6ZJ55SXl6KtCmIx/6O6H\ntGCj0kVtT18kh/s2iymDd4QjcQDwE+ScWA1lPJ0FnOvub0S2qcees3Y5i0KwZQNq5aZPElHaHLLk\nf+juG4fPH9Bdbe94dz+xjol6duvdHwZ4kfsjOL4TdeRZECfkConTyswWA+5y94ExbasaZrYwcBBd\ny8uPdfdHO9muPkwb6InBljrPdx4KPetVIzgdH0LO1j8jnjhQMGJPNAdZNsn8nVZQ1fzbzJ7PbErU\n7UcBf3D3WJqQlhGSOvZHvHxfQfylw0pQK/ShByGU+9+C+AQXAr7t7s+Z2e8R5+L2nWzftIRe6WBL\n0BsygapGGOgb3TSFJ/TB3vPIWfVkFe2rAlknTOa7ZYArEalx1ESmnR3blyXDsdMI99uQUDaWbPsW\nykxZdFpwsIWI8VHIkTIvGW65Th+jmZ2MsqmeQpHtbMZTI/6bXgkTR18a0yPhhaOAQ9x9ZAmb30LX\nYTukjnbDtNJ3VO0sypQ2l+bfMbP/Ase5+zXh8wcoqPdc+PxzYA93XznS7oKNvi+SUZ4TfMi2bapn\n7mTaNx3wU+TQnKYW6l8WhMyWce7+7/D5OESg/QSwbdHKhZ6Mnhw46Mkws0FIFX09uvO5/cpb5zDu\nsSg7/zazfd39pAbfz4rG5VUqaGYUzOx3iMrmFsQx+QPgQnffaWq3pQ/VwcxGAve7+wHpMd7Mvg9c\n4O4LdbaF0w56tYMtjWk1E6hqmIjJp3xEZP1DyRDcekHOoLCg2BTYwVskba0K4RjvcPdcTiYzmxNl\nIUTxbJnZLUhyu69jKwEzmwtFRAHG1CvhKGl7RmDuaWSBcDJSWB2GVIn3Q4phOwEHdTqlPzgZ6sHd\nvWE5ZB9qCH3Vie6+fMn9ByABgD8CAzvtfK0K7XAWVVTa/DqwsgeyaDN7Gzn+ks+LAve6eyd4dyaj\nPvDt8PkDYCkXT1PHHWyhDR8jJe4vfT/9ZYSZjQF2d/dRJoL8kYjH+EfAF/7l5pnr0TCztVF22ErZ\nCpEQgLkT2M/db+xE+1JtmQPxQUI5PrdeAzP7BNgtby0RxuYbga+5+2Lddm5/28aibOq/hs/rIt7y\nmdy9aEZlH3oYzOw9YDl3fzazDl0Qratm7HATpxn0SpGDPLj7GOAAMzuIEInocJN6JLKOs1BOeVcL\n0an9Ubnkm2b2At35JJYrabc0mjkHQ/ZTlHMtYAVgt5ztr6IMkj7kIEw0RgDbUxPBmGRm/wD2inXM\n5pXpIh6faSUrYnMUSR1pZqcj7pNnzOxZJPDQEQdb
KO163t1X68T/n0bxJjWnc2GYON12QvfDZKSi\nfVa1Tes4qhZgOB9l/JUubabNantmtjiwACrpmQIvzst0jlWgjtxG3IMyN/scbJ3B/NSENDYDLnUp\n1N0BjO5Yq3oH9gHOzDrXQCJJZnYGEproqIMtzKfu6WQbvkTYDvinmU1I99Eh6/oGJPy2Rr2d24wF\nUKk0AO5+i5k5qor4sgtW9GZ8BuRRPS1KV27NPrSIPgdbBi6i1ivDqw/tR286z30dWzmciCYZGwMJ\nV9+qSKBgOLB7UUM5ZbpnAu8CW6AJxbTAP/A1ROwOKo1LFM9Go3PWKYxFxL1JVtHFwN595V7NEe7b\nLpvQuTwQcd8UsTEv4l7bEWUY3AnsDVziUq6e1lC1s8iBPUMkv2xpc1vU9oLz+gpgSWrca0mboZiq\nd+XqyG3AX4DhZjYfcD/dr0FTMYc+tIQPgTmBl4D10dgMKiGbqVON6iVYGols1cNNwP9Npbb04f/b\nu/8gO8vqgOPfQ6AMiQwQHAPTQqEFi4JhFHQsULW0QEUoCiPCtIQCQ39MGAd/lLbOVAEBUYsdW8ZU\nBxGslkqF0oCBQIVQ+dUipMigUHBEKCABS8EUNIGc/vG8V24udze7uXfvs/ve72cmk9z3zd45hN13\n9znPec4Zgsz8erPhe1lEvCszVzUbytcCiyiDW56oFN6WlK/rbusp7Sk0dy0HPhoRxzavM8qE9U8C\nV9QLq31MsKmqzDyrdgwj5INt8xxDmfi3quvaiqa8/nKmkWCjLAgu6Tqm+/P3o0yHbYMfUJKFj1AW\n8kcDd1J6aExp+MgMiZ7Xh1OalWvT/pONEycddzCFauum6f9vA09TkiQXN1XbbTUTyaL9gE4Cpzfh\nOdXquBXA2RHxjew/be9jlGM40/VZytf9bzW/v4WSCLmAKS6650hPqX9sfu+3UZBMLZGozXcDcFFE\nrKZsDK5oru+NVYUzbRE9Jzx6vEiZMKo5JDMvao7V/ktEHAWcTakSe3tmPl4xtGDjTSp45UZVlSnh\nGsiHKFPkn6JsitxMOUF1O4NV56uHCTYNw0BHb/od2Yt2jgzuPNjW8PKDbWd8sG3KfPof31zT3JuO\ncTim+/eU/85bgE8DV0XEUmABJrTmqt17Xm8AnupN0kxiPeUZe01Tpd1qw0wWDflo83mU4SMPRMRE\n0/bO24z3/XXKUIenm15qGzLzlqblxd9QjlW2Qe/XgUZrKXAO5ajoMV3DgvYDLqsW1Xh4jFL9OlEP\nyMVArWonDSAzP9Uk2b4JPAy8IzNrH8Ps10qk30aV5pDMfBY4JCIOpGsSd2b+a93I2scEm6YlIq7s\nufSKHQ2Y+q7GmBzZAzZ6sB1E+WHoVZRpLtOeADhmbgfOioglnYRCV7XH7dN8r9Yf083MT3b9+dqI\n2IeScHsoM2v2RkmG3xer1ZpG4jtm5jVd15ZQBlgsiIirKH0IfzbRewA4sGcgQzvanJlPNkNtlgHn\n03/a3uYcmZ4HdCpyn6ZUQDxAqSqado++WWxtJ6kTEbsAp1I2q5Zn5mTDUzQEmfm/lEQw8PMph8cD\n76Qk2c6pFNo4WAF8PCKum6D69SzKpGPNEX3WU+spz+/PRrxcrF6jSmyOVDRrGiJiC0qLkKMp6+2k\nVLz/KCIinXo5VE4R1bQMe4T5OEzWnGCRfCLlB6L5lD50m1wkj6smQbSS0jfpnubyvpRk2aGZed80\n3usiytGpYymJ3MVAp+/iv2Xm6UMMvYrmCPJVmbmu5/pWwHsy8/JKcW2g9BbpfJ4fCdzIK/soeeSg\n0RztXNVJmkbEG4C7gUuA7wF/Cnw+M8+sFWPbbWoi6QDvO7Rpe81k3gsy86qI+AdK38VzgD8E9s/M\nvQeJtbbm8/5qSuXUg8BxlCbgCyiLhPmUNgLj1NO1mmZIyimU9g2PA1cCV2TmnVUDa7Fmiu/dlJ9X\nLuTlPo57USoL
51GmA9rTdI4Y9npKmkiUjO3VlNYs9wD3Uzb4Xkfp3bo8M99dL8L2McGmqsZhZPAE\ni+S7KCXYLpKnICLmA79H+WESyr/bVwEy84VpvM92lGO6+wPbUhYHnf4Dh7eh2Xsz2XfnTkKg6/qO\nwJrMrNKnyB8mpy8ingCOzMxvN6/PpfRmOah5/V7grMx8fcUwW22mEmzDFBGHAQsy88qI2INSyfJa\n4MfASd2bO3NR8z30RUrV3wnAEZRNl1Obv/K3wH6Z+dY6EbZfROxEqX44hVIFfjnwx5Svhe9WDG1s\nND8XL6P0U+2ufl0JLM3MH9SKTdLsFREnUXq1HpWZN/XcO5hSZHBaZtYeZNQaJthUVUSsAQ7LzNU9\nCbZDKI24d6kc4sBcJA9fRGxN2bU9IzOn3Tutzf0HmoTAosx8quf6PsC3MnOH/h+p2SYifgrsmZmP\nNq9vAa7NzHOb17sB92bmttWCbLkmYb1T5+up+T61eDYsZiPiA5n51xPcW0g5cnRdZh442siGKyKe\npvSY+05EvIoyrOXNmXlXc38v4I7M3L5mnG0VEVcDb6MM4Pgq5XPqpYhYjwm2kYuIHSjVrwE8mJnP\nVA5J0iwWEdcDN2bm+RPc/whlXXrYaCNrL3uwqbZxmKy5Axs36X875ahcx52Uoy/q0iTRzgQOAdYB\nn2qOQJ0EnEs5KtF3cTnJey4BvpaZtwK3dl3/BeC4ubx7ExG383KfsxXN4qdjHrAncFO/j9Ws9SSl\nsfujzefomyi9Bzu2ZfLJchpc7zS1gfqODtl5EfHjCZ5bP6NUtuw44phmwkLgRwCZubb5t+9OKjxD\n+VrQzHgnZVjGssx8sHYw465JqHkcV9JULQbOmOT+tcD7RxTLWNiidgAaex+iVBF1T9Z8CFhLeyZr\ndhbJdC2S7+i67yK5v7OBP6E04dwN+KeI+ALwAeCDwG7dDf2n6EvAdn2ub9vcm8tWUb5+gnLk9eau\nX9dRvtZ+v1Zw2iwrgPMj4jeATwDPA93N3BcD368R2Bi5lPL96dnm11coR8uf7flVwwnA5yNioyEW\nTZXXSuA1wME1ApsBDkip5yDK98i7IuLfI+K0iHh17aAkSVOykI0LPXo9SSkG0ZBYwaaqxmRkcGeR\n/GfAu3GRPFXvBZZk5vLmeON3KM+sfQeYdhP0X5j9EvUWyUORmX8BEBEPA5f2ThrTnPSXlAbiN1M2\nHU7sGV5xMnB9jcDGxWzuCZiZX4+I7YHLIuJdmbkqIhZQdqMXUY58PF43yqGZrIpw60oxjYXMvAO4\nIyJOB95Hee58hrJJf0hEPJqZP5nsPSRJ1cyj9DGdyEuYExoqe7Cpiqap4oXAWzPzuZ572wG3AR/M\nzJU14humZqf3SsoucGeR/M9d979J6R/Tloq9oYiIdcDumflY8/oF4C2Zee9mvNdqSmJtX+A+Nv5G\nM49SYXhdZh7b58Olqppn4trMfKnn+sLm+rr+H6lxEBFnUCq+j6JU/v4iJbn231UDGxIHpMw+EfFr\nlIEHJwDbAzdk5u9O/lGSpFFrejNfS2kd0c/WwO/UGoLWRibYVEVELAdumqRB8/uBQzPziNFGNnNc\nJE/PMJuLR0Snb9XHgAsoic6OdcDDwBVt+H/QJCInfLBn5vwRhiNpBCLifMpE6oeBd3QGY0gzKSLm\nAUcCJ5tgk6TZx02q0TPBpioi4oeUbPn3Jri/F3B9Zu462sg0W/TZcTkSuBHY7ObiEXEiZchBa49P\nRsT7ei5tBbwROB44OzP/bvRRSRq2iLiy59LhwD3AY90XKw1gkCRJGjsm2FRFRPwU2CczH5rg/h7A\nvZm5zWgj02wxkzsuzbCJ19Az6CUzH5nue80VzQTVozLzmNqxSBqcu9KSJEmziw3tVMtjwD6UiaH9\nLAaeGF04mm1mYlEYEXsCFwMH9N6iHKtsc/+BW4HP1Q5C0nCYOJMkSZpdTLCplh
XAxyPiut7jehGx\nDXAWcE2VyNRml1AGHBxBSeCORQlvRGwF/BEmrSVJkiRpRnhEVFVExCLgbspo4AuBB5pbewFLKZVE\nb8rMJ+tEqDaKiP8D9svM+2vHMlMiojdxGMBCSmJxSWZeUSUwSZIkSWoxK9hURWY+GREHAMuAT1CS\nAFASAyuBpSbXNAO+C7y6dhAz7Mye1xuAp4DbMnPN6MORJEmSpPazgk3VRcQOwB6UJNuDmflM5ZDU\nUhFxMHAO8BHgXmB99/3MfK5GXMMQER8F/iozn68diyRJkiSNGxNsksZGRGxo/tj74AsgM3PODjmI\niJeAna1SkyRJkqTR84iopHHym7UDmEGx6b8iSZIkSZoJVrBJUgs01XmLMvOp2rFIkiRJ0rgxwSZp\nrETE9sApwOuaS/cBF2fms/WiGlyTYHuWVx5/3UhmLhxNRJIkSZI0PkywSRobEbE/ZUrtC8B/NJff\nDGwDHJqZd9eKbVBNgu10SpJtQpl56WgikiRJkqTxYYJN0tiIiG8BDwGnZuaLzbUtgYuAX8nMt9WM\nbxBNgm0nhxxIkiRJ0uiZYJM0NiLiBeCNmXl/z/XXA9/OzPl1IhucU0QlSZIkqZ4tagcgSSP0HLBr\nn+u7AD8ZcSzD5hRRSZIkSapky9oBSNIIfQ34YkR8GLituXYg8GngsmpRDUFmumEiSZIkSZWYYJM0\nTj5MmbL5ZcrzL4B1wDLgzyvGJUmSJEmaw+zBJmnsRMR84Febl9/PzOdrxiNJkiRJmttMsElqvYi4\neCp/LzNPnulYJEmSJEntY4JNUutFxAbgh8BqJhkGkJnvGVlQkiRJkqTWsAebpHGwDDge2B34EvCV\nzPyfuiFJkiRJktrCCjZJYyEitgaOBk4GDgC+AXwRuD59EEqSJEmSBmCCTdLYiYhfBv4AWEKp5N07\nM9dWDUqSJEmSNGdtUTsASapgA5CUfmzzKsciSZIkSZrjTLBJGgsRsXVEHB8RNwD/BbwBOA3Y1eo1\nSZIkSdIgHHIgqfUi4nPAccCjwMXA8Zn5dN2oJEmSJEltYQ82Sa0XERuAR4DVlKOhfWXm0SMLSpIk\nSZLUGlawSRoHX2aSxJokSZIkSYOwgk2SJEmSJEkagEMOJEmSJEmSpAGYYJMkSZIkSZIGYIJNkiRJ\nkiRJGoAJNkmSJEmSJGkAJtgkSZIkSZKkAZhgkyRJkiRJkgZggk2SJEmSJEkagAk2SZIkSZIkaQD/\nD6lwIZ0LMZmFAAAAAElFTkSuQmCC\n", - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "plt.figure(figsize=(15, 6))\n", - "plt.plot(np.sort(score_difference))\n", - "plt.yscale(\"symlog\")\n", - "plt.xticks(np.arange(len(game_names)), np.array(\n", - " game_names)[idxs], rotation='vertical')\n", - "plt.grid()\n", - "plt.title(\"Comparison A3C on Atari games: with and without LSTM memory\")\n", - "plt.ylabel(\"Difference between A3C_LSTM and A3C_FeadForward scores\")" - ] - } - ], - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/week08_pomdp/practice_pytorch.ipynb 
b/week08_pomdp/practice_pytorch.ipynb index 52be1fac4..2d3d43b46 100644 --- a/week08_pomdp/practice_pytorch.ipynb +++ b/week08_pomdp/practice_pytorch.ipynb @@ -1,678 +1,681 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "if 'google.colab' in sys.modules:\n", - " !wget https://raw.githubusercontent.com/yandexdataschool/Practical_RL/0ccb0673965dd650d9b284e1ec90c2bfd82c8a94/week08_pomdp/atari_util.py\n", - " !wget https://raw.githubusercontent.com/yandexdataschool/Practical_RL/0ccb0673965dd650d9b284e1ec90c2bfd82c8a94/week08_pomdp/env_pool.py\n", - "\n", - "# If you are running on a server, launch xvfb to record game videos\n", - "# Please make sure you have xvfb installed\n", - "import os\n", - "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", - " !bash ../xvfb start\n", - " os.environ['DISPLAY'] = ':1'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "from IPython.core import display\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Kung-Fu, recurrent style\n", - "\n", - "In this notebook we'll once again train RL agent for for Atari [KungFuMaster](https://gym.openai.com/envs/KungFuMaster-v0/), this time using recurrent neural networks.\n", - "\n", - "![img](https://upload.wikimedia.org/wikipedia/en/6/66/Kung_fu_master_mame.png)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ + "cells": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mWARN: gym.spaces.Box autodetected dtype as . 
Please provide explicit dtype.\u001b[0m\n", - "Observation shape: (1, 42, 42)\n", - "Num actions: 14\n", - "Action names: ['NOOP', 'UP', 'RIGHT', 'LEFT', 'DOWN', 'DOWNRIGHT', 'DOWNLEFT', 'RIGHTFIRE', 'LEFTFIRE', 'DOWNFIRE', 'UPRIGHTFIRE', 'UPLEFTFIRE', 'DOWNRIGHTFIRE', 'DOWNLEFTFIRE']\n" - ] - } - ], - "source": [ - "import gym\n", - "from atari_util import PreprocessAtari\n", - "\n", - "\n", - "def make_env():\n", - " env = gym.make(\"KungFuMasterDeterministic-v0\")\n", - " env = PreprocessAtari(env, height=42, width=42,\n", - " crop=lambda img: img[60:-30, 15:],\n", - " color=False, n_frames=1)\n", - " return env\n", - "\n", - "\n", - "env = make_env()\n", - "\n", - "obs_shape = env.observation_space.shape\n", - "n_actions = env.action_space.n\n", - "\n", - "print(\"Observation shape:\", obs_shape)\n", - "print(\"Num actions:\", n_actions)\n", - "print(\"Action names:\", env.env.env.get_action_meanings())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "if 'google.colab' in sys.modules:\n", + " !wget https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week08_pomdp/atari_util.py\n", + " !wget https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week08_pomdp/env_pool.py\n", + "\n", + " !pip install -q gymnasium[atari,accept-rom-license]\n", + "\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", + " !touch .setup_complete\n", + "# If you are running on a server, launch xvfb to record game videos\n", + "# Please make sure you have xvfb installed\n", + "import os\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " os.environ['DISPLAY'] = ':1'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": 
{}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from IPython.core import display\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Kung-Fu, recurrent style\n", + "\n", + "In this notebook we'll once again train RL agent for for Atari [KungFuMaster](https://gymnasium.farama.org/environments/atari/kung_fu_master/), this time using recurrent neural networks.\n", + "\n", + "![img](https://upload.wikimedia.org/wikipedia/en/6/66/Kung_fu_master_mame.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARN: gym.spaces.Box autodetected dtype as . Please provide explicit dtype.\u001b[0m\n", + "Observation shape: (1, 42, 42)\n", + "Num actions: 14\n", + "Action names: ['NOOP', 'UP', 'RIGHT', 'LEFT', 'DOWN', 'DOWNRIGHT', 'DOWNLEFT', 'RIGHTFIRE', 'LEFTFIRE', 'DOWNFIRE', 'UPRIGHTFIRE', 'UPLEFTFIRE', 'DOWNRIGHTFIRE', 'DOWNLEFTFIRE']\n" + ] + } + ], + "source": [ + "import gymnasium as gym\n", + "from atari_util import PreprocessAtari\n", + "\n", + "\n", + "def make_env():\n", + " env = gym.make(\"KungFuMasterDeterministic-v0\", render_mode=\"rgb_array\")\n", + " env = PreprocessAtari(env, height=42, width=42,\n", + " crop=lambda img: img[60:-30, 15:],\n", + " color=False, n_frames=1)\n", + " return env\n", + "\n", + "\n", + "env = make_env()\n", + "\n", + "obs_shape = env.observation_space.shape\n", + "n_actions = env.action_space.n\n", + "\n", + "print(\"Observation shape:\", obs_shape)\n", + "print(\"Num actions:\", n_actions)\n", + "print(\"Action names:\", env.unwrapped.get_action_meanings())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jheuristic/anaconda3/lib/python3.6/site-packages/scipy/misc/pilutil.py:482: 
FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", + " if issubdtype(ts, int):\n", + "/home/jheuristic/anaconda3/lib/python3.6/site-packages/scipy/misc/pilutil.py:485: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", + " elif issubdtype(type(size), float):\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAANEAAAEICAYAAADBfBG8AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAFmVJREFUeJzt3XvUHHV9x/H3hyBoASHcEgi3wAGO4CVGxFTKRbyFVAXaqsFWUWkJlVA80FMIKFLUAirQKBUImnIRQSqi1BNQCnhpEeRiCJcIJIAQckMIBAVpE7/9Y2Zhstl9nnl2dp+Z2f28ztmzszOzu99J5ru/3/xmnu8oIjCzzm1QdgBmdeckMivISWRWkJPIrCAnkVlBTiKzgpxEfUjSTpJ+J2lM2bEMAidRAZKmS7pd0u8lrUynPyVJZcYVEY9HxKYRsbbMOAaFk6hDkk4EZgNfBsYD44BjgP2AjUoMzUZbRPgxwgewOfB74C+HWe/PgV8Bq4EngNMzy3YBAvhEumwVSRK+FVgAPAuc3/R5nwQWpuv+CNi5zfc2PnvD9PVPgC8AtwK/A/4T2Aq4Io3tDmCXzPtnpzGtBu4C9s8sew1waRrDQuCfgCWZ5dsD1wBPAY8C/1D2/1fP94eyA6jjA5gKrGnspEOsdxDwBpIW/43ACuCwdFljR78QeDXwHuAPwPeBbYEJwErgwHT9w4BFwOuADYHPALe2+d5WSbQI2C39AXgAeAh4V/pZlwH/nnn/36RJtiFwIrAceHW67Czgp8BYYIc04ZekyzZIk+40ktZ4V+AR4L1l/5/1dH8oO4A6PtKdbHnTvFvT1uNF4IA27/tX4Lx0urGjT8gsfxr4cOb1NcCn0+nrgaMyyzYAXqBFa9QmiU7NLD8HuD7z+v3A/CG2dxXwpnR6naQA/jaTRG8DHm9676xsgvbjw8dEnXka2FrSho0ZEfH2iNgiXbYBgKS3SbpF0lOSniPprm3d9FkrMtMvtni9aTq9MzBb0rOSngWeAUTSYuWR93uQdKKkhZKeS79r80zc25N09Rqy0zsD2zdiTN97CsnxYt9yEnXmF8BLwKHDrPdt4Dpgx4jYnKTr1unI3RPAjIjYIvN4TUTc2uHntSRpf+Ak4EPA2PSH4TleiXsZSTeuYcemGB9tinGziJjWzRirxknUgYh4Fvhn4OuS/krSppI2kDQJ2CSz6mbAMxHxB0n7Ah8p8LUXArMk7Q0gaXNJHyzwee1sRnK89xSwoaTTgNdmll+dxjFW0gRgZmbZL4HVkk6S9BpJYyS9XtJbexBnZTiJOhQRXwJOIBmdWknSPbqI5Fe80Tp8CjhD0vMkB9tXF/i+a4GzgaskrQbuAw7peAPa+xHJ8ddDwG9IBjuyXbYzgCUkI2//BXyXpFUmkvNS7wcmpct/C3yDpDvYt5Qe/Jl1RNLfA9Mj4sCyYymLWyIbEUnbSdov7b7uSTIEfm3ZcZVpw+FXMVvHRiTd1okkQ/pXAV8vNaKS9aw7J2kqyZ
nvMcA3IuKsnnyRWcl6kkTp1cMPAe8mOQi9AzgiIh7o+peZlaxX3bl9gUUR8QiApKtIzqm0TCJJHt2wKvptRGwz3Eq9GliYwLrDoktoOrMu6WhJd0q6s0cxmBX1mzwr9aolanVWfp3WJiLmAHPALZHVW69aoiWseznIDsDSHn2XWal6lUR3ALtLmihpI2A6yTVkZn2nJ925iFgjaSbJJSRjgLkRcX8vvsusbJW47MfHRFZRd0XEPsOt5Mt+zAqqxWU/xx9/fNkh2ACaPXt2rvXcEpkVVIuWaLTMmDEDgIsuuqjtsqzm9ZrXGelyqye3RKlWSdJq2UUXXfTyzp+dn03ATpZbfTmJUm4VrFNOohyyCTZjxowhu3btllv/chKZFeSBhZyGGyRoXset0eBwS5RDnoRw0gyuWlz2MxonW0c6PJ1nHQ9x19vs2bNzXfbjJDJrI28SuTtnVpCTyKwgj85VyNhZY9ebt+rMVSVEYiPhlqgiGgm06sxVLz+y8626nERmBXWcRJJ2TG9gtVDS/ZKOT+efLulJSfPTR1/fm8asyDHRGuDEiLhb0mbAXZJuTJedFxFfKR6eWfV1nEQRsYzkrmlExPOSFpL/1odmfaMrx0SSdgHeDNyezpopaYGkuZJaHhm7Auq6sgMJjUd2vlVX4SFuSZvyyl2uV0u6APg8ScXTz5PcqfqTze9zBdT1OWHqqVBLJOlVJAl0RUR8DyAiVkTE2oj4I3AxSXF7s75VZHROwDeBhRFxbmb+dpnVDie5t6hZ3yrSndsP+Chwr6T56bxTgCPSu2gH8BjgvxGwvlZkdO6/aX33h3mdh2NV5D/hGNrAXjt374NHrPP6DXteOaLl3fiMPN9RthkzZrSsMeFEeoUv+7EhOVmG5ySy3IYqbjnInESWm4tOtuYksiE5YYbnGgs2rEEdnctbY2FgR+csv0FJmk65O2dWkJPIrCAnkVlBA3NM1HyPoVZn4lstzz5nNc9rfNasWQ/3ahO64swzdy87hL4zUC3RcAfIeQ6gszfpyvse628DlUTDnfNoXt5q/Tzr2GAZqCRqbkVaLW+ebl6/1fvdGg22gUqiZp3c1a75Pa2Ol2yw+IoFszZG7YoFSY8BzwNrgTURsY+kLYHvALuQ/HXrhyLCVTisL3WrO/eOiJiUydqTgZsiYnfgpvS1WV/q1XmiQ4GD0ulLgZ8AJ/Xou0ZkJOeDWs1v9Z6sQ37+89HZkA5dv//+ZYfQd7qRRAH8OD2uuSitJzcurZBKRCyTtG0Xvqdrit4m0iyrG925/SJiMnAIcKykA/K8qcwKqCM9X9TpOjYYCidRRCxNn1cC15IUa1zRqD+XPq9s8b45EbFPntGPbhvplQvtXvv8kEHxCqibpHeEQNImwHtIijVeBxyZrnYk8IMi39Ntrc71DLXcbCiFzhNJ2pWk9YHk+OrbEfFFSVsBVwM7AY8DH4yIZ4b4HJ8nssoZlfNEEfEI8KYW858G3lnks83qohZXLJiVpH9qLEz+wuSyQ7ABdPdn7s61Xi2SaNsdKnWayWwdtUiiDa4e6IvNreJqkUTzd5g//EpmJalFEo3faXzZIdgAWsrSXOu5n2RWUC1aIg8sWJX5PJFZe7nOE7k7Z1aQk8isoFocE90w2Vcs2Oibene+KxbcEpkV5CQyK8hJZFZQLY6JJs3zFQtWgpy7nVsis4I6bokk7UlS5bRhV+A0YAvg74Cn0vmnRMS8jiMEPvLx04ZdZ9aJxwFw5jlfK/JVhTiGfosh327bcRJFxIPAJABJY4AnSeotfAI4LyK+0ulnd2LtSWuTiRKvEHIMgxlDt46J3gksjojfSOrSR47MmLPHJBPnlPL1jmGAY+hWEk0Hrsy8ninpY8CdwImjUcx+0H79HEN1Yig8sCBpI+ADwH+ksy4AdiPp6i2jzW9Btyugjjl7zCu/PiVxDIMZQzdaokOAuyNiBUDjGUDSxcAPW70prdk9J12v8FXcg/br5xiqE0M3kugIMl
05Sds1itkDh5NURO25QeuHO4bqxFAoiST9CfBuIFtz90uSJpHcLeKxpmU9M2i/fo6hOjEUrYD6ArBV07yPFoqoQ4P26+cYqhNDLS77yWPQfv0cQ3Vi6JskGrRfP8dQnRj6JokG7dfPMVQnhr5JokH79XMM1Ymhb5Jo0H79HEN1YuibJBq0Xz/HUJ0Y+iaJBu3XzzFUJ4ZaFG9cvnzaaIVi9rLx4+e5eKPZaKhFd+6Wyb61ilWXWyKzgpxEZgU5icwKqsUx0TvunlR2CDaIxvtOeWajohYtUZ66c2bdl6/unFsis4JyJZGkuZJWSrovM29LSTdKejh9HpvOl6SvSlokaYEk31zI+lrelugSYGrTvJOBmyJid+Cm9DUk1X92Tx9Hk5TQMutbuZIoIn4GPNM0+1Dg0nT6UuCwzPzLInEbsIWk7boRrFkVFTkmGtcojZU+N66XnQA8kVlvSTpvHd0u3mhWll6MzrUqxr3eVdrdLt5oVpYiLdGKRjctfV6Zzl8C7JhZbwcg31krsxoqkkTXAUem00cCP8jM/1g6SjcFeC5TEdWs7+Tqzkm6EjgI2FrSEuBzwFnA1ZKOAh4HPpiuPg+YBiwCXiC5X5FZ38qVRBFxRJtF72yxbgDHFgnKrE58xYJZQU4is4KcRGYFOYnMCnISmRXkJDIryElkVpCTyKwgJ5FZQU4is4KcRGYFOYnMCnISmRXkJDIryElkVpCTyKwgJ5FZQcMmUZvqp1+W9Ou0wum1krZI5+8i6UVJ89PHhb0M3qwK8rREl7B+9dMbgddHxBuBh4BZmWWLI2JS+jimO2GaVdewSdSq+mlE/Dgi1qQvbyMpi2U2kLpxTPRJ4PrM64mSfiXpp5L2b/cmV0C1flGoAqqkU4E1wBXprGXAThHxtKS3AN+XtHdErG5+bzcroN58w5SXpw+eeluRj6p1DEOpenx11nFLJOlI4H3AX6dlsoiIlyLi6XT6LmAxsEc3Am0nu3OUpQoxjETd4q26jpJI0lTgJOADEfFCZv42ksak07uS3F7lkW4EmlcVdpAqxJBVtXj6zbDduTbVT2cBGwM3SgK4LR2JOwA4Q9IaYC1wTEQ035KlJxpdlDJ3mCrE0E6VY6u7YZOoTfXTb7ZZ9xrgmqJBdaKxc5TZ369CDK0cPPU2J08P1eLGx0M5eOptfO3tZ7z8+rhbBzOG4Sz41rSXpz/9Ld9Iupt82Y9ZQX2RRMfdeto6z4Maw1AarY9boe6rfXcOYI97FnAc5e4cZcVw/rmvBWDmCeudimux3lc4P72X+3DrW361b4n2uGfBOs+DFEMjgZqnh1ovz/o2MrVPoqwyE6lKMTScf+5rnSyjoLbduarsrGXG0eiSNRJluIRpXt+6oy9aoofe9MayQyg1huzxzcwTVrd83ZxAPibqntq2RNZacyvjVqf3+qIlstYtS3OrNNS61rnaJ9Ggd+WympOjMbCQTSYnUPfVPomyB/Zl7cxViGEo2WSy7qt9Etm6nCijr/YDC1X45a9CDFl77bXXeleS33zDlMpdXd4v3BKZFVTbJFo790DWzj1wnddlxVF2DMNxK9Rbte/OAex2/NiyQ6hEDA0HT71t3fND5z7gY6Ue6rQC6umSnsxUOp2WWTZL0iJJD0p6b68Cb6UKO3IVYmjmBOqtTiugApyXqXQ6D0DSXsB0YO/0PV9vFC7ptsWzV7F49ip2O34si2ev6sVX5I6j7BisXHlqLPxM0i45P+9Q4KqIeAl4VNIiYF/gFx1HmEMVduIqxGDlKDKwMDMtaD9XUqMPMwF4IrPOknTeerpVAbWx45bZjapCDFaeTpPoAmA3YBJJ1dNz0vlqsW7L6qYRMSci9omIfTqMYT1V2ImrEIMvOh1dHSVRRKyIiLUR8UfgYpIuGyQtz46ZVXcAlhYL0YrwoELvdVoBdbvMy8OBxsjddcB0SRtLmkhSAfWXxUIcWhV++asQg5Wn0wqoB0maRNJVewyYARAR90u6GniApN
D9sRGxtjehWyvuyo2+rlZATdf/IvDFIkHlUZVf/6rEYeWp7WU/rVRhiLkKMdjoUnpXlHKDGOb+RENd97Xf8icB+J/xLUfSR0UVYsiqak3wurn5hil35Rk9rsW1cydMbn/r19vnfRZIduS3Tfv8aIVUuRiybr4heR7q382G1/h3HE7tu3NV2GmrEEMr7/uX+WWHMBBq0Z0zK0n/dOd+eMqkskOwAZS3Ja99d86sbE4is4KcRGYFeWDBrD0PLJgV4YEFs1FSi+7c8uXThlps1hPjx8/rn+7cLZN95t2qy905s4KcRGYFOYnMCuq0Aup3MtVPH5M0P52/i6QXM8su7GXwZlWQZ2DhEuB84LLGjIj4cGNa0jnAc5n1F0dEV0/svONunyeyEozPV6iqUAVUSQI+BBw8gtBGbPz4eb38eLNCig5x7w+siIiHM/MmSvoVsBr4TET8vNUbJR0NHJ3nS67cfvuCYZqN3BFLu9QSDfc9wJWZ18uAnSLiaUlvAb4vae+IWK+CYETMAeaAr52zeus4iSRtCPwF8JbGvLSQ/Uvp9F2SFgN7AIXqbeeVPXZqnKBtNc8xlB/DaMTR7vu6/W9RZIj7XcCvI2JJY4akbRq3UpG0K0kF1EeKhTgyrf5RRvuKB8dQrRh6HUeeIe4rSW6NsqekJZKOShdNZ92uHMABwAJJ9wDfBY6JiGe6Fq1ZBXVaAZWI+HiLedcA1xQPy6w+fMWCWUF9mUTZ/m5ZV4A7hurE0Os4avGnECNRhasbHMNgxVCLP8rzyVYrwxFLl+b6o7xaJJFZSfrnL1uT619H5vI//WcAPvqLz3U7GMdQwxg6i2NmrrX6cmDBbDQ5icwKchKZFVSLY6Lx229Vynu7xTFUJwbIH8fyfH8J4ZbIrKhatETbjB/ZHbrPPfuznHDS5QBcfulnOeGk0b+TnWOoTgydxjGwLdEVl5zFuHGbvPx63LhNuOKSsxzDAMfQ6zjq0RJtu8WI39P8j9TJZxTlGKoTQy/jqMUVCyO9lfy3Lzljndcf+fhpIw+qIMdQnRg6jePmG6b0z2U/I00is27Im0R9d0xkNtry/Hn4jpJukbRQ0v2Sjk/nbynpRkkPp89j0/mS9FVJiyQtkDS51xthVqY8LdEa4MSIeB0wBThW0l7AycBNEbE7cFP6GuAQkgIlu5PUlbug61GbVciwSRQRyyLi7nT6eWAhMAE4FLg0Xe1S4LB0+lDgskjcBmwhabuuR25WESMa4k7LCb8ZuB0YFxHLIEk0Sdumq00Ansi8bUk6b1nTZ+WugHrzDVNGEqbZqMqdRJI2Jank8+mIWJ2U4W69aot5642+uQKq9Ytco3OSXkWSQFdExPfS2Ssa3bT0eWU6fwmwY+btOwA5L6Awq588o3MCvgksjIhzM4uuA45Mp48EfpCZ/7F0lG4K8Fyj22fWlyJiyAfwZyTdsQXA/PQxDdiKZFTu4fR5y3R9Af8GLAbuBfbJ8R3hhx8VfNw53L4bEfW4YsGsJL5iwWw0OInMCnISmRXkJDIrqCp/lPdb4Pfpc7/Ymv7Znn7aFsi/PTvn+bBKjM4BSLozz0hIXfTT9vTTtkD3t8fdObOCnERmBVUpieaUHUCX9dP29NO2QJe3pzLHRGZ1VaWWyKyWnERmBZWeRJKmSnowLWxy8vDvqB5Jj0m6V9J8SXem81oWcqkiSXMlrZR0X2ZebQvRtNme0yU9mf4fzZc0LbNsVro9D0p674i/MM+l3r16AGNI/mRiV2Aj4B5grzJj6nA7HgO2bpr3JeDkdPpk4Oyy4xwi/gOAycB9w8VP8mcw15P8ycsU4Pay48+5PacD/9hi3b3S/W5jYGK6P44ZyfeV3RLtCyyKiEci4n+Bq0gKnfSDdoVcKicifgY80zS7toVo2mxPO4cCV0XESxHxKLCIZL/MrewkalfUpG4C+LGku9ICLNBUyAXYtu27q6ld/HX+P5uZdkHnZrrXhben7CTKVdSkBvaLiMkkNfeOlX
RA2QH1UF3/zy4AdgMmkVSeOiedX3h7yk6ivihqEhFL0+eVwLUk3YF2hVzqoq8K0UTEiohYGxF/BC7mlS5b4e0pO4nuAHaXNFHSRsB0kkIntSFpE0mbNaaB9wD30b6QS130VSGapuO2w0n+jyDZnumSNpY0kaRy7y9H9OEVGEmZBjxEMipyatnxdBD/riSjO/cA9ze2gTaFXKr4AK4k6eL8H8kv81Ht4qeDQjQV2Z7L03gXpImzXWb9U9PteRA4ZKTf58t+zAoquztnVntOIrOCnERmBTmJzApyEpkV5CQyK8hJZFbQ/wPTMFRqoBLrRQAAAABJRU5ErkJggg==", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "s, _ = env.reset()\n", + "for _ in range(100):\n", + " s, _, _, _, _ = env.step(env.action_space.sample())\n", + "\n", + "plt.title('Game image')\n", + "plt.imshow(env.render())\n", + "plt.show()\n", + "\n", + "plt.title('Agent observation')\n", + "plt.imshow(s.reshape([42, 42]))\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### POMDP setting\n", + "\n", + "The Atari game we're working with is actually a POMDP: your agent needs to know timing at which enemies spawn and move, but cannot do so unless it has some memory. \n", + "\n", + "Let's design another agent that has a recurrent neural net memory to solve this. 
Here's a sketch.\n", + "\n", + "![img](img1.jpg)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class SimpleRecurrentAgent(nn.Module):\n", + " def __init__(self, obs_shape, n_actions, reuse=False):\n", + " \"\"\"A simple actor-critic agent\"\"\"\n", + " super(self.__class__, self).__init__()\n", + "\n", + " self.conv0 = nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2))\n", + " self.conv1 = nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2))\n", + " self.conv2 = nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2))\n", + " self.flatten = nn.Flatten()\n", + "\n", + " self.hid = nn.Linear(512, 128)\n", + " self.rnn = nn.LSTMCell(128, 128)\n", + "\n", + " self.logits = nn.Linear(128, n_actions)\n", + " self.state_value = nn.Linear(128, 1)\n", + "\n", + " def forward(self, prev_state, obs_t):\n", + " \"\"\"\n", + " Takes agent's previous hidden state and a new observation,\n", + " returns a new hidden state and whatever the agent needs to learn\n", + " \"\"\"\n", + "\n", + " # Apply the whole neural net for one step here.\n", + " # See docs on self.rnn(...).\n", + " # The recurrent cell should take the last feedforward dense layer as input.\n", + " \n", + "\n", + " new_state = \n", + " logits = \n", + " state_value = \n", + "\n", + " return new_state, (logits, state_value)\n", + "\n", + " def get_initial_state(self, batch_size):\n", + " \"\"\"Return a list of agent memory states at game start. 
Each state is a np array of shape [batch_size, ...]\"\"\"\n", + " return torch.zeros((batch_size, 128)), torch.zeros((batch_size, 128))\n", + "\n", + " def sample_actions(self, agent_outputs):\n", + " \"\"\"pick actions given numeric agent outputs (np arrays)\"\"\"\n", + " logits, state_values = agent_outputs\n", + " probs = F.softmax(logits, dim=-1)\n", + " return torch.multinomial(probs, 1)[:, 0].data.numpy()\n", + "\n", + " def step(self, prev_state, obs_t):\n", + " \"\"\" like forward, but obs_t is a numpy array \"\"\"\n", + " obs_t = torch.tensor(np.asarray(obs_t), dtype=torch.float32)\n", + " (h, c), (l, s) = self.forward(prev_state, obs_t)\n", + " return (h.detach(), c.detach()), (l.detach(), s.detach())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "n_parallel_games = 5\n", + "gamma = 0.99\n", + "\n", + "agent = SimpleRecurrentAgent(obs_shape, n_actions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "state = [env.reset()[0]]\n", + "_, (logits, value) = agent.step(agent.get_initial_state(1), state)\n", + "print(\"action logits:\\n\", logits)\n", + "print(\"state values:\\n\", value)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Let's play!\n", + "Let's build a function that measures agent's average reward." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def evaluate(agent, env, n_games=1):\n", + " \"\"\"Plays an entire game start to end, returns session rewards.\"\"\"\n", + "\n", + " game_rewards = []\n", + " for _ in range(n_games):\n", + " # initial observation and memory\n", + " observation, _ = env.reset()\n", + " prev_memories = agent.get_initial_state(1)\n", + "\n", + " total_reward = 0\n", + " while True:\n", + " new_memories, readouts = agent.step(\n", + " prev_memories, observation[None, ...])\n", + " action = agent.sample_actions(readouts)\n", + "\n", + " observation, reward, terminated, truncated, info = env.step(action[0])\n", + "\n", + " total_reward += reward\n", + " prev_memories = new_memories\n", + " if terminated or truncated:\n", + " break\n", + "\n", + " game_rewards.append(total_reward)\n", + " return game_rewards" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gymnasium.wrappers import RecordVideo\n", + "\n", + "with make_env() as record_env, RecordVideo(record_env, video_folder=\"videos\") as env_monitor:\n", + " rewards = evaluate(agent, env_monitor, n_games=3)\n", + "\n", + "print(rewards)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show video. This may not work in some setups. 
If it doesn't\n", + "# work for you, you can download the videos and view them locally.\n", + "\n", + "from pathlib import Path\n", + "from base64 import b64encode\n", + "from IPython.display import HTML\n", + "\n", + "video_paths = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", + "video_path = video_paths[-1] # You can also try other indices\n", + "\n", + "if 'google.colab' in sys.modules:\n", + " # https://stackoverflow.com/a/57378660/1214547\n", + " with video_path.open('rb') as fp:\n", + " mp4 = fp.read()\n", + " data_url = 'data:video/mp4;base64,' + b64encode(mp4).decode()\n", + "else:\n", + " data_url = str(video_path)\n", + "\n", + "HTML(\"\"\"\n", + "\n", + "\"\"\".format(data_url))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training on parallel games\n", + "\n", + "We introduce a class called EnvPool - it's a tool that handles multiple environments for you. Here's how it works:\n", + "![img](img2.jpg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from env_pool import EnvPool\n", + "pool = EnvPool(agent, make_env, n_parallel_games)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We are going to train our agent on a thing called __rollouts:__\n", + "![img](img3.jpg)\n", + "\n", + "A rollout is just a sequence of T consecutive observations, actions and rewards that the agent encountered.\n", + "* The first state __s0__ is not necessarily the initial state of the environment\n", + "* The final state is not necessarily terminal\n", + "* We sample several parallel rollouts for efficiency" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# for each of n_parallel_games, take 10 steps\n", + "rollout_obs, rollout_actions, rollout_rewards, rollout_mask = pool.interact(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], +
"source": [ + "print(\"Actions shape:\", rollout_actions.shape)\n", + "print(\"Rewards shape:\", rollout_rewards.shape)\n", + "print(\"Mask shape:\", rollout_mask.shape)\n", + "print(\"Observations shape: \", rollout_obs.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Actor-critic objective\n", + "\n", + "Here we define a loss function that uses the rollout above to train an advantage actor-critic agent.\n", + "\n", + "\n", + "Our loss consists of three components:\n", + "\n", + "* __The policy \"loss\"__\n", + " $$ \\hat J = {1 \\over T} \\cdot \\sum_t { \\log \\pi(a_t | s_t) } \\cdot A_{const}(s,a) $$\n", + " * This function has no meaning in and of itself, but it was built such that\n", + " * $ \\nabla \\hat J = {1 \\over T} \\cdot \\sum_t { \\nabla \\log \\pi(a_t | s_t) } \\cdot A(s,a) \\approx \\nabla E_{s, a \\sim \\pi} R(s,a) $\n", + " * Therefore if we __maximize__ J_hat with gradient ascent (equivalently, minimize $-\\hat J$ with gradient descent) we will maximize expected reward\n", + " \n", + " \n", + "* __The value \"loss\"__\n", + " $$ L_{td} = {1 \\over T} \\cdot \\sum_t { [r + \\gamma \\cdot V_{const}(s_{t+1}) - V(s_t)] ^ 2 }$$\n", + " * Ye Olde TD_loss from q-learning and the like\n", + " * If we minimize this loss, V(s) will converge to $V_\\pi(s) = E_{a \\sim \\pi(a | s)} R(s,a) $\n", + "\n", + "\n", + "* __Entropy Regularizer__\n", + " $$ H = - {1 \\over T} \\sum_t \\sum_a {\\pi(a|s_t) \\cdot \\log \\pi (a|s_t)}$$\n", + " * If we __maximize__ entropy we discourage the agent from assigning zero probability to actions\n", + " prematurely (a.k.a. 
exploration)\n", + " \n", + " \n", + "So we optimize a linear combination of $L_{td}$, $- \\hat J$, and $-H$\n", + " \n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "```\n", + "\n", + "\n", + "__One more thing:__ since we train on T-step rollouts, we can use the N-step formula for advantage for free:\n", + " * At the last step, $A(s_t,a_t) = r(s_t, a_t) + \\gamma \\cdot V(s_{t+1}) - V(s_t) $\n", + " * One step earlier, $A(s_t,a_t) = r(s_t, a_t) + \\gamma \\cdot r(s_{t+1}, a_{t+1}) + \\gamma ^ 2 \\cdot V(s_{t+2}) - V(s_t) $\n", + " * Et cetera, et cetera. This way the agent starts training much faster since its estimate of A(s,a) depends less on its (imperfect) value function and more on actual rewards. There's also a [nice generalization](https://arxiv.org/abs/1506.02438) of this.\n", + "\n", + "\n", + "__Note:__ it's also a good idea to scale rollout_len up to learn longer sequences. You may wish to set it to >=20 or to start at 10 and then scale up as time passes."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "opt = torch.optim.Adam(agent.parameters(), lr=1e-5)\n", + "\n", + "\n", + "def train_on_rollout(states, actions, rewards, is_not_done, prev_memory_states, gamma=0.99):\n", + " \"\"\"\n", + " Takes a sequence of states, actions and rewards produced by generate_session.\n", + " Updates agent's weights by following the policy gradient above.\n", + " Please use Adam optimizer with default parameters.\n", + " \"\"\"\n", + "\n", + " # shape: [batch_size, time, c, h, w]\n", + " states = torch.tensor(np.asarray(states), dtype=torch.float32)\n", + " actions = torch.tensor(np.array(actions), dtype=torch.int64) # shape: [batch_size, time]\n", + " rewards = torch.tensor(np.array(rewards), dtype=torch.float32) # shape: [batch_size, time]\n", + " is_not_done = torch.tensor(np.array(is_not_done), dtype=torch.float32) # shape: [batch_size, time]\n", + " rollout_length = rewards.shape[1] - 1\n", + "\n", + " # predict logits, probas and log-probas using an agent.\n", + " memory = [m.detach() for m in prev_memory_states]\n", + "\n", + " logits = [] # append logit sequence here\n", + " state_values = [] # append state values here\n", + " for t in range(rewards.shape[1]):\n", + " obs_t = states[:, t]\n", + "\n", + " # use agent to comute logits_t and state values_t.\n", + " # append them to logits and state_values array\n", + "\n", + " memory, (logits_t, values_t) = \n", + "\n", + " logits.append(logits_t)\n", + " state_values.append(values_t)\n", + "\n", + " logits = torch.stack(logits, dim=1)\n", + " state_values = torch.stack(state_values, dim=1)\n", + " probas = F.softmax(logits, dim=2)\n", + " logprobas = F.log_softmax(logits, dim=2)\n", + "\n", + " # select log-probabilities for chosen actions, log pi(a_i|s_i)\n", + " actions_one_hot = F.one_hot(actions, n_actions).view(\n", + " actions.shape[0], actions.shape[1], n_actions)\n", + " logprobas_for_actions = 
torch.sum(logprobas * actions_one_hot, dim=-1)\n", + "\n", + " # Now let's compute two loss components:\n", + " # 1) Policy gradient objective.\n", + " # Notes: Please don't forget to call .detach() on advantage term. Also please use mean, not sum.\n", + " # it's okay to use loops if you want\n", + " J_hat = 0 # policy objective as in the formula for J_hat\n", + "\n", + " # 2) Temporal difference MSE for state values\n", + " # Notes: Please don't forget to call .detach() on V(s') term. Also please use mean, not sum.\n", + " # it's okay to use loops if you want\n", + " value_loss = 0\n", + "\n", + " cumulative_returns = state_values[:, -1].detach()\n", + "\n", + " for t in reversed(range(rollout_length)):\n", + " r_t = rewards[:, t] # current rewards\n", + " # current state values\n", + " V_t = state_values[:, t]\n", + " V_next = state_values[:, t + 1].detach() # next state values\n", + " # log-probability of a_t in s_t\n", + " logpi_a_s_t = logprobas_for_actions[:, t]\n", + "\n", + " # update G_t = r_t + gamma * G_{t+1} as we did in week6 reinforce\n", + " cumulative_returns = r_t + gamma * cumulative_returns\n", + "\n", + " # Compute temporal difference error (MSE for V(s))\n", + " value_loss += \n", + "\n", + " # compute advantage A(s_t, a_t) using cumulative returns and V(s_t) as baseline\n", + " advantage = \n", + " advantage = advantage.detach()\n", + "\n", + " # compute policy pseudo-loss aka -J_hat.\n", + " J_hat += \n", + "\n", + " # regularize with entropy\n", + " entropy_reg = \n", + "\n", + " # add-up three loss components and average over time\n", + " loss = -J_hat / rollout_length +\\\n", + " value_loss / rollout_length +\\\n", + " -0.01 * entropy_reg\n", + "\n", + " # Gradient descent step\n", + " \n", + "\n", + " return loss.data.numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# let's test it\n", + "memory = list(pool.prev_memory_states)\n", + "rollout_obs, rollout_actions, 
rollout_rewards, rollout_mask = pool.interact(10)\n", + "\n", + "train_on_rollout(rollout_obs, rollout_actions,\n", + " rollout_rewards, rollout_mask, memory)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train \n", + "\n", + "Just run the training step and see whether the agent improves." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import clear_output\n", + "from tqdm import trange\n", + "from pandas import DataFrame\n", + "moving_average = lambda x, **kw: DataFrame(\n", + " {'x': np.asarray(x)}).x.ewm(**kw).mean().values\n", + "\n", + "rewards_history = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for i in trange(15000):\n", + "\n", + " memory = list(pool.prev_memory_states)\n", + " rollout_obs, rollout_actions, rollout_rewards, rollout_mask = pool.interact(\n", + " 10)\n", + " train_on_rollout(rollout_obs, rollout_actions,\n", + " rollout_rewards, rollout_mask, memory)\n", + "\n", + " if i % 100 == 0:\n", + " rewards_history.append(np.mean(evaluate(agent, env, n_games=1)))\n", + " clear_output(True)\n", + " plt.plot(rewards_history, label='rewards')\n", + " plt.plot(moving_average(np.array(rewards_history),\n", + " span=10), label='rewards ewma@10')\n", + " plt.legend()\n", + " plt.show()\n", + " if rewards_history[-1] >= 10000:\n", + " print(\"Your agent has just passed the minimum homework threshold\")\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Relax and grab some refreshments while your agent is locked in an infinite loop of violence and death.\n", + "\n", + "__How to interpret plots:__\n", + "\n", + "The session reward is the easy thing: it should in general go up over time, but it's okay if it fluctuates ~~like crazy~~. It's also OK if the reward doesn't increase substantially during the first 10k or so steps. 
However, if the reward drops to zero and doesn't recover over 2-3 evaluations, something is going wrong.\n", + "\n", + "\n", + "Since we use a policy-based method, we also keep track of __policy entropy__ - the same one you used as a regularizer. The only important thing about it is that your entropy shouldn't drop too low (`< 0.1`) before your agent gets the yellow belt. Or at least it can drop there, but _it shouldn't stay there for long_.\n", + "\n", + "If it does, the culprit is likely:\n", + "* Some bug in entropy computation. Remember that it is $ - \\sum p(a_i) \\cdot \\log p(a_i) $\n", + "* Your agent architecture converges too fast. Increase the entropy coefficient in the actor loss. \n", + "* Gradient explosion - just [clip gradients](https://stackoverflow.com/a/56069467) and maybe use a smaller network\n", + "* Us. Or PyTorch developers. Or aliens. Or lizardfolk. Contact us on forums before it's too late!\n", + "\n", + "If you're debugging, just run `_, (logits, values) = agent.step(prev_memory_states, batch_states)` and manually look into logits and values. This will reveal the problem 9 times out of 10: you'll likely see some NaNs or insanely large numbers or zeros. Try to catch the moment when this happens for the first time and investigate from there." + ] + }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/jheuristic/anaconda3/lib/python3.6/site-packages/scipy/misc/pilutil.py:482: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", - " if issubdtype(ts, int):\n", - "/home/jheuristic/anaconda3/lib/python3.6/site-packages/scipy/misc/pilutil.py:485: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", - " elif issubdtype(type(size), float):\n" - ] + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### \"Final\" evaluation" + ] }, { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAANEAAAEICAYAAADBfBG8AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFmVJREFUeJzt3XvUHHV9x/H3hyBoASHcEgi3wAGO4CVGxFTKRbyFVAXa\nqsFWUWkJlVA80FMIKFLUAirQKBUImnIRQSqi1BNQCnhpEeRiCJcIJIAQckMIBAVpE7/9Y2Zhstl9\nnnl2dp+Z2f28ztmzszOzu99J5ru/3/xmnu8oIjCzzm1QdgBmdeckMivISWRWkJPIrCAnkVlBTiKz\ngpxEfUjSTpJ+J2lM2bEMAidRAZKmS7pd0u8lrUynPyVJZcYVEY9HxKYRsbbMOAaFk6hDkk4EZgNf\nBsYD44BjgP2AjUoMzUZbRPgxwgewOfB74C+HWe/PgV8Bq4EngNMzy3YBAvhEumwVSRK+FVgAPAuc\n3/R5nwQWpuv+CNi5zfc2PnvD9PVPgC8AtwK/A/4T2Aq4Io3tDmCXzPtnpzGtBu4C9s8sew1waRrD\nQuCfgCWZ5dsD1wBPAY8C/1D2/1fP94eyA6jjA5gKrGnspEOsdxDwBpIW/43ACuCwdFljR78QeDXw\nHuAPwPeBbYEJwErgwHT9w4BFwOuADYHPALe2+d5WSbQI2C39AXgAeAh4V/pZlwH/nnn/36RJtiFw\nIrAceHW67Czgp8BYYIc04ZekyzZIk+40ktZ4V+AR4L1l/5/1dH8oO4A6PtKdbHnTvFvT1uNF4IA2\n7/tX4Lx0urGjT8gsfxr4cOb1NcCn0+nrgaMyyzYAXqBFa9QmiU7NLD8HuD7z+v3A/CG2dxXwpnR6\nnaQA/jaTRG8DHm9676xsgvbjw8dEnXka2FrSho0ZEfH2iNgiXbYBgKS3SbpF0lOSniPprm3d9Fkr\nMtMvtni9aTq9MzBb0rOSngWeAUTSYuWR93uQdKKkhZKeS79r80zc25N09Rqy0zsD2zdiTN97Csnx\nYt9yEnXmF8BLwKHDrPdt4Dpgx4jYnKTr1unI3RPAjIjYIvN4TUTc2uHntSRpf+Ak4EPA2PSH4Tle\niXsZSTeuYcemGB9tinGziJjWzRirxknUgYh4Fvhn4OuS/krSppI2kDQJ2CSz6mbAMxHxB0n7Ah8p\n8LUXArMk7Q0gaXNJHyzwee1sRnK89xSwoaTTgNdmll+dxjFW0gRgZmbZL4HVkk6S9BpJYyS9XtJb\nexBnZTiJOhQRXwJOIBmdWknSPbqI5Fe80Tp8CjhD0vMkB9tXF/i+a4GzgaskrQbuAw7peAPa+xHJ\n8ddDwG9IBjuyXbYzgCUkI2//BXyXpFUmkvNS7wcmpct/C3yDpDvYt5Qe/Jl1RNLfA9Mj4sCyYymL\nWyIbEUnbSdov7b7uSTIEfm3ZcZVpw+FXMVvHRiTd1okkQ/pXAV8vNaKS9aw7J2kqyZnvMcA3IuKs\nnnyRWcl6kkTp1cMPAe8mOQi9AzgiIh7o+peZlaxX3bl9gUUR8QiApKtIzqm0TCJJHt2wKvptRGwz\n3Eq9GliYwLrDoktoOrMu6WhJd0q6s0cxmBX1mzwr9aolanVWfp3WJiLmAHPALZHVW69aoiWseznI\nDsDSHn2XWal6lUR3ALtLmihpI2A6yTVkZn2nJ925iFgjaSbJJSRjgLkRcX8vvsusbJW47MfHRFZR\nd0XEPsOt5Mt+zAqqxWU/xx9/fNkh2ACaPXt2r
vXcEpkVVIuWaLTMmDEDgIsuuqjtsqzm9ZrXGely\nqye3RKlWSdJq2UUXXfTyzp+dn03ATpZbfTmJUm4VrFNOohyyCTZjxowhu3btllv/chKZFeSBhZyG\nGyRoXset0eBwS5RDnoRw0gyuWlz2MxonW0c6PJ1nHQ9x19vs2bNzXfbjJDJrI28SuTtnVpCTyKwg\nj85VyNhZY9ebt+rMVSVEYiPhlqgiGgm06sxVLz+y8626nERmBXWcRJJ2TG9gtVDS/ZKOT+efLulJ\nSfPTR1/fm8asyDHRGuDEiLhb0mbAXZJuTJedFxFfKR6eWfV1nEQRsYzkrmlExPOSFpL/1odmfaMr\nx0SSdgHeDNyezpopaYGkuZJaHhm7Auq6sgMJjUd2vlVX4SFuSZvyyl2uV0u6APg8ScXTz5PcqfqT\nze9zBdT1OWHqqVBLJOlVJAl0RUR8DyAiVkTE2oj4I3AxSXF7s75VZHROwDeBhRFxbmb+dpnVDie5\nt6hZ3yrSndsP+Chwr6T56bxTgCPSu2gH8BjgvxGwvlZkdO6/aX33h3mdh2NV5D/hGNrAXjt374NH\nrPP6DXteOaLl3fiMPN9RthkzZrSsMeFEeoUv+7EhOVmG5ySy3IYqbjnInESWm4tOtuYksiE5YYbn\nGgs2rEEdnctbY2FgR+csv0FJmk65O2dWkJPIrCAnkVlBA3NM1HyPoVZn4lstzz5nNc9rfNasWQ/3\nahO64swzdy87hL4zUC3RcAfIeQ6gszfpyvse628DlUTDnfNoXt5q/Tzr2GAZqCRqbkVaLW+ebl6/\n1fvdGg22gUqiZp3c1a75Pa2Ol2yw+IoFszZG7YoFSY8BzwNrgTURsY+kLYHvALuQ/HXrhyLCVTis\nL3WrO/eOiJiUydqTgZsiYnfgpvS1WV/q1XmiQ4GD0ulLgZ8AJ/Xou0ZkJOeDWs1v9Z6sQ37+89HZ\nkA5dv//+ZYfQd7qRRAH8OD2uuSitJzcurZBKRCyTtG0Xvqdrit4m0iyrG925/SJiMnAIcKykA/K8\nqcwKqCM9X9TpOjYYCidRRCxNn1cC15IUa1zRqD+XPq9s8b45EbFPntGPbhvplQvtXvv8kEHxCqib\npHeEQNImwHtIijVeBxyZrnYk8IMi39Ntrc71DLXcbCiFzhNJ2pWk9YHk+OrbEfFFSVsBVwM7AY8D\nH4yIZ4b4HJ8nssoZlfNEEfEI8KYW858G3lnks83qohZXLJiVpH9qLEz+wuSyQ7ABdPdn7s61Xi2S\naNsdKnWayWwdtUiiDa4e6IvNreJqkUTzd5g//EpmJalFEo3faXzZIdgAWsrSXOu5n2RWUC1aIg8s\nWJX5PJFZe7nOE7k7Z1aQk8isoFocE90w2Vcs2Oibene+KxbcEpkV5CQyK8hJZFZQLY6JJs3zFQtW\ngpy7nVsis4I6bokk7UlS5bRhV+A0YAvg74Cn0vmnRMS8jiMEPvLx04ZdZ9aJxwFw5jlfK/JVhTiG\nfosh327bcRJFxIPAJABJY4AnSeotfAI4LyK+0ulnd2LtSWuTiRKvEHIMgxlDt46J3gksjojfSOrS\nR47MmLPHJBPnlPL1jmGAY+hWEk0Hrsy8ninpY8CdwImjUcx+0H79HEN1Yig8sCBpI+ADwH+ksy4A\ndiPp6i2jzW9Btyugjjl7zCu/PiVxDIMZQzdaokOAuyNiBUDjGUDSxcAPW70prdk9J12v8FXcg/br\n5xiqE0M3kugIMl05Sds1itkDh5NURO25QeuHO4bqxFAoiST9CfBuIFtz90uSJpHcLeKxpmU9M2i/\nfo6hOjEUrYD6ArBV07yPFoqoQ4P26+cYqhNDLS77yWPQfv0cQ3Vi6JskGrRfP8dQnRj6JokG7dfP\nMVQnhr5JokH79XMM1Ymhb5Jo0H79HEN1YuibJBq0Xz/HUJ0Y+iaJBu3XzzFUJ4ZaFG9cvnzaaIVi\n9rLx4+e5e
KPZaKhFd+6Wyb61ilWXWyKzgpxEZgU5icwKqsUx0TvunlR2CDaIxvtOeWajohYtUZ66\nc2bdl6/unFsis4JyJZGkuZJWSrovM29LSTdKejh9HpvOl6SvSlokaYEk31zI+lrelugSYGrTvJOB\nmyJid+Cm9DUk1X92Tx9Hk5TQMutbuZIoIn4GPNM0+1Dg0nT6UuCwzPzLInEbsIWk7boRrFkVFTkm\nGtcojZU+N66XnQA8kVlvSTpvHd0u3mhWll6MzrUqxr3eVdrdLt5oVpYiLdGKRjctfV6Zzl8C7JhZ\nbwcg31krsxoqkkTXAUem00cCP8jM/1g6SjcFeC5TEdWs7+Tqzkm6EjgI2FrSEuBzwFnA1ZKOAh4H\nPpiuPg+YBiwCXiC5X5FZ38qVRBFxRJtF72yxbgDHFgnKrE58xYJZQU4is4KcRGYFOYnMCnISmRXk\nJDIryElkVpCTyKwgJ5FZQU4is4KcRGYFOYnMCnISmRXkJDIryElkVpCTyKwgJ5FZQcMmUZvqp1+W\n9Ou0wum1krZI5+8i6UVJ89PHhb0M3qwK8rREl7B+9dMbgddHxBuBh4BZmWWLI2JS+jimO2GaVdew\nSdSq+mlE/Dgi1qQvbyMpi2U2kLpxTPRJ4PrM64mSfiXpp5L2b/cmV0C1flGoAqqkU4E1wBXprGXA\nThHxtKS3AN+XtHdErG5+bzcroN58w5SXpw+eeluRj6p1DEOpenx11nFLJOlI4H3AX6dlsoiIlyLi\n6XT6LmAxsEc3Am0nu3OUpQoxjETd4q26jpJI0lTgJOADEfFCZv42ksak07uS3F7lkW4EmlcVdpAq\nxJBVtXj6zbDduTbVT2cBGwM3SgK4LR2JOwA4Q9IaYC1wTEQ035KlJxpdlDJ3mCrE0E6VY6u7YZOo\nTfXTb7ZZ9xrgmqJBdaKxc5TZ369CDK0cPPU2J08P1eLGx0M5eOptfO3tZ7z8+rhbBzOG4Sz41rSX\npz/9Ld9Iupt82Y9ZQX2RRMfdeto6z4Maw1AarY9boe6rfXcOYI97FnAc5e4cZcVw/rmvBWDmCeud\nimux3lc4P72X+3DrW361b4n2uGfBOs+DFEMjgZqnh1ovz/o2MrVPoqwyE6lKMTScf+5rnSyjoLbd\nuarsrGXG0eiSNRJluIRpXt+6oy9aoofe9MayQyg1huzxzcwTVrd83ZxAPibqntq2RNZacyvjVqf3\n+qIlstYtS3OrNNS61rnaJ9Ggd+WympOjMbCQTSYnUPfVPomyB/Zl7cxViGEo2WSy7qt9Etm6nCij\nr/YDC1X45a9CDFl77bXXeleS33zDlMpdXd4v3BKZFVTbJFo790DWzj1wnddlxVF2DMNxK9Rbte/O\nAex2/NiyQ6hEDA0HT71t3fND5z7gY6Ue6rQC6umSnsxUOp2WWTZL0iJJD0p6b68Cb6UKO3IVYmjm\nBOqtTiugApyXqXQ6D0DSXsB0YO/0PV9vFC7ptsWzV7F49ip2O34si2ev6sVX5I6j7BisXHlqLPxM\n0i45P+9Q4KqIeAl4VNIiYF/gFx1HmEMVduIqxGDlKDKwMDMtaD9XUqMPMwF4IrPOknTeerpVAbWx\n45bZjapCDFaeTpPoAmA3YBJJ1dNz0vlqsW7L6qYRMSci9omIfTqMYT1V2ImrEIMvOh1dHSVRRKyI\niLUR8UfgYpIuGyQtz46ZVXcAlhYL0YrwoELvdVoBdbvMy8OBxsjddcB0SRtLmkhSAfWXxUIcWhV+\n+asQg5Wn0wqoB0maRNJVewyYARAR90u6GniApND9sRGxtjehWyvuyo2+rlZATdf/IvDFIkHlUZVf\n/6rEYeWp7WU/rVRhiLkKMdjoUnpXlHKDGOb+RENd97Xf8icB+J/xLUfSR0UVYsiqak3wurn5hil3\n5Rk9rsW1cydMbn/r19vnfRZIduS3Tfv8aIVUuRiybr4heR7q382G1/h3HE7
tu3NV2GmrEEMr7/uX\n+WWHMBBq0Z0zK0n/dOd+eMqkskOwAZS3Ja99d86sbE4is4KcRGYFeWDBrD0PLJgV4YEFs1FSi+7c\n8uXThlps1hPjx8/rn+7cLZN95t2qy905s4KcRGYFOYnMCuq0Aup3MtVPH5M0P52/i6QXM8su7GXw\nZlWQZ2DhEuB84LLGjIj4cGNa0jnAc5n1F0dEV0/svONunyeyEozPV6iqUAVUSQI+BBw8gtBGbPz4\neb38eLNCig5x7w+siIiHM/MmSvoVsBr4TET8vNUbJR0NHJ3nS67cfvuCYZqN3BFLu9QSDfc9wJWZ\n18uAnSLiaUlvAb4vae+IWK+CYETMAeaAr52zeus4iSRtCPwF8JbGvLSQ/Uvp9F2SFgN7AIXqbeeV\nPXZqnKBtNc8xlB/DaMTR7vu6/W9RZIj7XcCvI2JJY4akbRq3UpG0K0kF1EeKhTgyrf5RRvuKB8dQ\nrRh6HUeeIe4rSW6NsqekJZKOShdNZ92uHMABwAJJ9wDfBY6JiGe6Fq1ZBXVaAZWI+HiLedcA1xQP\ny6w+fMWCWUF9mUTZ/m5ZV4A7hurE0Os4avGnECNRhasbHMNgxVCLP8rzyVYrwxFLl+b6o7xaJJFZ\nSfrnL1uT619H5vI//WcAPvqLz3U7GMdQwxg6i2NmrrX6cmDBbDQ5icwKchKZFVSLY6Lx229Vynu7\nxTFUJwbIH8fyfH8J4ZbIrKhatETbjB/ZHbrPPfuznHDS5QBcfulnOeGk0b+TnWOoTgydxjGwLdEV\nl5zFuHGbvPx63LhNuOKSsxzDAMfQ6zjq0RJtu8WI39P8j9TJZxTlGKoTQy/jqMUVCyO9lfy3Lzlj\nndcf+fhpIw+qIMdQnRg6jePmG6b0z2U/I00is27Im0R9d0xkNtry/Hn4jpJukbRQ0v2Sjk/nbynp\nRkkPp89j0/mS9FVJiyQtkDS51xthVqY8LdEa4MSIeB0wBThW0l7AycBNEbE7cFP6GuAQkgIlu5PU\nlbug61GbVciwSRQRyyLi7nT6eWAhMAE4FLg0Xe1S4LB0+lDgskjcBmwhabuuR25WESMa4k7LCb8Z\nuB0YFxHLIEk0Sdumq00Ansi8bUk6b1nTZ+WugHrzDVNGEqbZqMqdRJI2Jank8+mIWJ2U4W69aot5\n642+uQKq9Ytco3OSXkWSQFdExPfS2Ssa3bT0eWU6fwmwY+btOwA5L6Awq588o3MCvgksjIhzM4uu\nA45Mp48EfpCZ/7F0lG4K8Fyj22fWlyJiyAfwZyTdsQXA/PQxDdiKZFTu4fR5y3R9Af8GLAbuBfbJ\n8R3hhx8VfNw53L4bEfW4YsGsJL5iwWw0OInMCnISmRXkJDIrqCp/lPdb4Pfpc7/Ymv7Znn7aFsi/\nPTvn+bBKjM4BSLozz0hIXfTT9vTTtkD3t8fdObOCnERmBVUpieaUHUCX9dP29NO2QJe3pzLHRGZ1\nVaWWyKyWnERmBZWeRJKmSnowLWxy8vDvqB5Jj0m6V9J8SXem81oWcqkiSXMlrZR0X2ZebQvRtNme\n0yU9mf4fzZc0LbNsVro9D0p674i/MM+l3r16AGNI/mRiV2Aj4B5grzJj6nA7HgO2bpr3JeDkdPpk\n4Oyy4xwi/gOAycB9w8VP8mcw15P8ycsU4Pay48+5PacD/9hi3b3S/W5jYGK6P44ZyfeV3RLtCyyK\niEci4n+Bq0gKnfSDdoVcKicifgY80zS7toVo2mxPO4cCV0XESxHxKLCIZL/MrewkalfUpG4C+LGk\nu9ICLNBUyAXYtu27q6ld/HX+P5uZdkHnZrrXhben7CTKVdSkBvaLiMkkNfeOlXRA2QH1UF3/zy4A\ndgMmkVSeOiedX3h7yk6ivihqEhFL0+eVwLUk3YF2hVzqoq8K0UTEiohYGxF/BC7mlS5b4e0pO4nu\nAHaXNFHSRsB0kkIntSFpE0mbNaaB9wD
30b6QS130VSGapuO2w0n+jyDZnumSNpY0kaRy7y9H9OEV\nGEmZBjxEMipyatnxdBD/riSjO/cA9ze2gTaFXKr4AK4k6eL8H8kv81Ht4qeDQjQV2Z7L03gXpImz\nXWb9U9PteRA4ZKTf58t+zAoquztnVntOIrOCnERmBTmJzApyEpkV5CQyK8hJZFbQ/wPTMFRqoBLr\nRQAAAABJRU5ErkJggg==\n", - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gymnasium.wrappers import RecordVideo\n", + "\n", + "with make_env() as record_env, RecordVideo(record_env, video_folder=\"videos\") as env_monitor:\n", + " final_rewards = evaluate(agent, env_monitor, n_games=20)\n", + "\n", + "print(\"Final mean reward\", np.mean(final_rewards))" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAP8AAAEICAYAAACQ6CLfAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFz5JREFUeJzt3X20HHV9x/H35z5xQxIeEkIMJAUfooItpi1G6sORolhE\nFGzViihpy7HtsfRYH9qqfcJWrZ6K2HP06EFFUquAjzVVasmJIIVaHsSIQagBBBMTEhACuXm6T9/+\nMXPL3jtzc/fe3Z3dze/zOmfP3f3N7M539u53Z+a3M7+vIgIzS09PuwMws/Zw8pslyslvlignv1mi\nnPxmiXLymyXKyZ8wSSdKCkl97Y5lNiRdIOm6dsfR7Zz8TSTpBkmPSTqswmWGpGdUtbyqlX1BRcQX\nIuLl7YzrUODkbxJJJwIvBgJ4dVuD6SDK+HPWgfxPaZ4Lgf8BrgTW1E6QtFjSv0t6QtJtkt4v6aaa\n6c+WtF7So5L+V9Lra6ZdKekTkr4labekWyQ9PZ92Yz7bDyUNSfrdqUFJ6pH015IelLRT0r9IOnLK\nbH8gaZuk7ZLeWfPc1ZJuz+PeIemjNdNOk/TfknZJ+qGk02um3SDpA5JuBvYC75V0+5S43i5pXX7/\nlZJ+kC9ni6RLamadWMdd+Tr+hqTfm/L+vSB/Xx/P/75gSiz/IOnm/P27TtIxU9+nJEWEb024AfcC\nbwV+HRgBltZMuzq/HQ6cDGwBbsqnzc8f/z7QB/wa8AjwnHz6lcCjwOp8+heAq2teO4BnHCSuP8hj\nexqwAPga8Pl82on586/K4/gV4GHgZfn07wFvzu8vAE7L7x8P/AI4m2wDcmb+eEk+/QbgZ8Bz8piP\nBHYDK2viug14Q37/9HzZPcApwA7gvCkx9tU89/dq3r9FwGPAm/NlnZ8/XlwTy33AM4F5+eMPtfvz\n0gk3b/mbQNKLgBOAL0XE98k+bG/Mp/UCvwP8XUTsjYgfA2trnn4O8EBEfC4iRiPiDuCrwGtr5vla\nRNwaEaNkyb9qFuFdAHw0Iu6PiCHgPcAbpnTyvS8i9kTEj4DPkSUQZF9iz5B0TEQMRcT/5O1vAq6N\niGsjYjwi1gO3k30ZTLgyIu7K1+lx4BsTrytpJfBsYB1ARNwQET/KX+tOsi+jl9S5fq8ENkfE5/Nl\nXQXcA7yqZp7PRcRPImIf8CVm9/4dspz8zbEGuC4iHskff5End/2XkG2RttTMX3v/BOD5+e7zLkm7\nyBL2KTXzPFRzfy/ZVrhex
wEP1jx+MI9n6TTxPJg/B+Aisi3mPfnu9Dk1Mb9uSswvApZN85qQvScT\nXypvBP4tIvYCSHq+pOslPSzpceCPgXp3zaeu38Q6HF/zuJH375DVVT/xdCJJ84DXA72SJj5khwFH\nSXousAkYBZYDP8mnr6h5iS3AdyPizBaFuI0sWSf8Uh7PjjymiXjuqZm+DSAiNgPn5x12vw18RdLi\nPObPR8RbDrLcqZeLXgccI2kV2ZfA22umfRH4OPCKiNgv6WM8mfwzXXY6df0m1uHbMzwved7yN+48\nYIzsWH5VfjsJ+C/gwogYIzvOvkTS4ZKeTdY5OOGbwDMlvVlSf357nqST6lz+DrLj+elcBbxd0lMl\nLQA+CFyTH0JM+Js8tueQ9T1cAyDpTZKWRMQ4sCufdwz4V+BVkn5LUq+kQUmnS1rONPLlfQX4J7Lj\n9PU1kxcCj+aJv5r8kCn3MDB+kHW8luz9e6OkvrzT82Sy99UOwsnfuDVkx5Q/i4iHJm5kW7IL8mPr\ni8k6vR4CPk+WkAcAImI38HLgDWRbsYeAD5PtPdTjEmBtvvv9+pLpV+TLvBH4KbAf+NMp83yXrFNw\nA/CRiJg4geYs4C5JQ8A/k3XQ7Y+ILcC5wHvJknML8OfM/Hn6IvAy4MtTvnzeCvy9pN3A35IdlwOQ\nHxp8ALg5X8fTal8wIn5B1m/yTrJOx78Azqk5BLNpKO8RtQpJ+jDwlIhYM+PMZi3iLX8F8t/xT8nO\nd9Fqso60r7c7LkubO/yqsZBsV/84YCdwKdlPX2Zt491+s0R5t98sUQ3t9ks6i6wXuBf4TER86GDz\n9w/Mj8HBoxtZpJkdxP79jzEyvEf1zDvn5M9PW/0E2XndW4HbJK3LT18tNTh4NKeuvniuizSzGdx+\n68frnreR3f7VwL35OePDZBeunNvA65lZhRpJ/uOZfP72ViafTw2ApD/MLwu9fWRkTwOLM7NmaiT5\ny44rCj8dRMTlEXFqRJza3z+/gcWZWTM10uG3lckXqCwnvyBkOhraR//NmxpYpJkdjA7sq3veRrb8\ntwEr8wtGBsjOTV/XwOuZWYXmvOWPiFFJFwP/SfZT3xURcVfTIjOzlmrod/6IuJbskkoz6zI+w88s\nUZVe2BNHzGP/i0+pcpFmSYn/uqHueb3lN0uUk98sUU5+s0Q5+c0S5eQ3S1Slvf3Di4It549Oaovx\n4iUCkkcXMoio/7Mxm3k7SbPjHr6r/ud6y2+WKCe/WaKc/GaJcvKbJaracftDxNjkDo4YKX7/lHUC\ntpsGxgttU9cFgLK2duor7wBST7E9hjtsW1ASI/3F/wNADPcW2zqsv6/0MzRa/Lw09PmfxXM77L9t\nZlVx8pslyslvlignv1miGq3Y8wCwGxgDRiPi1IPOPyL6fj657LzGGomgQmVfk2UdSh3WyTStsvUp\n70vrLNNtrro19ibHrZH6O/ya0dv/mxHxSBNex8wq5N1+s0Q1mvwBXCfp+5L+sGyG2oo9Y3tcsces\nUzS62//CiNgm6VhgvaR7IuLG2hki4nLgcoDB5Su65YjY7JDX6NDd2/K/OyV9nax4543Tzg/ElBOx\nVNbh0YFfEVPjhmk6Kzss9rK4gdJ9vtL/RTuV9F2NT7M+PZ0We4my2MtOYqzqMzTn3X5J8yUtnLgP\nvBxwLS6zLtHIln8p8HVJE6/zxYj4dlOiMrOWa6Rc1/3Ac5sYi5lVyD/1mSWq0kt6RUknWYd1kE2n\nGzr3ykx7BmUXdJCVvb893XJGaInS2Nv4GfKW3yxRTn6zRDn5zRLl5DdLlJPfLFGV9vYHEFO+bnx6\nb2tNe3pvyamzGi22tdUhdnpv6WeoG0/vNbPu5uQ3S5ST3yxRTn6zRFVbsacvGF1cR69Sh3WaAaWd\nTx0Z51SzKf7SaevTaPGjblifJscY/S7RbWYzcPKbJcrJb5YoJ79Zombs8JN0BXAOsDMifjl
vWwRc\nA5wIPAC8PiIem3Fp40J7p47g2Wm9MmZdrMkluq8EzprS9m5gQ0SsBDbkj82si8yY/Pk4/I9OaT4X\nWJvfXwuc1+S4zKzF5nrMvzQitgPkf4+dbsZJFXuGXLHHrFO0vMMvIi6PiFMj4tTeBfNbvTgzq9Nc\nz/DbIWlZRGyXtAzYWdezAnpGpjY2ehqXmf2/WfSfz3XLvw5Yk99fA3xjjq9jZm0yY/JLugr4HvAs\nSVslXQR8CDhT0mbgzPyxmXWRGXf7I+L8aSa9tMmxmFmFfIafWaIqv6R3bHGhx8/MmqXPl/Sa2Qyc\n/GaJcvKbJcrJb5aoakt0D4uBrQNVLtIsKRpu7iW9ZnYIcvKbJcrJb5YoJ79Zopz8Zoly8pslyslv\nlignv1minPxmiapnJJ8rJO2UtKmm7RJJP5e0Mb+d3dowzazZ5lq0A+CyiFiV365tblhm1mpzLdph\nZl2ukWP+iyXdmR8WHN20iMysEnNN/k8CTwdWAduBS6ebcVLFnj2u2GPWKeaU/BGxIyLGImIc+DSw\n+iDzPlmxZ74r9ph1ijklf16lZ8JrgE3TzWtmnWnGwTzyoh2nA8dI2gr8HXC6pFVkxYEeAP6ohTGa\nWQvMtWjHZ1sQi5lVyGf4mSXKyW+WKCe/WaKc/GaJcvKbJcrJb5YoJ79Zopz8Zoly8pslyslvlign\nv1minPxmiXLymyXKyW+WKCe/WaKc/GaJcvKbJaqeij0rJF0v6W5Jd0l6W96+SNJ6SZvzvx6+26yL\n1LPlHwXeGREnAacBfyLpZODdwIaIWAlsyB+bWZeop2LP9oi4I7+/G7gbOB44F1ibz7YWOK9VQZpZ\n883qmF/SicCvArcASyNiO2RfEMCx0zzHRTvMOlDdyS9pAfBV4M8i4ol6n+eiHWadqa7kl9RPlvhf\niIiv5c07Jop35H93tiZEM2uFenr7RTZO/90R8dGaSeuANfn9NcA3mh+embXKjEU7gBcCbwZ+JGlj\n3vZe4EPAlyRdBPwMeF1rQjSzVqinYs9NgKaZ/NLmhmNmVfEZfmaJcvKbJaqeY/6m0Tj07Z18BDE2\nLwrzxXQHGW3Ut68Y1PhAcb7x3uL6dIve4fre+LGB7l1He5K3/GaJcvKbJcrJb5YoJ79Zoirt8Ot/\naA/L//G/J7Vte9cLCvMNH9neDqWBJ4odX8ddekuhbdebVhfbVrYkpKZTyVu8Yv1Qoa1vZ/Eyjvsv\nPK7Q1s0dnanylt8sUU5+s0Q5+c0S5eQ3S5ST3yxR1Z7ee9hh9J749Mlt41VGUJ+ekWJb39Ilhbbx\n3gqCaRGNF3/RGD7qsEJb7xNl5zCXvGAXvxep8pbfLFFOfrNEOfnNEtVIxZ5LJP1c0sb8dnbrwzWz\nZqmnw2+iYs8dkhYC35e0Pp92WUR8pN6F7V/Sx0/eMnl4/94DJaeFtvlM0QOLigFsfttTC20aK3ly\nGzswy8ZGmHZwhP3FeR84r2TeviMKTQMPdd7/zGavnjH8tgMTxTl2S5qo2GNmXayRij0AF0u6U9IV\n0xXqdMUes87USMWeTwJPB1aR7RlcWvY8V+wx60xzrtgTETsiYiwixoFPA8XrW82sY814zD9dxR5J\nyyYKdQKvATbN9Fo9IzBv5+ROpZGFnTeAp8aKAQw+XJxvZGGxrZ3XtZ+y+r5C29ED+0rn/e59xYEH\nLlv95ULbkt7i9fwXrntroa1vdweOumoH1UjFnvMlrSLr530A+KOWRGhmLdFIxZ5rmx+OmVXFZ/iZ\nJcrJb5aoSi/pBegZnfy47NLSaPNgkBqts63sDL82Xtq6cePTCm1/dWZ55fS3/MZ3C233DC8rtG3a\nt7zQ1lNnZR/rbN7ymyXKyW+WKCe/WaKc/GaJqrbDTzDeP7mp3Wfz1Wtq3NB5sfc/Xvwu/8Tml5TO\ne8ep1xTaHhrbW2j74I/PKrT1HJhDcNZxvOU3S5ST3yx
RTn6zRDn5zRLl5DdLVKW9/dELw0dOOXW3\nAyv2jA0WTy8em1cyYxcMWrnnzkWl7c/ce2GhbXS4+HHof7BYxccODd7ymyXKyW+WKCe/WaLqqdgz\nKOlWST/MK/a8L29/qqRbJG2WdI2kknKuZtap6unwOwCcERFD+Si+N0n6D+AdZBV7rpb0KeAisuG8\np6XBMfpPmjwg5MjdxYow7e4EHDmi2JO38ITHC2177j2y0Na7r7PO+e0ZLY/nWct2FNru2XFsoS1w\nh9+hasYtf2SG8of9+S2AM4Cv5O1rgfNaEqGZtUS94/b35iP37gTWA/cBuyJiYnybrUxTwqu2Ys/o\nE8ULR8ysPepK/rw4xypgOVlxjpPKZpvmuf9fsafviMPnHqmZNdWsevsjYhdwA3AacJSkiT6D5cC2\n5oZmZq1UT8WeJcBIROySNA94GfBh4HrgtcDVwBqgfKTIGhFibKzzf10s67QbGy+Ju9Mu6C8x3l9+\nGuJLj7mn0HbvL44ptI00PSLrFPX09i8D1krqJdtT+FJEfFPSj4GrJb0f+AFZSS8z6xL1VOy5k6ws\n99T2+3FxTrOu1fn74GbWEk5+s0RVeknvQN8oJyx+dFLb/b0LCvOpAy/zXXrE7kLbTw8rxt67v7O+\nT0fnl3f4/crglkLbnkeKP8X6nO1DV2d9Us2sMk5+s0Q5+c0S5eQ3S1SlHX6j4z3sHJrSSdaBnXtj\nC4tB9ajYcdYz0gVn+C0oqS0O3HWgeB2W9rWxvrhVzlt+s0Q5+c0S5eQ3S5ST3yxR1RbtGOpj+ObF\nk9oGy/uj2mrgiWLH17atKwpthw9XEU1j5u0sP0fvW5eeVmg74szitqCsNLl1Lo3VP6+3/GaJcvKb\nJcrJb5YoJ79Zohqp2HOlpJ9K2pjfVrU+XDNrlkYq9gD8eUR85SDPnaRnBOZv74K61ocQjZe/30PP\nOrrQNvho8bTm6On8U5jtST2z+PWsnjH8Aiir2GNmXWxOFXsi4pZ80gck3SnpMkmlRd0mVezZv6dJ\nYZtZo+ZUsUfSLwPvAZ4NPA9YBPzlNM99smLP4PwmhW1mjZprxZ6zImJ7XsTzAPA5PIy3WVeZc8Ue\nScsiYrskkVXo3TTTa4VgzCNCVqy8w2500NfuH4pmU0SqkYo938m/GARsBP54DrGaWZs0UrHnjJZE\nZGaV8Bl+Zoly8pslqtLr+cf7Ye9SnzFm1iqzGX/BW36zRDn5zRLl5DdLlJPfLFGVdvj1jMDhO3xB\noFmr9IzMYt7WhWFmnczJb5YoJ79Zopz8ZomqtMNPMbuKImY2OyWV5KflLb9Zopz8Zoly8pslyslv\nlqi6kz8fvvsHkr6ZP36qpFskbZZ0jSSPzmfWRWbT2/824G7giPzxh4HLIuJqSZ8CLgI+ebAX6BkO\nFm6ZUtS+pCLMeG/5Nf8Du4YLbRorVplphfGB4oCXIwuLF0+X9bb2DRXPuewZruZnj+gt/34fPqr4\nXd0zVgxeo8W2vqHi/6Eqw0eXlodAJbFHX3HdB3YdKD55mqpGzTY2v/h5GRsoxjhdlaR6Pv+9++vP\nh3qLdiwHXgl8Jn8s4AxgolTXWrIRfM2sS9S72/8x4C+Aia+VxcCuiJioDLYVOL7sibUVe0ZGXLHH\nrFPUU6X3HGBnRHy/trlk1tJ9p9qKPf39rthj1inqOeZ/IfBqSWcDg2TH/B8DjpLUl2/9lwPbWhem\nmTVbPeP2v4esLh+STgfeFREXSPoy8FrgamAN8I2ZXmt0vtixenKHzciC4g7D+EB5B8xxNxU7e/qf\nqKbjbOj4YgfZL1aVxFnStPjO4tu8YGs1nWYjC8sr82x7cXGnr/dAcYdu4PFi27G3Nx7XXP38JeUj\nVPbtK8ZZ9tlasaG+Ts1WePTk4ud3aEVJR2VveTxP+d5goe2wxybX5J5NSfVGfuf/S+Adku4l6wP4\nbAOvZWYVm9WFPRF
xA1mhTiLiflyc06xr+Qw/s0Q5+c0SpYjqBtSU9DDwYP7wGOCRyhbeWofSuoDX\np9MdbH1OiIgl9bxIpck/acHS7RFxalsW3mSH0rqA16fTNWt9vNtvlignv1mi2pn8l7dx2c12KK0L\neH06XVPWp23H/GbWXt7tN0uUk98sUZUnv6SzJP2vpHslvbvq5TdK0hWSdkraVNO2SNL6fEiz9ZKO\nbmeMsyFphaTrJd0t6S5Jb8vbu26dJA1KulXSD/N1eV/e3tVDzrVqCL1Kk19SL/AJ4BXAycD5kk6u\nMoYmuBI4a0rbu4ENEbES2JA/7hajwDsj4iTgNOBP8v9JN67TAeCMiHgusAo4S9JpPDnk3ErgMbIh\n57rJxBB6E5qyPlVv+VcD90bE/RExTHY58LkVx9CQiLgReHRK87lkQ5lBlw1pFhHbI+KO/P5usg/Z\n8XThOkVmKH/Yn9+CLh5yrpVD6FWd/McDW2oeTzv8V5dZGhHbIUsm4Ng2xzMnkk4EfhW4hS5dp3wX\neSOwE1gP3EedQ851qDkPoTeTqpO/7uG/rFqSFgBfBf4sIp5odzxzFRFjEbGKbHSp1cBJZbNVG9Xc\nNDqE3kwqLdRJ9i21oubxoTL81w5JyyJiu6RlZFudriGpnyzxvxARX8ubu3qdImKXpBvI+jG6dci5\nlg6hV/WW/zZgZd5bOQC8AVhXcQytsI5sKDOoc0izTpEfQ34WuDsiPlozqevWSdISSUfl9+cBLyPr\nw7iebMg56JJ1gWwIvYhYHhEnkuXKdyLiApq1PhFR6Q04G/gJ2bHYX1W9/CbEfxWwHRgh25O5iOw4\nbAOwOf+7qN1xzmJ9XkS223gnsDG/nd2N6wScAvwgX5dNwN/m7U8DbgXuBb4MHNbuWOewbqcD32zm\n+vj0XrNE+Qw/s0Q5+c0S5eQ3S5ST3yxRTn6zRDn5zRLl5DdL1P8B8FPBd33wU/8AAAAASUVORK5C\nYII=\n", - "text/plain": [ - "" + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show video. This may not work in some setups. 
If it doesn't\n", + "# work for you, you can download the videos and view them locally.\n", + "\n", + "from pathlib import Path\n", + "from base64 import b64encode\n", + "from IPython.display import HTML\n", + "\n", + "video_paths = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", + "video_path = video_paths[-1] # You can also try other indices\n", + "\n", + "if 'google.colab' in sys.modules:\n", + " # https://stackoverflow.com/a/57378660/1214547\n", + " with video_path.open('rb') as fp:\n", + " mp4 = fp.read()\n", + " data_url = 'data:video/mp4;base64,' + b64encode(mp4).decode()\n", + "else:\n", + " data_url = str(video_path)\n", + "\n", + "HTML(\"\"\"\n", + "\n", + "\"\"\".format(data_url))" ] - }, - "metadata": {}, - "output_type": "display_data" } - ], - "source": [ - "s = env.reset()\n", - "for _ in range(100):\n", - " s, _, _, _ = env.step(env.action_space.sample())\n", - "\n", - "plt.title('Game image')\n", - "plt.imshow(env.render('rgb_array'))\n", - "plt.show()\n", - "\n", - "plt.title('Agent observation')\n", - "plt.imshow(s.reshape([42, 42]))\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### POMDP setting\n", - "\n", - "The Atari game we're working with is actually a POMDP: your agent needs to know timing at which enemies spawn and move, but cannot do so unless it has some memory. \n", - "\n", - "Let's design another agent that has a recurrent neural net memory to solve this. 
Here's a sketch.\n", - "\n", - "![img](img1.jpg)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "\n", - "# a special module that converts [batch, channel, w, h] to [batch, units]\n", - "\n", - "\n", - "class Flatten(nn.Module):\n", - " def forward(self, input):\n", - " return input.view(input.size(0), -1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class SimpleRecurrentAgent(nn.Module):\n", - " def __init__(self, obs_shape, n_actions, reuse=False):\n", - " \"\"\"A simple actor-critic agent\"\"\"\n", - " super(self.__class__, self).__init__()\n", - "\n", - " self.conv0 = nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2))\n", - " self.conv1 = nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2))\n", - " self.conv2 = nn.Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2))\n", - " self.flatten = Flatten()\n", - "\n", - " self.hid = nn.Linear(512, 128)\n", - " self.rnn = nn.LSTMCell(128, 128)\n", - "\n", - " self.logits = nn.Linear(128, n_actions)\n", - " self.state_value = nn.Linear(128, 1)\n", - "\n", - " def forward(self, prev_state, obs_t):\n", - " \"\"\"\n", - " Takes agent's previous hidden state and a new observation,\n", - " returns a new hidden state and whatever the agent needs to learn\n", - " \"\"\"\n", - "\n", - " # Apply the whole neural net for one step here.\n", - " # See docs on self.rnn(...).\n", - " # The recurrent cell should take the last feedforward dense layer as input.\n", - " \n", - "\n", - " new_state = \n", - " logits = \n", - " state_value = \n", - "\n", - " return new_state, (logits, state_value)\n", - "\n", - " def get_initial_state(self, batch_size):\n", - " \"\"\"Return a list of agent memory states at game start. 
Each state is a np array of shape [batch_size, ...]\"\"\"\n", - " return torch.zeros((batch_size, 128)), torch.zeros((batch_size, 128))\n", - "\n", - " def sample_actions(self, agent_outputs):\n", - " \"\"\"pick actions given numeric agent outputs (np arrays)\"\"\"\n", - " logits, state_values = agent_outputs\n", - " probs = F.softmax(logits)\n", - " return torch.multinomial(probs, 1)[:, 0].data.numpy()\n", - "\n", - " def step(self, prev_state, obs_t):\n", - " \"\"\" like forward, but obs_t is a numpy array \"\"\"\n", - " obs_t = torch.tensor(np.asarray(obs_t), dtype=torch.float32)\n", - " (h, c), (l, s) = self.forward(prev_state, obs_t)\n", - " return (h.detach(), c.detach()), (l.detach(), s.detach())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "n_parallel_games = 5\n", - "gamma = 0.99\n", - "\n", - "agent = SimpleRecurrentAgent(obs_shape, n_actions)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "state = [env.reset()]\n", - "_, (logits, value) = agent.step(agent.get_initial_state(1), state)\n", - "print(\"action logits:\\n\", logits)\n", - "print(\"state values:\\n\", value)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Let's play!\n", - "Let's build a function that measures agent's average reward." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def evaluate(agent, env, n_games=1):\n", - " \"\"\"Plays an entire game start to end, returns session rewards.\"\"\"\n", - "\n", - " game_rewards = []\n", - " for _ in range(n_games):\n", - " # initial observation and memory\n", - " observation = env.reset()\n", - " prev_memories = agent.get_initial_state(1)\n", - "\n", - " total_reward = 0\n", - " while True:\n", - " new_memories, readouts = agent.step(\n", - " prev_memories, observation[None, ...])\n", - " action = agent.sample_actions(readouts)\n", - "\n", - " observation, reward, done, info = env.step(action[0])\n", - "\n", - " total_reward += reward\n", - " prev_memories = new_memories\n", - " if done:\n", - " break\n", - "\n", - " game_rewards.append(total_reward)\n", - " return game_rewards" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import gym.wrappers\n", - "\n", - "with gym.wrappers.Monitor(make_env(), directory=\"videos\", force=True) as env_monitor:\n", - " rewards = evaluate(agent, env_monitor, n_games=3)\n", - "\n", - "print(rewards)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Show video. This may not work in some setups. If it doesn't\n", - "# work for you, you can download the videos and view them locally.\n", - "\n", - "from pathlib import Path\n", - "from IPython.display import HTML\n", - "\n", - "video_names = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", - "\n", - "HTML(\"\"\"\n", - "\n", - "\"\"\".format(video_names[-1])) # You can also try other indices" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Training on parallel games\n", - "\n", - "We introduce a class called EnvPool - it's a tool that handles multiple environments for you. 
Here's how it works:\n", - "![img](img2.jpg)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from env_pool import EnvPool\n", - "pool = EnvPool(agent, make_env, n_parallel_games)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We gonna train our agent on a thing called __rollouts:__\n", - "![img](img3.jpg)\n", - "\n", - "A rollout is just a sequence of T observations, actions and rewards that agent took consequently.\n", - "* First __s0__ is not necessarily initial state for the environment\n", - "* Final state is not necessarily terminal\n", - "* We sample several parallel rollouts for efficiency" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# for each of n_parallel_games, take 10 steps\n", - "rollout_obs, rollout_actions, rollout_rewards, rollout_mask = pool.interact(10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Actions shape:\", rollout_actions.shape)\n", - "print(\"Rewards shape:\", rollout_rewards.shape)\n", - "print(\"Mask shape:\", rollout_mask.shape)\n", - "print(\"Observations shape: \", rollout_obs.shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Actor-critic objective\n", - "\n", - "Here we define a loss function that uses rollout above to train advantage actor-critic agent.\n", - "\n", - "\n", - "Our loss consists of three components:\n", - "\n", - "* __The policy \"loss\"__\n", - " $$ \\hat J = {1 \\over T} \\cdot \\sum_t { \\log \\pi(a_t | s_t) } \\cdot A_{const}(s,a) $$\n", - " * This function has no meaning in and of itself, but it was built such that\n", - " * $ \\nabla \\hat J = {1 \\over N} \\cdot \\sum_t { \\nabla \\log \\pi(a_t | s_t) } \\cdot A(s,a) \\approx \\nabla E_{s, a \\sim \\pi} R(s,a) $\n", - " * Therefore if we __maximize__ J_hat with gradient descent we 
will maximize expected reward\n", - " \n", - " \n", - "* __The value \"loss\"__\n", - " $$ L_{td} = {1 \\over T} \\cdot \\sum_t { [r + \\gamma \\cdot V_{const}(s_{t+1}) - V(s_t)] ^ 2 }$$\n", - " * Ye Olde TD_loss from q-learning and alike\n", - " * If we minimize this loss, V(s) will converge to $V_\\pi(s) = E_{a \\sim \\pi(a | s)} R(s,a) $\n", - "\n", - "\n", - "* __Entropy Regularizer__\n", - " $$ H = - {1 \\over T} \\sum_t \\sum_a {\\pi(a|s_t) \\cdot \\log \\pi (a|s_t)}$$\n", - " * If we __maximize__ entropy we discourage agent from predicting zero probability to actions\n", - " prematurely (a.k.a. exploration)\n", - " \n", - " \n", - "So we optimize a linear combination of $L_{td}$ $- \\hat J$, $-H$\n", - " \n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "```\n", - "\n", - "\n", - "__One more thing:__ since we train on T-step rollouts, we can use N-step formula for advantage for free:\n", - " * At the last step, $A(s_t,a_t) = r(s_t, a_t) + \\gamma \\cdot V(s_{t+1}) - V(s) $\n", - " * One step earlier, $A(s_t,a_t) = r(s_t, a_t) + \\gamma \\cdot r(s_{t+1}, a_{t+1}) + \\gamma ^ 2 \\cdot V(s_{t+2}) - V(s) $\n", - " * Et cetera, et cetera. This way agent starts training much faster since it's estimate of A(s,a) depends less on his (imperfect) value function and more on actual rewards. There's also a [nice generalization](https://arxiv.org/abs/1506.02438) of this.\n", - "\n", - "\n", - "__Note:__ it's also a good idea to scale rollout_len up to learn longer sequences. You may wish set it to >=20 or to start at 10 and then scale up as time passes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def to_one_hot(y, n_dims=None):\n", - " \"\"\" Take an integer tensor and convert it to 1-hot matrix. 
\"\"\"\n", - " y_tensor = y.to(dtype=torch.int64).view(-1, 1)\n", - " n_dims = n_dims if n_dims is not None else int(torch.max(y_tensor)) + 1\n", - " y_one_hot = torch.zeros(y_tensor.size()[0], n_dims).scatter_(1, y_tensor, 1)\n", - " return y_one_hot" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "opt = torch.optim.Adam(agent.parameters(), lr=1e-5)\n", - "\n", - "\n", - "def train_on_rollout(states, actions, rewards, is_not_done, prev_memory_states, gamma=0.99):\n", - " \"\"\"\n", - " Takes a sequence of states, actions and rewards produced by generate_session.\n", - " Updates agent's weights by following the policy gradient above.\n", - " Please use Adam optimizer with default parameters.\n", - " \"\"\"\n", - "\n", - " # shape: [batch_size, time, c, h, w]\n", - " states = torch.tensor(np.asarray(states), dtype=torch.float32)\n", - " actions = torch.tensor(np.array(actions), dtype=torch.int64) # shape: [batch_size, time]\n", - " rewards = torch.tensor(np.array(rewards), dtype=torch.float32) # shape: [batch_size, time]\n", - " is_not_done = torch.tensor(np.array(is_not_done), dtype=torch.float32) # shape: [batch_size, time]\n", - " rollout_length = rewards.shape[1] - 1\n", - "\n", - " # predict logits, probas and log-probas using an agent.\n", - " memory = [m.detach() for m in prev_memory_states]\n", - "\n", - " logits = [] # append logit sequence here\n", - " state_values = [] # append state values here\n", - " for t in range(rewards.shape[1]):\n", - " obs_t = states[:, t]\n", - "\n", - " # use agent to comute logits_t and state values_t.\n", - " # append them to logits and state_values array\n", - "\n", - " memory, (logits_t, values_t) = \n", - "\n", - " logits.append(logits_t)\n", - " state_values.append(values_t)\n", - "\n", - " logits = torch.stack(logits, dim=1)\n", - " state_values = torch.stack(state_values, dim=1)\n", - " probas = F.softmax(logits, dim=2)\n", - " logprobas = 
F.log_softmax(logits, dim=2)\n", - "\n", - " # select log-probabilities for chosen actions, log pi(a_i|s_i)\n", - " actions_one_hot = to_one_hot(actions, n_actions).view(\n", - " actions.shape[0], actions.shape[1], n_actions)\n", - " logprobas_for_actions = torch.sum(logprobas * actions_one_hot, dim=-1)\n", - "\n", - " # Now let's compute two loss components:\n", - " # 1) Policy gradient objective.\n", - " # Notes: Please don't forget to call .detach() on advantage term. Also please use mean, not sum.\n", - " # it's okay to use loops if you want\n", - " J_hat = 0 # policy objective as in the formula for J_hat\n", - "\n", - " # 2) Temporal difference MSE for state values\n", - " # Notes: Please don't forget to call on V(s') term. Also please use mean, not sum.\n", - " # it's okay to use loops if you want\n", - " value_loss = 0\n", - "\n", - " cumulative_returns = state_values[:, -1].detach()\n", - "\n", - " for t in reversed(range(rollout_length)):\n", - " r_t = rewards[:, t] # current rewards\n", - " # current state values\n", - " V_t = state_values[:, t]\n", - " V_next = state_values[:, t + 1].detach() # next state values\n", - " # log-probability of a_t in s_t\n", - " logpi_a_s_t = logprobas_for_actions[:, t]\n", - "\n", - " # update G_t = r_t + gamma * G_{t+1} as we did in week6 reinforce\n", - " cumulative_returns = G_t = r_t + gamma * cumulative_returns\n", - "\n", - " # Compute temporal difference error (MSE for V(s))\n", - " value_loss += \n", - "\n", - " # compute advantage A(s_t, a_t) using cumulative returns and V(s_t) as baseline\n", - " advantage = \n", - " advantage = advantage.detach()\n", - "\n", - " # compute policy pseudo-loss aka -J_hat.\n", - " J_hat += \n", - "\n", - " # regularize with entropy\n", - " entropy_reg = \n", - "\n", - " # add-up three loss components and average over time\n", - " loss = -J_hat / rollout_length +\\\n", - " value_loss / rollout_length +\\\n", - " -0.01 * entropy_reg\n", - "\n", - " # Gradient descent step\n", - " \n", 
- "\n", - " return loss.data.numpy()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# let's test it\n", - "memory = list(pool.prev_memory_states)\n", - "rollout_obs, rollout_actions, rollout_rewards, rollout_mask = pool.interact(10)\n", - "\n", - "train_on_rollout(rollout_obs, rollout_actions,\n", - " rollout_rewards, rollout_mask, memory)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Train \n", - "\n", - "just run train step and see if agent learns any better" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from IPython.display import clear_output\n", - "from tqdm import trange\n", - "from pandas import DataFrame\n", - "moving_average = lambda x, **kw: DataFrame(\n", - " {'x': np.asarray(x)}).x.ewm(**kw).mean().values\n", - "\n", - "rewards_history = []" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for i in trange(15000):\n", - "\n", - " memory = list(pool.prev_memory_states)\n", - " rollout_obs, rollout_actions, rollout_rewards, rollout_mask = pool.interact(\n", - " 10)\n", - " train_on_rollout(rollout_obs, rollout_actions,\n", - " rollout_rewards, rollout_mask, memory)\n", - "\n", - " if i % 100 == 0:\n", - " rewards_history.append(np.mean(evaluate(agent, env, n_games=1)))\n", - " clear_output(True)\n", - " plt.plot(rewards_history, label='rewards')\n", - " plt.plot(moving_average(np.array(rewards_history),\n", - " span=10), label='rewards ewma@10')\n", - " plt.legend()\n", - " plt.show()\n", - " if rewards_history[-1] >= 10000:\n", - " print(\"Your agent has just passed the minimum homework threshold\")\n", - " break" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Relax and grab some refreshments while your agent is locked in an infinite loop of violence and death.\n", - "\n", - "__How to interpret 
plots:__\n", - "\n", - "The session reward is the easy thing: it should in general go up over time, but it's okay if it fluctuates ~~like crazy~~. It's also OK if it reward doesn't increase substantially before some 10k initial steps. However, if reward reaches zero and doesn't seem to get up over 2-3 evaluations, there's something wrong happening.\n", - "\n", - "\n", - "Since we use a policy-based method, we also keep track of __policy entropy__ - the same one you used as a regularizer. The only important thing about it is that your entropy shouldn't drop too low (`< 0.1`) before your agent gets the yellow belt. Or at least it can drop there, but _it shouldn't stay there for long_.\n", - "\n", - "If it does, the culprit is likely:\n", - "* Some bug in entropy computation. Remember that it is $ - \\sum p(a_i) \\cdot log p(a_i) $\n", - "* Your agent architecture converges too fast. Increase entropy coefficient in actor loss. \n", - "* Gradient explosion - just [clip gradients](https://stackoverflow.com/a/56069467) and maybe use a smaller network\n", - "* Us. Or PyTorch developers. Or aliens. Or lizardfolk. Contact us on forums before it's too late!\n", - "\n", - "If you're debugging, just run `logits, values = agent.step(batch_states)` and manually look into logits and values. This will reveal the problem 9 times out of 10: you'll likely see some NaNs or insanely large numbers or zeros. Try to catch the moment when this happens for the first time and investigate from there." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### \"Final\" evaluation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import gym.wrappers\n", - "\n", - "with gym.wrappers.Monitor(make_env(), directory=\"videos\", force=True) as env_monitor:\n", - " final_rewards = evaluate(agent, env_monitor, n_games=20)\n", - "\n", - "print(\"Final mean reward\", np.mean(final_rewards))" - ] + ], + "metadata": { + "language_info": { + "name": "python", + "pygments_lexer": "ipython3" + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Show video. This may not work in some setups. If it doesn't\n", - "# work for you, you can download the videos and view them locally.\n", - "\n", - "from pathlib import Path\n", - "from IPython.display import HTML\n", - "\n", - "video_names = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", - "\n", - "HTML(\"\"\"\n", - "\n", - "\"\"\".format(video_names[-1])) # You can also try other indices" - ] - } - ], - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 1 + "nbformat": 4, + "nbformat_minor": 1 } diff --git a/week08_pomdp/practice_tensorflow.ipynb b/week08_pomdp/practice_tensorflow.ipynb index f776acfa6..f4465f21a 100644 --- a/week08_pomdp/practice_tensorflow.ipynb +++ b/week08_pomdp/practice_tensorflow.ipynb @@ -22,9 +22,9 @@ " %tensorflow_version 1.x\n", " \n", " if not os.path.exists('.setup_complete'):\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", "\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/week08_pomdp/atari_util.py\n", + " !wget -q 
https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week08_pomdp/atari_util.py\n", "\n", " !touch .setup_complete\n", "\n", @@ -284,15 +284,25 @@ "# work for you, you can download the videos and view them locally.\n", "\n", "from pathlib import Path\n", + "from base64 import b64encode\n", "from IPython.display import HTML\n", "\n", - "video_names = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", + "video_paths = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", + "video_path = video_paths[-1] # You can also try other indices\n", + "\n", + "if 'google.colab' in sys.modules:\n", + " # https://stackoverflow.com/a/57378660/1214547\n", + " with video_path.open('rb') as fp:\n", + " mp4 = fp.read()\n", + " data_url = 'data:video/mp4;base64,' + b64encode(mp4).decode()\n", + "else:\n", + " data_url = str(video_path)\n", "\n", "HTML(\"\"\"\n", "\n", - "\"\"\".format(video_names[-1])) # You can also try other indices" + "\"\"\".format(data_url))" ] }, { @@ -547,15 +557,25 @@ "# work for you, you can download the videos and view them locally.\n", "\n", "from pathlib import Path\n", + "from base64 import b64encode\n", "from IPython.display import HTML\n", "\n", - "video_names = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", + "video_paths = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", + "video_path = video_paths[-1] # You can also try other indices\n", + "\n", + "if 'google.colab' in sys.modules:\n", + " # https://stackoverflow.com/a/57378660/1214547\n", + " with video_path.open('rb') as fp:\n", + " mp4 = fp.read()\n", + " data_url = 'data:video/mp4;base64,' + b64encode(mp4).decode()\n", + "else:\n", + " data_url = str(video_path)\n", "\n", "HTML(\"\"\"\n", "\n", - "\"\"\".format(video_names[-1])) # You can also try other indices" + "\"\"\".format(data_url))" ] }, { diff --git a/week08_pomdp/practice_theano.ipynb b/week08_pomdp/practice_theano.ipynb deleted file 
mode 100644 index 63d19af0a..000000000 --- a/week08_pomdp/practice_theano.ipynb +++ /dev/null @@ -1,509 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from __future__ import print_function, division" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# If you are running on a server, launch xvfb to record game videos\n", - "# Please make sure you have xvfb installed\n", - "import os\n", - "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", - " !bash ../xvfb start\n", - " os.environ['DISPLAY'] = ':1'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you are new to this course and want more instructions on how to set up environement and all the libs (docker / windows / gpu / blas / etc.), you could read [vital instructions here](https://github.com/yandexdataschool/Practical_RL/issues/1#issue-202648393). \n", - "\n", - "Please make sure that your have bleeding edge versions of Theano, Lasagne and Agentnet." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# General purpose libs import" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "%matplotlib inline\n", - "from timeit import default_timer as timer\n", - "\n", - "from IPython.core import display" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# if you have GPU uncomment the line below\n", - "%env THEANO_FLAGS = device = gpu0, floatX = float32" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Universal collection of a gentleman:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import gym\n", - "\n", - "from agentnet.agent import Agent\n", - "from agentnet.experiments.openai_gym.wrappers import PreprocessImage\n", - "from agentnet.memory import WindowAugmentation, LSTMCell, GRUCell\n", - "from agentnet.target_network import TargetNetwork\n", - "from agentnet.resolver import EpsilonGreedyResolver, ProbabilisticResolver\n", - "from agentnet.experiments.openai_gym.pool import EnvPool\n", - "from agentnet.learning import qlearning\n", - "\n", - "import theano\n", - "import theano.tensor as T\n", - "\n", - "import lasagne\n", - "from lasagne.layers import DenseLayer, Conv2DLayer, InputLayer, NonlinearityLayer\n", - "from lasagne.layers import batch_norm, get_all_params, get_output, reshape, concat, dropout\n", - "from lasagne.nonlinearities import rectify, leaky_rectify, elu, tanh, softmax" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Helper function definitions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Downsample image, and crop it, showing only the most useful part of image. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def make_env():\n", - " env = gym.make(\"KungFuMaster-v0\")\n", - " env = PreprocessImage(env, height=64, width=64,\n", - " grayscale=True, crop=lambda img: img[60:-30, 7:])\n", - " return env" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Function for tracking performance while training " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def eval_and_plot(rewards, epoch_counter, pool, target_score, th_times, loop_times):\n", - " rewards[epoch_counter] = np.mean(pool.evaluate(\n", - " n_games=N_EVAL_GAMES, record_video=False, verbose=False))\n", - " info_string = \"Time (DL/All) {:.1f}/{:.1f} epoch={}, mean_score={:.2f}\"\n", - " info_string = info_string.format(np.mean(th_times), np.mean(loop_times),\n", - " epoch_counter, np.mean(rewards[epoch_counter]))\n", - " plt.figure(figsize=(8, 5))\n", - " plt.plot([rewards[i] for i in sorted(rewards.keys())])\n", - " plt.grid()\n", - " plt.ylabel(\"Mean reward over evaluation games\")\n", - " plt.title(info_string)\n", - " plt.show()\n", - " display.clear_output(wait=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Experiment setup\n", - "Here we basically just load the game and check that it works" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "env = gym.make('KungFuMaster-v0')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(env.env.get_action_meanings())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.imshow(env.reset())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "env = make_env()\n", - 
"plt.imshow(np.squeeze(env.reset()), interpolation='none', cmap='gray')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Global constants definition" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "All hyperparameters (except number of layers and neurons) are declared here as upper case letters along with global varaibles." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "N_ACTIONS = env.action_space.n\n", - "OBS_SHAPE = env.observation_space.shape\n", - "OBS_CHANNELS, OBS_HEIGHT, OBS_WIDTH = OBS_SHAPE\n", - "\n", - "# These 4 constanst were shown to lead to nearly state of the art on kung-fu master game\n", - "N_SIMULTANEOUS_GAMES = 10 # this is also known as number of agents in exp_replay_pool\n", - "SEQ_LENGTH = 25\n", - "\n", - "EVAL_EVERY_N_ITER = 100\n", - "N_EVAL_GAMES = 2\n", - "\n", - "N_FRAMES_IN_BUFFER = 4 # number of consequent frames to feed in CNN" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# A2C with memory" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "observation_layer = InputLayer((None,) + OBS_SHAPE)\n", - "prev_wnd = InputLayer(\n", - " [None, N_FRAMES_IN_BUFFER, OBS_CHANNELS, OBS_HEIGHT, OBS_WIDTH])\n", - "new_wnd = WindowAugmentation(observation_layer, prev_wnd)\n", - "wnd_reshape = reshape(\n", - " new_wnd, [-1, N_FRAMES_IN_BUFFER * OBS_CHANNELS, OBS_HEIGHT, OBS_WIDTH])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# TYPE YOUR CODE HERE\n", - "# provide the main body of the network : first three convolutional layers and dense one on top\n", - "# you may want to change nonlinearity - feel free to do this\n", - "# note that we have changed filter size here because of reduced image width and height compared to those in papers\n", - "conv1 = 
Conv2DLayer(wnd_reshape, ...)\n", - "...\n", - "dense = Dense(...)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# define 256 neuron LSTM cell:\n", - "# - define two input layers each of n_lstm_cells (maybe 256 is a good baseline) neurons\n", - "# - feed into `LSTMcell` this two layers and\n", - "# input layer (last `Dense` in case of A2C+LSTM) as additional third parameter" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "neck_layer = concat([ , ]) # network neck " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# define actors head as\n", - "# - logits_layer – dense(neck) with nonlinearity=None\n", - "# - policy layer – softmax over logits_layer\n", - "........\n", - "action_layer = ProbabilisticResolver(policy_layer)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# critic head\n", - "V_layer = DenseLayer(neck_layer, 1, nonlinearity=None)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# `observation_layers` is input layer to NN, as usual\n", - "# `policy_estimators` should include 1) logits_layer and 2) V_layer\n", - "# `agent_states` is a dictionary of {new_value: old_value}. 
You should bother to update\n", - "# a) prev window (input buffer, prev_wnd) b) previous LSTM cell state c) output of LSTM cell\n", - "# `action_layers` is action_layer, as usual : )\n", - "agent = Agent(....)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# may need to adjust (increasing N_SIMULTANEOUS_GAMES is usually a good idea)\n", - "pool = EnvPool(agent, make_env, n_games=N_SIMULTANEOUS_GAMES)\n", - "replay = pool.experience_replay" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "_, _, _, action_seq, (logits_seq, V_seq) = agent.get_sessions(\n", - " replay,\n", - " session_length=SEQ_LENGTH,\n", - " experience_replay=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# compute pi(a|s) and log(pi(a|s)) manually [use logsoftmax]\n", - "# we can't guarantee that theano optimizes logsoftmax automatically since it's still in dev\n", - "# for more info see (https://github.com/Theano/Theano/issues/2944 of 2015 year)\n", - "\n", - "# logits_seq.shape is (batch_size, SEQ_LENGTH, N_ACTIONS)\n", - "logits_flat = logits_seq.reshape([-1, N_ACTIONS])\n", - "policy_seq = T.nnet.softmax(logits_flat).reshape(logits_seq.shape)\n", - "logpolicy_seq = T.nnet.logsoftmax(logits_flat).reshape(logits_seq.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get policy gradient\n", - "from agentnet.learning import a2c\n", - "elwise_actor_loss, elwise_critic_loss = a2c.get_elementwise_objective(\n", - " policy=logpolicy_seq,\n", - " treat_policy_as_logpolicy=True,\n", - " state_values=V_seq[:, :, 0],\n", - " actions=replay.actions[0],\n", - " rewards=replay.rewards/10,\n", - " is_alive=replay.is_alive,\n", - " gamma_or_gammas=0.99,\n", - " n_steps=None,\n", - " return_separate=True\n", - ")\n", - "\n", 
- "# add losses with magic numbers\n", - "# (you can change them more or less harmlessly, this usually just makes learning faster/slower)\n", - "# actor and critic multipliers were selected guided by prior knowledge\n", - "# entropy / regularization multipliers were tuned with logscale gridsearch\n", - "# NB: regularization affects exploration\n", - "reg_logits = T.mean(logits_seq ** 2)\n", - "reg_entropy = T.mean(T.sum(policy_seq * logpolicy_seq, axis=-1))\n", - "loss = 0.1 * elwise_actor_loss.mean() + 0.25 * elwise_critic_loss.mean() + \\\n", - " 1e-3 * reg_entropy + 1e-3 * reg_logits" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Compute weight updates, clip by norm for stability\n", - "weights = lasagne.layers.get_all_params(\n", - " [V_layer, policy_layer], trainable=True)\n", - "grads = T.grad(loss, weights)\n", - "grads = lasagne.updates.total_norm_constraint(grads, 10)\n", - "updates = lasagne.updates.adam(grads, weights)\n", - "train_step = theano.function([], loss, updates=updates)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Train " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "epoch_counter = 1 # starting epoch\n", - "rewards = {} # full game rewards\n", - "target_score = 10000\n", - "loss, eval_rewards = 0, []" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "untrained_reward = np.mean(pool.evaluate(\n", - " n_games=5, record_video=False, verbose=False))\n", - "untrained_reward" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# IF you feel disgust about stderr messages due to pool.evaluate() execution\n", - "# which pollutes output of jupyter cell, you could do one of the following:\n", - "# 1. use warnings.filterwarnings(\"ignore\")\n", - "# 2. 
use cell magic %%capture\n", - "# 3. simply redirect stderr to /dev/null with command\n", - "# import os, sys\n", - "# stder_old = sys.stderr\n", - "# sys.stderr = open(os.devnull, 'w')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "th_times, loop_times = [], []\n", - "for i in range(2000):\n", - " loop_starts = timer()\n", - " pool.update(SEQ_LENGTH)\n", - "\n", - " train_starts = timer()\n", - "\n", - " \n", - " raise NotImplementedError\n", - "\n", - " th_times.append(timer() - train_starts)\n", - " epoch_counter += 1\n", - " loop_times.append(timer() - loop_starts)\n", - "\n", - " # You may want to set EVAL_EVERY_N_ITER=1 for the time being\n", - " if epoch_counter % EVAL_EVERY_N_ITER == 0:\n", - " eval_and_plot(rewards, epoch_counter, pool,\n", - " target_score, th_times, loop_times)\n", - " if rewards[epoch_counter] >= target_score:\n", - " print(\"VICTORY!\")\n", - " break\n", - " th_times, loop_times = [], []" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "eval_and_plot(rewards, epoch_counter, pool, target_score, th_times, loop_times)" - ] - } - ], - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/week08_pomdp/theano_optional_recurrence_tutorial.ipynb b/week08_pomdp/theano_optional_recurrence_tutorial.ipynb deleted file mode 100644 index deeeda072..000000000 --- a/week08_pomdp/theano_optional_recurrence_tutorial.ipynb +++ /dev/null @@ -1,434 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Recurrent memory intro\n", - "\n", - "In the seminar you'll deploy recurrent neural network inside SARSA agent.\n", - "\n", - "The environment it plays is a simple POMDP of rock-paper-scissors game with exploitable opponent.\n", - "\n", - "#### Instructions\n", - "\n", - "First, read through 
the code and __run it as you read__. The code will create a feedforward neural network and train it with SARSA.\n", - "\n", - "Since the game is partially observable, default algorithm will won't reach optimal score. In fact, it's unstable and may even end up worse than random.\n", - "\n", - "After you ran the code, __find the two ```#YOUR CODE HERE``` chunks__ (mb ctrl+f) and implement a recurrent memory.\n", - "\n", - "Re-run the experiment and compare the performance of feedworward vs recurrent agent. \n", - "RNN should be _much_ better, session __reward > 50__.\n", - "\n", - "After you're done with that, proceed to the next part, for it is going to be much more interesting." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "%matplotlib inline\n", - "\n", - "# number of parallel agents and batch sequence length (frames)\n", - "N_AGENTS = 10\n", - "SEQ_LENGTH = 25" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The environment we're going to use now is not a default gym env.\n", - "\n", - "It was instead written from scratch in `rockpaperscissors.py`.\n", - "\n", - "Morale: you can make your own gym environments easily with anything you want (including OS and the web, e.g. 
selenium)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import gym\n", - "from rockpaperscissors import RockPaperScissors\n", - "\n", - "\n", - "def make_env():\n", - " env = RockPaperScissors()\n", - " return gym.wrappers.TimeLimit(env, max_episode_steps=100)\n", - "\n", - "\n", - "# spawn game instance\n", - "env = make_env()\n", - "observation_shape = env.observation_space.shape\n", - "n_actions = env.action_space.n\n", - "\n", - "env.reset()\n", - "obs = env.step(env.action_space.sample())[0]\n", - "\n", - "print obs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Basic agent setup\n", - "Here we define a simple agent that maps game images into policy with a minimalistic neural net\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# setup theano/lasagne. Prefer CPU\n", - "%env THEANO_FLAGS = device = cpu, floatX = float32\n", - "\n", - "import theano\n", - "import lasagne\n", - "import theano.tensor as T\n", - "from lasagne.layers import *" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# observation\n", - "obs = InputLayer((None,)+observation_shape,)\n", - "\n", - "nn = DenseLayer(obs, 32, nonlinearity=T.nnet.elu)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from agentnet.memory import RNNCell, GRUCell, LSTMCell\n", - ":\n", - "# Implement a recurrent agent memory by un-comemnting code below and defining h_new\n", - "\n", - "#h_prev = InputLayer((None,50),name=\"previous memory state with 50 units\")\n", - "\n", - "# h_new = RNNCell(,,nonlinearity=T.nnet.elu)\n", - "\n", - "# (IMPORTANT!) 
use new cell to compute q-values instead of dense layer\n", - "#nn = h_new" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from agentnet.resolver import EpsilonGreedyResolver\n", - "l_qvalues = DenseLayer(nn, n_actions)\n", - "l_actions = EpsilonGreedyResolver(l_qvalues)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Agent, as usual" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from agentnet.agent import Agent\n", - "\n", - "# uncomment agent_states and define what layers should be used\n", - "\n", - "agent = Agent(observation_layers=obs,\n", - " policy_estimators=(l_qvalues),\n", - " # agent_states={:},\n", - " action_layers=l_actions)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Pool, as usual" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from agentnet.experiments.openai_gym.pool import EnvPool\n", - "\n", - "pool = EnvPool(agent, make_env, n_games=16) # may need to adjust\n", - "\n", - "pool.update(SEQ_LENGTH)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Learning\n", - "\n", - "For N+1'st time, we use vanilla SARSA" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "replay = pool.experience_replay\n", - "\n", - "qvalues_seq = agent.get_sessions(\n", - " replay,\n", - " session_length=SEQ_LENGTH,\n", - " experience_replay=True,\n", - " unroll_scan=False, # this new guy makes compilation 100x faster for a bit slower runtime\n", - ")[-1]\n", - "\n", - "auto_updates = agent.get_automatic_updates() # required if unroll_scan=False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get SARSA mse loss\n", - "from agentnet.learning 
import sarsa\n", - "elemwise_mse = sarsa.get_elementwise_objective(qvalues_seq,\n", - " actions=replay.actions[0],\n", - " rewards=replay.rewards,\n", - " is_alive=replay.is_alive)\n", - "loss = elemwise_mse.mean()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Compute weights and updates\n", - "weights = lasagne.layers.get_all_params([l_actions], trainable=True)\n", - "\n", - "updates = lasagne.updates.adam(loss, weights)\n", - "\n", - "# compile train function\n", - "train_step = theano.function([], loss, updates=auto_updates+updates)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Demo run" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "untrained_reward = np.mean(pool.evaluate(save_path=\"./records\", n_games=10,\n", - " record_video=False, use_monitor=False))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Training loop" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# starting epoch\n", - "epoch_counter = 1\n", - "\n", - "# full game rewards\n", - "rewards = {0: untrained_reward}\n", - "loss, reward = 0, untrained_reward" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from tqdm import trange\n", - "from IPython.display import clear_output\n", - "\n", - "for i in trange(10000):\n", - "\n", - " # play\n", - " pool.update(SEQ_LENGTH)\n", - " # train\n", - " loss = train_step()\n", - "\n", - " # update epsilon\n", - " new_epsilon = max(0.01, 1-2e-4*epoch_counter)\n", - " l_actions.epsilon.set_value(np.float32(new_epsilon))\n", - "\n", - " # record current learning progress and show learning curves\n", - " if epoch_counter % 100 == 0:\n", - " clear_output(True)\n", - " print(\"iter=%i,loss=%.3f,epsilon=%.3f\" %\n", - " (epoch_counter, 
loss, new_epsilon))\n", - " reward = 0.9*reward + 0.1*np.mean(np.mean(pool.evaluate(save_path=\"./records\", n_games=10,\n", - " record_video=False, use_monitor=False)))\n", - " rewards[epoch_counter] = reward\n", - "\n", - " plt.plot(*zip(*sorted(rewards.items(), key=lambda (t, r): t)))\n", - " plt.grid()\n", - " plt.show()\n", - "\n", - " epoch_counter += 1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Evaluating results\n", - " * Here we plot learning curves and sample testimonials" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.plot(*zip(*sorted(rewards.items(), key=lambda k: k[0])))\n", - "plt.grid()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Bonus (1++ points)\n", - "\n", - "Compare two types of nonlinearities for the RNN:\n", - "- `T.nnet.elu`\n", - "- `T.nnet.sigmoid`\n", - "\n", - "Re-train agent at least 10 times. It's probably a good idea to automate the process.\n", - "\n", - "Notice something weird? Any clue why this happens and how to fix it?\n", - "\n", - "_Running the experiment and reporting results gets your 1 point. Reward will get much higher as you go down the rabbit hole! Don't forget to send this notebook to Anytask and mention that you went for this bonus._" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# results, ideas, solutions..." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```\n", - "\n", - "\n", - "```\n", - "```\n", - "\n", - "\n", - "```\n", - "```\n", - "\n", - "\n", - "```\n", - "```\n", - "\n", - "\n", - "```\n", - "```\n", - "\n", - "\n", - "```\n", - "```\n", - "\n", - "\n", - "```\n", - "```\n", - "\n", - "\n", - "```\n", - "```\n", - "\n", - "\n", - "```\n", - "```\n", - "\n", - "\n", - "```\n", - "```\n", - "\n", - "\n", - "```\n", - "```\n", - "\n", - "\n", - "```\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/week09_policy_II/README.md b/week09_policy_II/README.md index b10fd7592..727562223 100644 --- a/week09_policy_II/README.md +++ b/week09_policy_II/README.md @@ -2,17 +2,17 @@ * [__slides #2 (dpg)__](https://yadi.sk/i/uV6IA-C23UTn7c) ## Materials -This section covers some steroids for policy gradient methods, along with a cool general trick called +This section covers some steroids for policy gradient methods, along with a cool general trick called * Lecture on NPG and TRPO by J. Schulman - [video](https://www.youtube.com/watch?v=_t5fpZuuf-4) * Alternative lecture on TRPO and open problems by... J. 
Schulman - [video](https://www.youtube.com/watch?v=gb5Q2XL5c8A) -* Our videos: [lecture](https://yadi.sk/i/c7GR1kAAJc00Og), [seminar(pytorch)](https://yadi.sk/i/OGZJJjkQH_7h5g) [seminar(theano)](https://yadi.sk/i/b0ol2gUV3HiKKJ) (russian) +* Our videos: [lecture](https://yadi.sk/i/c7GR1kAAJc00Og), [seminar(PyTorch)](https://yadi.sk/i/OGZJJjkQH_7h5g) (russian) * Original articles - [TRPO](https://arxiv.org/abs/1502.05477), [NPG](https://papers.nips.cc/paper/2073-a-natural-policy-gradient.pdf) ## Practice -* Seminar: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/master/week09_policy_II/seminar_TRPO_pytorch.ipynb) +* TRPO: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/master/week09_policy_II/seminar_TRPO_pytorch.ipynb) -* Homework: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/master/week09_policy_II/ppo.ipynb) +* PPO: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/master/week09_policy_II/ppo.ipynb) ## More: Reinforcement learning in large/continuous action spaces While you already know algorithms that will work with continuously many actions, it can't hurt to learn something more specialized. @@ -21,5 +21,4 @@ While you already know algorithms that will work with continuously many actions, * Deterministic policy gradient - [article](https://arxiv.org/pdf/1512.07679.pdf), [post+code](https://yanpanlau.github.io/2016/10/11/Torcs-Keras.html) * Stochastic value gradient - [article](https://arxiv.org/abs/1510.09142) * Embedding large discrete action spaces for RL - [article](https://arxiv.org/pdf/1512.07679.pdf) - * Lecture by A. 
Seleznev, 5vision (russian) - [video](www.youtube.com/watch?v=j1L2FnanXPo&t=119m45s) - + * Lecture by A. Seleznev, 5vision (russian) - [video](https://www.youtube.com/watch?v=j1L2FnanXPo&t=119m45s) diff --git a/week09_policy_II/mujoco_wrappers.py b/week09_policy_II/mujoco_wrappers.py index 72bc1bd9c..9ca1b9dae 100644 --- a/week09_policy_II/mujoco_wrappers.py +++ b/week09_policy_II/mujoco_wrappers.py @@ -1,6 +1,6 @@ """ MuJoCo env wrappers. """ # Adapted from https://github.com/openai/baselines -import gym +import gymnasium as gym import numpy as np @@ -83,17 +83,17 @@ def observation(self, obs): return obs def step(self, action): - obs, rews, resets, info = self.env.step(action) + obs, rews, terminated, truncated, info = self.env.step(action) self.ret = self.ret * self.gamma + rews obs = self.observation(obs) if self.ret_rmv: self.ret_rmv.update(self.ret) rews = np.clip(rews / np.sqrt(self.ret_rmv.var + self.eps), -self.cliprew, self.cliprew) - self.ret[resets] = 0. - return obs, rews, resets, info + self.ret[terminated] = 0. 
+ return obs, rews, terminated, truncated, info def reset(self, **kwargs): self.ret = np.zeros(getattr(self.env.unwrapped, "nenvs", 1)) - obs = self.env.reset(**kwargs) - return self.observation(obs) + obs, info = self.env.reset(**kwargs) + return self.observation(obs), info diff --git a/week09_policy_II/ppo.ipynb b/week09_policy_II/ppo.ipynb index 2cd6d67d0..9640490c8 100644 --- a/week09_policy_II/ppo.ipynb +++ b/week09_policy_II/ppo.ipynb @@ -3,15 +3,22 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "j9dfWABRNCVm", + "outputId": "ee8f0e74-ab77-4b9e-ff10-05d4e7a0930b" + }, "outputs": [], "source": [ "import sys, os\n", "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n", - " \n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/week09_policy_II/runners.py\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/week09_policy_II/mujoco_wrappers.py\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", + "\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week09_policy_II/mujoco_wrappers.py\n", + "\n", + " !pip -q install gymnasium[mujoco]\n", " \n", " !touch .setup_complete\n", "\n", @@ -24,7 +31,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "WI0kqjVINCVt" + }, "source": [ "# Implementing Proximal Policy Optimization \n", "\n", @@ -36,47 +45,52 @@ "You will be solving a continuous control environment on which it may be easier and faster \n", "to train an agent, however note that PPO here may not be the best algorithm as, for example,\n", "Deep Deterministic Policy Gradient and Soft Actor Critic may be more suited \n", - 
"for continuous control environments. To run the environment you will need to install \n", - "[pybullet-gym](https://github.com/benelot/pybullet-gym) which unlike MuJoCo \n", - "does not require you to have a license.\n", - "\n", - "To install the library:" + "for continuous control environments." ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "MZeKC7aO4RIC" + }, "outputs": [], "source": [ - "!git clone https://github.com/benelot/pybullet-gym lib/pybullet-gym\n", - "!pip install -e lib/pybullet-gym" + "from tqdm import tqdm\n", + "from sklearn.metrics import r2_score" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "Iod_fZUFNCVw" + }, "source": [ "The overall structure of the code is similar to the one in the A2C optional homework, but don't worry if you haven't done it, it should be relatively easy to figure it out. \n", "First, we will create an instance of the environment. \n", "We will normalize the observations and rewards, but before that you will need a wrapper that will \n", - "write summaries, mainly, the total reward during an episode. You can either use one for `TensorFlow` \n", + "write summaries, mainly, the total reward during an episode. You can either use one for `TensorBoard` \n", "implemented in `atari_wrappers.py` file from the optional A2C homework, or implement your own. 
" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Y2N76WIRNCVx", + "outputId": "b23c69ce-eedb-4043-9f8c-1d89eec416a8" + }, "outputs": [], "source": [ - "import gym \n", - "import pybulletgym\n", + "import gymnasium as gym\n", "\n", - "env = gym.make(\"HalfCheetahMuJoCoEnv-v0\")\n", + "env = gym.make(\"HalfCheetah-v4\", render_mode=\"rgb_array\")\n", "print(\"observation space: \", env.observation_space,\n", - " \"\\nobservations:\", env.reset())\n", - "print(\"action space: \", env.action_space, \n", + " \"\\nobservations:\", env.reset()[0])\n", + "print(\"action space: \", env.action_space,\n", " \"\\naction_sample: \", env.action_space.sample())" ] }, @@ -86,20 +100,60 @@ "metadata": {}, "outputs": [], "source": [ + "import matplotlib.pyplot as plt\n", + "plt.imshow(env.render())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_au8nSY479qw" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", "class Summaries(gym.Wrapper):\n", - " \"\"\" Wrapper to write summaries. \"\"\"\n", - " def step(self, action):\n", - " # TODO: implement writing summaries\n", - " return self.env.step(action)\n", - " \n", - " def reset(self, **kwargs):\n", - " # TODO: implement writing summaries\n", - " return self.env.reset(**kwargs)" + " \"\"\" Wrapper to write summaries. 
\"\"\"\n", + " def __init__(self, env):\n", + " super().__init__(env)\n", + " self.episode_counter = 0\n", + " self.current_step_var = 0\n", + "\n", + " self.episode_rewards = []\n", + " self.episode_lens = []\n", + "\n", + " self.current_reward = 0\n", + " self.current_len = 0\n", + "\n", + " def step(self, action):\n", + " obs, rew, terminated, truncated, info = self.env.step(action)\n", + "\n", + " self.current_reward += rew\n", + " self.current_len += 1\n", + " self.current_step_var += 1\n", + "\n", + " if terminated or truncated:\n", + " self.episode_rewards.append((self.current_step_var, self.current_reward))\n", + " self.episode_lens.append((self.current_step_var, self.current_len))\n", + "\n", + " return obs, rew, terminated, truncated, info\n", + "\n", + " def reset(self, **kwargs):\n", + " self.episode_counter += 1\n", + "\n", + " self.current_reward = 0\n", + " self.current_len = 0\n", + "\n", + " return self.env.reset(**kwargs)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "JVom_S9yNCVz" + }, "source": [ "The normalization wrapper will subtract running mean from observations and rewards and divide \n", "the resulting quantities by the running variances." @@ -108,29 +162,34 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lxevKC-FNCVz", + "outputId": "11bcf7dd-b04c-47c8-c8e0-ca8724aee842" + }, "outputs": [], "source": [ "from mujoco_wrappers import Normalize\n", "\n", - "env = Normalize(Summaries(gym.make(\"HalfCheetahMuJoCoEnv-v0\")));\n", - "env.unwrapped.seed(0);" + "env = Normalize(Summaries(gym.make(\"HalfCheetah-v4\", render_mode=\"rgb_array\")));\n", + "env.reset(seed=0)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "_9lf2Z8bNCV0" + }, "source": [ "Next, you will need to define a model for training. 
We suggest that you use two separate networks: one for policy\n", "and another for value function. Each network should be a 3-layer MLP with 64 hidden units, $\\mathrm{tanh}$ \n", - "activation function, kernel matrices initialized with orthogonal initializer with parameter $\\sqrt{2}$\n", - "and biases initialized with zeros. \n", + "activation function.\n", "\n", "Our policy distribution is going to be multivariate normal with diagonal covariance. \n", "The network from above will predict the mean, and the covariance should be represented by a single \n", - "(learned) vector of size 6 (corresponding to the dimensionality of the action space from above). \n", - "You should initialize this vector to zero and take the exponent of it to always\n", - "have a non-negative quantity. \n", + "(learned) vector of size 6 (corresponding to the dimensionality of the action space from above). Alternatively, you can predict the variance using your model. You should take the exponent of these values to always have a non-negative quantity. \n", "\n", "Overall the model should return three things: predicted mean of the distribution, variance vector, \n", "value function. " @@ -139,18 +198,47 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "OwwLcUApNCV0" + }, "outputs": [], "source": [ "# import tensorflow as tf\n", - "# import torch\n", + "import torch\n", + "\n", + "from torch import nn\n", + "from torch.nn import functional as F\n", + "import torch\n", + "\n", + "class PolicyModel(nn. 
Module):\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.h = 64\n", "\n", - "" + " self.policy_model = < Create your model >\n", + "\n", + " self.value_model = < Create your model >\n", + "\n", + " def get_policy(self, x):\n", + " < insert your code here >\n", + " return means, var\n", + "\n", + " def get_value(self, x):\n", + " out = self.value_model(x.float())\n", + " return out\n", + "\n", + " def forward(self, x):\n", + " policy = self.get_policy(x)\n", + " value = self.get_value(x)\n", + "\n", + " return policy, value" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "YI6d9Xq_NCV1" + }, "source": [ "This model will be wrapped by a `Policy`. The policy can work in two modes, but in either case \n", "it is going to return dictionary with string-type keys. The first mode is when the policy is \n", @@ -178,21 +266,47 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "C54tGZahXbSa" + }, "outputs": [], "source": [ + "from torch.distributions.normal import Normal\n", + "from torch.distributions.multivariate_normal import MultivariateNormal\n", + "\n", "class Policy:\n", - " def __init__(self, model):\n", - " self.model = model\n", + " def __init__(self, model):\n", + " self.model = model\n", " \n", - " def act(self, inputs, training=False):\n", - " \n", - " # Should return a dict." 
+ " def act(self, inputs, training=False):\n", + " inputs = torch.tensor(inputs)\n", + " if inputs.ndim < 2:\n", + " inputs = inputs.unsqueeze(0)\n", + " inputs = inputs.cuda()\n", + " \n", + " batch_size = inputs.shape[0]\n", + "\n", + " < insert your code here >\n", + " normal_distr = MultivariateNormal(means, covar_matrix)\n", + "\n", + " actions = normal_distr.sample()\n", + " log_probs = normal_distr.log_prob(actions)\n", + "\n", + " values = self.model.get_value(inputs)\n", + "\n", + " if not training:\n", + " return {'actions': actions.cpu().numpy().tolist()[0], \n", + " 'log_probs': log_probs[0].detach().cpu().numpy(),\n", + " 'values': values[0].detach().cpu().numpy()}\n", + " else:\n", + " return {'distribution': normal_distr, 'values': values}" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "-kML-jSsNCV2" + }, "source": [ "We will use `EnvRunner` to perform interactions with an environment with a policy for a fixed number of timesteps. Calling `.get_next()` on a runner will return a trajectory — dictionary \n", "containing keys\n", @@ -212,18 +326,19 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "5LeiuohcNCV3" + }, "outputs": [], "source": [ "class AsArray:\n", - " \"\"\" \n", - " Converts lists of interactions to ndarray.\n", - " \"\"\"\n", - " def __call__(self, trajectory):\n", - " # Modify trajectory inplace. \n", - " for k, v in filter(lambda kv: kv[0] != \"state\",\n", - " trajectory.items()):\n", - " trajectory[k] = np.asarray(v)" + " \"\"\" \n", + " Converts lists of interactions to ndarray.\n", + " \"\"\"\n", + " def __call__(self, trajectory):\n", + " # Modify trajectory inplace. 
\n", + " for k, v in filter(lambda kv: kv[0] != \"state\", trajectory.items()):\n", + " trajectory[k] = np.asarray(v)" ] }, { @@ -231,15 +346,94 @@ "execution_count": null, "metadata": {}, "outputs": [], + "source": [ + "\"\"\" RL env runner \"\"\"\n", + "from collections import defaultdict\n", + "\n", + "import numpy as np\n", + "\n", + "\n", + "class EnvRunner:\n", + " \"\"\" Reinforcement learning runner in an environment with given policy \"\"\"\n", + "\n", + " def __init__(self, env, policy, nsteps, transforms=None, step_var=None):\n", + " self.env = env\n", + " self.policy = policy\n", + " self.nsteps = nsteps\n", + " self.transforms = transforms or []\n", + " self.step_var = step_var if step_var is not None else 0\n", + " self.state = {\"latest_observation\": self.env.reset()[0]}\n", + "\n", + " @property\n", + " def nenvs(self):\n", + " \"\"\" Returns number of batched envs or `None` if env is not batched \"\"\"\n", + " return getattr(self.env.unwrapped, \"nenvs\", None)\n", + "\n", + " def reset(self, **kwargs):\n", + " \"\"\" Resets env and runner states. \"\"\"\n", + " self.state[\"latest_observation\"], info = self.env.reset(**kwargs)\n", + " self.policy.reset()\n", + "\n", + " def get_next(self):\n", + " \"\"\" Runs the agent in the environment. 
\"\"\"\n", + " trajectory = defaultdict(list, {\"actions\": []})\n", + " observations = []\n", + " rewards = []\n", + " resets = []\n", + " self.state[\"env_steps\"] = self.nsteps\n", + "\n", + " for i in range(self.nsteps):\n", + " observations.append(self.state[\"latest_observation\"])\n", + " act = self.policy.act(self.state[\"latest_observation\"])\n", + " if \"actions\" not in act:\n", + " raise ValueError(\"result of policy.act must contain 'actions' \"\n", + " f\"but has keys {list(act.keys())}\")\n", + " for key, val in act.items():\n", + " trajectory[key].append(val)\n", + "\n", + " obs, rew, terminated, truncated, _ = self.env.step(trajectory[\"actions\"][-1])\n", + " done = np.logical_or(terminated, truncated)\n", + " self.state[\"latest_observation\"] = obs\n", + " rewards.append(rew)\n", + " resets.append(done)\n", + " self.step_var += self.nenvs or 1\n", + "\n", + " # Only reset if the env is not batched. Batched envs should\n", + " # auto-reset.\n", + " if not self.nenvs and np.all(done):\n", + " self.state[\"env_steps\"] = i + 1\n", + " self.state[\"latest_observation\"] = self.env.reset()[0]\n", + "\n", + " trajectory.update(\n", + " observations=observations,\n", + " rewards=rewards,\n", + " resets=resets)\n", + " trajectory[\"state\"] = self.state\n", + "\n", + " for transform in self.transforms:\n", + " transform(trajectory)\n", + " return trajectory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wNzPNxKLNCV4", + "outputId": "e15b29e7-1864-45db-e67a-3f1994b77a81" + }, + "outputs": [], "source": [ "import numpy as np\n", - "from runners import EnvRunner\n", "\n", "class DummyPolicy:\n", - " def act(self, inputs, training=False):\n", - " assert not training\n", - " return {\"actions\": np.random.randn(6), \"values\": np.nan}\n", - " \n", + " def act(self, inputs, training=False):\n", + " assert not training\n", + " return {\"actions\": 
np.random.randn(6), \"values\": np.nan}\n", + "\n", "runner = EnvRunner(env, DummyPolicy(), 3,\n", " transforms=[AsArray()])\n", "trajectory = runner.get_next()\n", @@ -249,7 +443,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "NyhcvAMZNCV5" + }, "source": [ "You will need to implement the following two transformations. \n", "\n", @@ -281,23 +477,66 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "AC5rUuiKNCV6" + }, "outputs": [], "source": [ + "import os\n", "class GAE:\n", - " \"\"\" Generalized Advantage Estimator. \"\"\"\n", - " def __init__(self, policy, gamma=0.99, lambda_=0.95):\n", - " self.policy = policy\n", - " self.gamma = gamma\n", - " self.lambda_ = lambda_\n", + " \"\"\" Generalized Advantage Estimator. \"\"\"\n", + " def __init__(self, policy, gamma=0.99, lambda_=0.95):\n", + " self.policy = policy\n", + " self.gamma = gamma\n", + " self.lambda_ = lambda_\n", + "\n", + " def __call__(self, trajectory):\n", + " gamma = self.gamma\n", + " lambda_ = self.lambda_\n", + " \n", + " < insert your code here >" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def test_gae():\n", + " !curl -O https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week09_policy_II/test_ppo/actions.npy\n", + " !curl -O https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week09_policy_II/test_ppo/log_probs.npy\n", + " !curl -O https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week09_policy_II/test_ppo/values.npy\n", + " !curl -O https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week09_policy_II/test_ppo/observations.npy\n", + " !curl -O https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week09_policy_II/test_ppo/rewards.npy\n", + " !curl -O 
https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week09_policy_II/test_ppo/resets.npy\n", + " !curl -O https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week09_policy_II/test_ppo/state.npy\n", + " !curl -O https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week09_policy_II/test_ppo/advantages.npy\n", + " !curl -O https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week09_policy_II/test_ppo/value_targets.npy\n", + " !curl -O https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week09_policy_II/test_ppo/policy\n", + "\n", + " trajectory = {}\n", + " for key in ['actions', 'log_probs', 'values', 'observations', 'rewards', 'resets']:\n", + " trajectory[key] = np.load(f'{key}.npy', allow_pickle=True)\n", + " trajectory['state'] = {\"latest_observation\": np.load('state.npy')}\n", " \n", - " def __call__(self, trajectory):\n", - " " + " policy = torch.load(f'policy')\n", + " gae_to_test = GAE(policy, gamma=0.99, lambda_=0.95)\n", + " \n", + " gae_to_test(trajectory)\n", + " \n", + " for key in ['advantages', 'value_targets']:\n", + " assert np.allclose(np.load(f'{key}.npy'), trajectory[key], atol=2e-2)\n", + " \n", + " print(\"It's all good!\")\n", + "test_gae()" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "yce0A8x7NCV7" + }, "source": [ "The main advantage of PPO over simpler policy based methods like A2C is that it is possible\n", "to train on the same trajectory for multiple gradient steps. The following class wraps \n", @@ -308,56 +547,96 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "J7tIjwbkNCV8" + }, "outputs": [], "source": [ "class TrajectorySampler:\n", - " \"\"\" Samples minibatches from trajectory for a number of epochs. 
\"\"\"\n", - " def __init__(self, runner, num_epochs, num_minibatches, transforms=None):\n", - " self.runner = runner\n", - " self.num_epochs = num_epochs\n", - " self.num_minibatches = num_minibatches\n", - " self.transforms = transforms or []\n", - " self.minibatch_count = 0\n", - " self.epoch_count = 0\n", - " self.trajectory = None\n", - " \n", - " def shuffle_trajectory(self):\n", - " \"\"\" Shuffles all elements in trajectory.\n", - " \n", - " Should be called at the beginning of each epoch.\n", - " \"\"\"\n", - " \n", - " \n", - " def get_next(self):\n", - " \"\"\" Returns next minibatch. \"\"\"\n", - " " + " \"\"\" Samples minibatches from trajectory for a number of epochs. \"\"\"\n", + " def __init__(self, runner, num_epochs, num_minibatches, transforms=None):\n", + " self.runner = runner\n", + " self.num_epochs = num_epochs\n", + " self.num_minibatches = num_minibatches\n", + " self.transforms = transforms or []\n", + " self.minibatch_count = 0\n", + " self.epoch_count = 0\n", + " self.trajectory = None\n", + "\n", + " def shuffle_trajectory(self):\n", + " \"\"\" Shuffles all elements in trajectory.\n", + "\n", + " Should be called at the beginning of each epoch.\n", + " \"\"\"\n", + " trajectory_len = self.trajectory[\"observations\"].shape[0]\n", + "\n", + " permutation = np.random.permutation(trajectory_len)\n", + " for key, value in self.trajectory.items():\n", + " if key != 'state':\n", + " self.trajectory[key] = value[permutation]\n", + "\n", + " def get_next(self):\n", + " \"\"\" Returns next minibatch. 
\"\"\"\n", + " if not self.trajectory:\n", + " self.trajectory = self.runner.get_next()\n", + "\n", + " if self.minibatch_count == self.num_minibatches:\n", + " self.shuffle_trajectory()\n", + " self.minibatch_count = 0\n", + " self.epoch_count += 1\n", + "\n", + " if self.epoch_count == self.num_epochs:\n", + " self.trajectory = self.runner.get_next()\n", + "\n", + " self.shuffle_trajectory()\n", + " self.minibatch_count = 0\n", + " self.epoch_count = 0\n", + "\n", + " trajectory_len = self.trajectory[\"observations\"].shape[0]\n", + "\n", + " batch_size = trajectory_len//self.num_minibatches\n", + "\n", + " minibatch = {}\n", + " for key, value in self.trajectory.items():\n", + " if key != 'state':\n", + " minibatch[key] = value[self.minibatch_count*batch_size: (self.minibatch_count + 1)*batch_size]\n", + "\n", + " self.minibatch_count += 1\n", + "\n", + " for transform in self.transforms:\n", + " transform(minibatch)\n", + "\n", + " return minibatch" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "U1UgHPb0NCV8" + }, "source": [ - "A common trick to use with GAE is to normalize advantages, the following transformation does that. " + "A common trick to use with GAE is to normalize advantages, please implement the normalization." ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "GZrUlmFYNCV9" + }, "outputs": [], "source": [ "class NormalizeAdvantages:\n", - " \"\"\" Normalizes advantages to have zero mean and variance 1. \"\"\"\n", - " def __call__(self, trajectory):\n", - " adv = trajectory[\"advantages\"]\n", - " adv = (adv - adv.mean()) / (adv.std() + 1e-8)\n", - " trajectory[\"advantages\"] = adv" + " \"\"\" Normalizes advantages to have zero mean and variance 1. \"\"\"\n", + " def __call__(self, trajectory):\n", + " < insert your code here >" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "DXnare-INCV-" + }, "source": [ "Finally, we can create our PPO runner. 
" ] @@ -365,28 +644,32 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "tx0Yr0GtNCV_" + }, "outputs": [], "source": [ "def make_ppo_runner(env, policy, num_runner_steps=2048,\n", " gamma=0.99, lambda_=0.95, \n", " num_epochs=10, num_minibatches=32):\n", - " \"\"\" Creates runner for PPO algorithm. \"\"\"\n", - " runner_transforms = [AsArray(),\n", + " \"\"\" Creates runner for PPO algorithm. \"\"\"\n", + " runner_transforms = [AsArray(),\n", " GAE(policy, gamma=gamma, lambda_=lambda_)]\n", - " runner = EnvRunner(env, policy, num_runner_steps, \n", + " runner = EnvRunner(env, policy, num_runner_steps, \n", " transforms=runner_transforms)\n", - " \n", - " sampler_transforms = [NormalizeAdvantages()]\n", - " sampler = TrajectorySampler(runner, num_epochs=num_epochs, \n", + "\n", + " sampler_transforms = [NormalizeAdvantages()]\n", + " sampler = TrajectorySampler(runner, num_epochs=num_epochs, \n", " num_minibatches=num_minibatches,\n", " transforms=sampler_transforms)\n", - " return sampler" + " return sampler" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "MX1aZueWNCWA" + }, "source": [ "In the next cell you will need to implement Proximal Policy Optimization algorithm itself. 
The algorithm\n", "modifies the typical policy gradient loss in the following way:\n", @@ -429,43 +712,55 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "JYGV4EmxNCWB" + }, "outputs": [], "source": [ "class PPO:\n", - " def __init__(self, policy, optimizer,\n", + " def __init__(self, policy, optimizer,\n", " cliprange=0.2,\n", " value_loss_coef=0.25,\n", " max_grad_norm=0.5):\n", - " self.policy = policy\n", - " self.optimizer = optimizer\n", - " self.cliprange = cliprange\n", - " self.value_loss_coef = value_loss_coef\n", - " # Note that we don't need entropy regularization for this env.\n", - " self.max_grad_norm = max_grad_norm\n", - " \n", - " def policy_loss(self, trajectory, act):\n", - " \"\"\" Computes and returns policy loss on a given trajectory. \"\"\"\n", - " \n", - " \n", - " def value_loss(self, trajectory, act):\n", - " \"\"\" Computes and returns value loss on a given trajectory. \"\"\"\n", - " \n", - " \n", - " def loss(self, trajectory):\n", - " act = self.policy.act(trajectory[\"observations\"], training=True)\n", - " policy_loss = self.policy_loss(trajectory, act)\n", - " value_loss = self.value_loss(trajectory, act)\n", - " return policy_loss + self.value_loss_coef * value_loss\n", - " \n", - " def step(self, trajectory):\n", - " \"\"\" Computes the loss function and performs a single gradient step. \"\"\"\n", - " " + " self.policy = policy\n", + " self.optimizer = optimizer\n", + " self.cliprange = cliprange\n", + " self.value_loss_coef = value_loss_coef\n", + " # Note that we don't need entropy regularization for this env.\n", + " self.max_grad_norm = max_grad_norm\n", + "\n", + " def policy_loss(self, trajectory, act):\n", + " \"\"\" Computes and returns policy loss on a given trajectory. \"\"\"\n", + " < insert your code here >\n", + "\n", + " def value_loss(self, trajectory, act):\n", + " \"\"\" Computes and returns value loss on a given trajectory. 
\"\"\"\n", + " < insert your code here >\n", + "\n", + " def loss(self, trajectory):\n", + " act = self.policy.act(trajectory[\"observations\"], training=True)\n", + " policy_loss = self.policy_loss(trajectory, act)\n", + " value_loss = self.value_loss(trajectory, act)\n", + "\n", + " return policy_loss + self.value_loss_coef * value_loss\n", + "\n", + " def step(self, trajectory):\n", + " \"\"\" Computes the loss function and performs a single gradient step. \"\"\"\n", + " self.optimizer.zero_grad()\n", + " loss = self.loss(trajectory)\n", + "\n", + " loss.backward()\n", + "\n", + " grad_norm = nn.utils.clip_grad_norm_(self.policy.model.parameters(), self.max_grad_norm)\n", + "\n", + " self.optimizer.step()" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "AVNS0IFhNCWB" + }, "source": [ "Now everything is ready to do training. In one million of interactions it should be possible to \n", "achieve the total raw reward of about 1500. You should plot this quantity with respect to \n", @@ -486,6 +781,108 @@ "from 3e-4 to 0 and epsilon 1e-5." 
] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WmroaOHX33nZ" + }, + "outputs": [], + "source": [ + "model = PolicyModel()\n", + "model = model.cuda()\n", + "\n", + "policy = Policy(model)\n", + "\n", + "runner = make_ppo_runner(env, policy)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Psvv8oru3POG" + }, + "outputs": [], + "source": [ + "from IPython.display import clear_output\n", + "from matplotlib import pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 312 + }, + "id": "2W5StJsb1mz2", + "outputId": "7a902763-0cc7-4dfe-9746-3511fbe555f5" + }, + "outputs": [], + "source": [ + "optimizer = torch.optim.Adam(policy.model.parameters(), lr = 3e-4, eps=1e-5)\n", + "epochs = 250000\n", + "\n", + "lr_mult = lambda epoch: (1 - (epoch/epochs))\n", + "sched = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_mult)\n", + "\n", + "ppo = PPO(policy, optimizer)\n", + "\n", + "for epoch in tqdm(range(epochs)):\n", + " trajectory = runner.get_next()\n", + "\n", + " if (epoch + 1) % 100 == 0:\n", + " clear_output(True)\n", + " rewards = np.array(env.env.episode_rewards)\n", + "\n", + " if rewards.size > 0:\n", + " plt.plot(rewards[:, 0], rewards[:, 1], label = \"episode rewards\")\n", + " plt.title(\"Reward\")\n", + " plt.xlabel(\"Total steps\")\n", + " plt.ylabel(\"Reward\")\n", + " plt.grid()\n", + " plt.show()\n", + "\n", + " ppo.step(trajectory)\n", + " sched.step()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 295 + }, + "id": "OxD3w7uWbsYt", + "outputId": "551690d0-f96d-406a-efc7-73ff50508c70" + }, + "outputs": [], + "source": [ + "rewards = np.array(env.env.episode_rewards)\n", + "\n", + "if rewards.size > 0:\n", + " plt.plot(rewards[:, 0], rewards[:, 1], label = \"episode rewards\")\n", + " 
plt.title(\"Reward\")\n", + " plt.xlabel(\"Total steps\")\n", + " plt.ylabel(\"Reward\")\n", + " plt.ylim(-1000, 2500)\n", + " plt.grid()\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sixQI_jMASJR" + }, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -495,6 +892,13 @@ } ], "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "Копия блокнота \"Копия блокнота \"Копия блокнота \"Копия блокнота \"ppo.ipynb\"\"\"\"", + "provenance": [], + "toc_visible": true + }, "kernelspec": { "display_name": "Python 3", "language": "python", @@ -510,9 +914,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.8.5" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 1 } diff --git a/week09_policy_II/seminar_TRPO_pytorch.ipynb b/week09_policy_II/seminar_TRPO_pytorch.ipynb index f00b906a3..f2d8dcc8f 100644 --- a/week09_policy_II/seminar_TRPO_pytorch.ipynb +++ b/week09_policy_II/seminar_TRPO_pytorch.ipynb @@ -6,17 +6,16 @@ "metadata": {}, "outputs": [], "source": [ - "import sys\n", - "if 'google.colab' in sys.modules:\n", - " import os\n", + "import sys, os\n", + "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", "\n", - " os.system('apt-get install -y xvfb')\n", - " os.system('wget https://raw.githubusercontent.com/yandexdataschool/Practical_DL/fall18/xvfb -O ../xvfb')\n", - " os.system('apt-get install -y python-opengl ffmpeg')\n", - " os.system('pip install pyglet==1.2.4')\n", + " !pip install -q gymnasium\n", "\n", - "# launch XVFB if you run on a server\n", - "import os\n", + " !touch .setup_complete\n", + "\n", + "# This code creates a virtual display to draw game images on.\n", + "# It will have no effect if your machine has a 
monitor.\n", "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", " !bash ../xvfb start\n", " os.environ['DISPLAY'] = ':1'" @@ -39,35 +38,30 @@ "metadata": {}, "outputs": [], "source": [ + "from typing import Tuple\n", + "\n", "import numpy as np\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", - "from torch.autograd import Variable" + "\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mWARN: gym.spaces.Box autodetected dtype as . Please provide explicit dtype.\u001b[0m\n", - "Observation Space Box(6,)\n", - "Action Space Discrete(3)\n" - ] - } - ], - "source": [ - "import gym\n", - "\n", - "env = gym.make(\"Acrobot-v1\")\n", + "outputs": [], + "source": [ + "import gymnasium as gym\n", + "\n", + "env = gym.make(\"Acrobot-v1\", render_mode=\"rgb_array\")\n", "env.reset()\n", "observation_shape = env.observation_space.shape\n", "n_actions = env.action_space.n\n", + "\n", "print(\"Observation Space\", env.observation_space)\n", "print(\"Action Space\", env.action_space)" ] @@ -76,32 +70,9 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAQsAAAD8CAYAAABgtYFHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAADjJJREFUeJzt3X3I3Wd9x/H3Z+mDborpw70Qkkgqhkn/2GpzUyvKcC2O2onpH1VaZAYJBDYHFQcu3WBD2B+6P6wKQw2rLA617XygoXRzXVoZ+8PaO/bBPqz2rrQ0oZqobd0Q3arf/XGu6DGmua8793lM3i84nOt3/a7fOd9TTj69fr9znXOnqpCklfzGtAuQNB8MC0ldDAtJXQwLSV0MC0ldDAtJXcYSFkmuSvJ4kuUke8bxHJImK6NeZ5FkHfBt4K3AIeA+4PqqenSkTyRposYxs7gMWK6q71TV/wK3ADvG8DySJuisMTzmJuCZoe1DwBtOdsCFF15YW7duHUMpko45ePDg96tq4VSPH0dYdEmyG9gN8OpXv5qlpaVplSKdEZI8vZbjx3EachjYMrS9ufX9iqraW1WLVbW4sHDKYSdpQsYRFvcB25JclOQc4Dpg/xieR9IEjfw0pKpeTPJnwFeBdcBnquqRUT+PpMkayzWLqroTuHMcjy1pOlzBKamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpy4phkeQzSY4keXio7/wkdyV5ot2f1/qT5BNJlpM8lOTScRYvaXJ6Zhb/CFx1XN8e4EBVbQMOtG2AtwHb2m038MnRlClp2lYMi6r6D+CHx3XvAPa19j7gmqH+z9bA14H1STaOqlhJ03Oq1yw2VNWzrf1dYENrbwKeGRp3qPX9miS7kywlWTp69OgpliFpUtZ8gbOqCqhTOG5vVS1W1eLCwsJay5A0ZqcaFt87dnrR7o+0/sPAlqFxm1ufpDl3qmGxH9jZ2juB24f639M+FbkceGHodEXSHDtrpQFJvgC8BbgwySHgb4APA7cl2QU8DbyrDb8TuBpYBn4MvHcMNUuaghXDoqquf4ldV55gbAHvW2tRkmaPKzgldTEsJHUxLCR1MSwkdTEsJHUxLCR1MSwkdTEsJHUxLCR1yWDR5ZSLSKZfhHT6O1hVi6d68IrLvSdh+/btLC0tTbsM6bSWZE3HexoiqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIanLimGRZEuSe5I8muSRJDe0/vOT3JXkiXZ/XutPkk8kWU7yUJJLx/0iJI1fz8ziReDPq+pi4HLgfUkuBvYAB6pqG3CgbQO8DdjWbruBT468akkTt2JYVNWzVfXN1v5v4DFgE7AD2NeG7QOuae0dwGdr4OvA+iQbR165pIla1TWLJFuB1wP3Ahuq6tm267vAhtbeBDwzdNih1idpjnWHRZJXAF8C3l9VPxreV4O/rryqP26cZHeSpSRLR48eXc2hkqagKyySnM0gKD5XVV9u3d87dnrR7o+0/sPAlqHDN7e+X1FVe6tqsaoWFxYWTrV+SRPS82lIgJuBx6rqo0O79gM7W3sncPtQ/3vapyKXAy8Mna5ImlNndYx5E/DHwLeSPND6/hL4MHBbkl3A08C72r47gauBZeDHwHtHWrGkqVgxLKrqP4G8xO4rTzC+gPetsS5JM8YVnJK6GBaSuhgWkroYFpK6GBaSuhgWkroYFpK6GBaSuhgWkroYFpK6GBaSuvR8kUz6hYMHf/VrQtu
3r+pnTDTHnFmo2/FB8VJ9Oj0ZFupyslAwMM4MhoVW1BMGBsbpz7CQ1MWwkNTFsJDUxbDQihZZGskYzTfDQl1OFgYGxZnBsFC3E4WCQXHmcAWnVsVwOHM5s5DUxbCQ1MWwkNTFsJDUxbCQ1MWwkNTFsJDUxbCQ1MWwkNTFsJDUxbCQ1MWwkNTFsJDUZcWwSPKyJN9I8mCSR5J8qPVflOTeJMtJbk1yTus/t20vt/1bx/sSJE1Cz8zip8AVVfV7wCXAVUkuBz4C3FRVrwWeA3a18buA51r/TW2cpDm3YljUwP+0zbPbrYArgC+2/n3ANa29o23T9l+ZxN+Jl+Zc1zWLJOuSPAAcAe4CngSer6oX25BDwKbW3gQ8A9D2vwBccILH3J1kKcnS0aNH1/YqNHW1ffu0S9CYdYVFVf2sqi4BNgOXAa9b6xNX1d6qWqyqxYWFhbU+nKQxW9WnIVX1PHAP8EZgfZJjP8u3GTjc2oeBLQBt/6uAH4ykWklT0/NpyEKS9a39cuCtwGMMQuPaNmwncHtr72/btP13V5V/aluacz0/2LsR2JdkHYNwua2q7kjyKHBLkr8F7gdubuNvBv4pyTLwQ+C6MdQtacJWDIuqegh4/Qn6v8Pg+sXx/T8B3jmS6iTNDFdwSupiWOikcvDgtEvQjDAsJHUxLCR1MSwkdTEsJHUxLCR1MSwkdTEsJHUxLCR1MSwkdTEsJHUxLCR1MSwkdTEsJHUxLCR1MSy0KkssssTitMvQFPT8rJ70awFxbHuRpWmUoylwZqEVnWwm4SzjzGFYSOpiWOikemYOzi7ODIaFpC6GhaQuhoVOqufTDj8ROTMYFpK6GBZa0clmDs4qzhwuytJJ1fbt5OBBQ0HOLDQa/uWy059hIamLYSGpi2EhqYthIamLYSGpi2EhqUt3WCRZl+T+JHe07YuS3JtkOcmtSc5p/ee27eW2f+t4Spc0SauZWdwAPDa0/RHgpqp6LfAcsKv17wKea/03tXGS5lxXWCTZDPwR8A9tO8AVwBfbkH3ANa29o23T9l/ZxkuaY70zi48BHwR+3rYvAJ6vqhfb9iFgU2tvAp4BaPtfaOMlzbEVwyLJ24EjVTXS9bxJdidZSrJ09OjRUT60Rqy2b592CZoBPTOLNwHvSPIUcAuD04+PA+uTHPsi2mbgcGsfBrYAtP2vAn5w/INW1d6qWqyqxYWFhTW9CM0Gvx9yelsxLKrqxqraXFVbgeuAu6vq3cA9wLVt2E7g9tbe37Zp+++uqhpp1ZImbi3rLP4C+ECSZQbXJG5u/TcDF7T+DwB71laipFmwqt+zqKqvAV9r7e8Al51gzE+Ad46gNkkzxBWckroYFpK6GBaSuhgWkroYFpK6GBaSuhgW6uKSbxkWGimXfJ++DAtJXQwLSV0MC0ldDAtJXQwLSV0MC0ldDAtJXQwLSV0MC0ldDAtJXQwLSV0MC0ldDAtJXQwLSV0MC0ldDAtJXQwLdfPXss5shoVGzl/LOj0ZFpK6GBaSuqzqDyNLXrc4czmzkNTFsJDUxbCQ1MWwkNTFsJDUxbCQ1KUrLJI8leRbSR5IstT6zk9yV5In2v15rT9JPpFkOclDSS4d5wuQNBmrmVn8QVVdUlWLbXsPcKCqtgEH2jbA24Bt7bYb+OSoipU0PWs5DdkB7GvtfcA1Q/2frYGvA+uTbFzD80iaAb0rOAv4tyQFfLqq9gIbqurZtv+7wIbW3gQ8M3Tsodb37FAfSXYzmHkA/DTJw6dQ/7RcCHx/2kV0mqdaYb7qnadaAX5nLQf3hsWbq+pwkt8G7kryX8M7q6pakHRrgbMXIMnS0OnNzJuneuepVpiveuepVhjUu5bju05Dqupwuz8CfAW4DPjesdOLdn+kDT8MbBk6fHPrkzTHVgyLJL+V5JXH2sAfAg8D+4GdbdhO4PbW3g+8p30qcjnwwtDpiqQ51XMasgH4SpJj4z9fVf+a5D7gtiS7gKeBd7XxdwJXA8vAj4H3djz
H3tUWPmXzVO881QrzVe881QprrDdVq7rUIOkM5QpOSV2mHhZJrkryeFvxuWflI8Zez2eSHBn+KHeWV6sm2ZLkniSPJnkkyQ2zWnOSlyX5RpIHW60fav0XJbm31XRrknNa/7lte7nt3zqpWodqXpfk/iR3zEGt411pXVVTuwHrgCeB1wDnAA8CF0+5pt8HLgUeHur7O2BPa+8BPtLaVwP/AgS4HLh3CvVuBC5t7VcC3wYunsWa23O+orXPBu5tNdwGXNf6PwX8SWv/KfCp1r4OuHUK/30/AHweuKNtz3KtTwEXHtc3svfBRF/MCV7cG4GvDm3fCNw4zZpaHVuPC4vHgY2tvRF4vLU/DVx/onFTrP124K2zXjPwm8A3gTcwWNh01vHvCeCrwBtb+6w2LhOscTODrzJcAdzR/mHNZK3teU8UFiN7H0z7NOSlVnvOmtWuVp2KNvV9PYP/Y89kzW1a/wCDdTl3MZhZPl9VL56gnl/U2va/AFwwqVqBjwEfBH7eti9gdmuFX660PthWSMMI3wf+YO8qVa1+teokJHkF8CXg/VX1o/ZRNzBbNVfVz4BLkqxnsMDvdVMu6YSSvB04UlUHk7xl2vV0GvlK62HTnlnMy2rPmV6tmuRsBkHxuar6cuue6Zqr6nngHgZT+fVJjv2Pa7ieX9Ta9r8K+MGESnwT8I4kTwG3MDgV+fiM1gqMf6X1tMPiPmBbu8J8DoMLQ/unXNOJzOxq1QymEDcDj1XVR4d2zVzNSRbajIIkL2dwbeUxBqFx7UvUeuw1XAvcXe0Ee9yq6saq2lxVWxm8L++uqnfPYq0woZXWk7wA8xIXZa5mcAX/SeCvZqCeLzD4huz/MTiP28Xg3PMA8ATw78D5bWyAv2+1fwtYnEK9b2ZwrvoQ8EC7XT2LNQO/C9zfan0Y+OvW/xrgGwxW/f4zcG7rf1nbXm77XzOl98Rb+OWnITNZa6vrwXZ75Ni/pVG+D1zBKanLtE9DJM0Jw0JSF8NCUhfDQlIXw0JSF8NCUhfDQlIXw0JSl/8Huhr8fpmXAZ4AAAAASUVORK5CYII=\n", - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "plt.imshow(env.render('rgb_array'))" + "plt.imshow(env.render())" ] }, { @@ -110,9 +81,9 @@ "source": [ "### Step 1: Defining a network\n", "\n", - "With all it's complexity, at it's core TRPO is yet another policy gradient method. \n", + "With all it's complexity, at it's core TRPO is yet another policy gradient method.\n", "\n", - "This essentially means we're actually training a stochastic policy $ \\pi_\\theta(a|s) $. \n", + "This essentially means we're actually training a stochastic policy $\\pi_\\theta \\left( a \\middle| s \\right)$.\n", "\n", "And yes, it's gonna be a neural network. So let's start by defining one." 
] @@ -124,43 +95,46 @@ "outputs": [], "source": [ "class TRPOAgent(nn.Module):\n", - " def __init__(self, state_shape, n_actions, hidden_size=32):\n", + " def __init__(self, state_shape: Tuple[int], n_actions: int):\n", " '''\n", " Here you should define your model\n", " You should have LOG-PROBABILITIES as output because you will need it to compute loss\n", - " We recommend that you start simple: \n", + " We recommend that you start simple:\n", " use 1-2 hidden layers with 100-500 units and relu for the first try\n", " '''\n", - " nn.Module.__init__(self)\n", + " super().__init__()\n", "\n", - " \n", - " self.model = None\n", + " assert isinstance(state_shape, tuple)\n", + " assert len(state_shape) == 1\n", + " input_dim = state_shape[0]\n", + " \n", + " # Prepare your model here.\n", + " \n", "\n", - " def forward(self, states):\n", + " def forward(self, states: torch.Tensor):\n", " \"\"\"\n", - " takes agent's observation (Variable), returns log-probabilities (Variable)\n", + " takes agent's observation, returns log-probabilities\n", " :param state_t: a batch of states, shape = [batch_size, state_shape]\n", " \"\"\"\n", "\n", - " # Use your network to compute log_probs for given state\n", - " log_probs = self.model(states)\n", + " # Use your network to compute log_probs for the given states.\n", + " \n", + " \n", " return log_probs\n", "\n", - " def get_log_probs(self, states):\n", + " def get_log_probs(self, states: torch.Tensor):\n", " '''\n", " Log-probs for training\n", " '''\n", - "\n", " return self.forward(states)\n", "\n", - " def get_probs(self, states):\n", + " def get_probs(self, states: torch.Tensor):\n", " '''\n", " Probs for interaction\n", " '''\n", - "\n", " return torch.exp(self.forward(states))\n", "\n", - " def act(self, obs, sample=True):\n", + " def act(self, obs: np.ndarray, sample: bool = True):\n", " '''\n", " Samples action from policy distribution (sample = True) or takes most likely action (sample = False)\n", " :param: obs - single 
observation vector\n", @@ -168,7 +142,8 @@ " :returns: action (single integer) and probabilities for all actions\n", " '''\n", "\n", - " probs = self.get_probs(Variable(torch.FloatTensor([obs]))).data.numpy()\n", + " with torch.no_grad():\n", + " probs = self.get_probs(torch.tensor(obs[np.newaxis], dtype=torch.float32)).numpy()\n", "\n", " if sample:\n", " action = int(np.random.choice(n_actions, p=probs[0]))\n", @@ -185,37 +160,21 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sampled: [(2, array([0.35253003, 0.37892205, 0.26854792], dtype=float32)), (2, array([0.35269254, 0.37673423, 0.27057323], dtype=float32)), (0, array([0.35406563, 0.37682924, 0.26910514], dtype=float32)), (0, array([0.3560282 , 0.37561142, 0.2683604 ], dtype=float32)), (1, array([0.35539204, 0.37685862, 0.26774937], dtype=float32))]\n", - "greedy: [(1, array([0.3518883 , 0.37830737, 0.2698043 ], dtype=float32)), (1, array([0.3544095 , 0.37609497, 0.26949552], dtype=float32)), (1, array([0.35528135, 0.37493262, 0.269786 ], dtype=float32)), (1, array([0.3589018 , 0.37457928, 0.26651892], dtype=float32)), (1, array([0.35414994, 0.3769723 , 0.26887777], dtype=float32))]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.5/dist-packages/torch/nn/modules/container.py:67: UserWarning: Implicit dimension choice for log_softmax has been deprecated. 
Change the call to include dim=X as an argument.\n", - " input = module(input)\n" - ] - } - ], + "outputs": [], "source": [ "# Check if log-probabilities satisfies all the requirements\n", - "log_probs = agent.get_log_probs(Variable(torch.FloatTensor([env.reset()])))\n", - "assert isinstance(\n", - " log_probs, Variable) and log_probs.requires_grad, \"qvalues must be a torch variable with grad\"\n", - "assert len(\n", - " log_probs.shape) == 2 and log_probs.shape[0] == 1 and log_probs.shape[1] == n_actions\n", - "sums = torch.sum(torch.exp(log_probs), dim=1)\n", - "assert (0.999 < sums).all() and (1.001 > sums).all()\n", + "log_probs = agent.get_log_probs(torch.tensor(env.reset()[0][np.newaxis], dtype=torch.float32))\n", + "assert (\n", + " isinstance(log_probs, torch.Tensor) and\n", + " log_probs.requires_grad\n", + "), \"log_probs must be a torch.Tensor with grad\"\n", + "assert log_probs.shape == (1, n_actions)\n", + "sums = torch.exp(log_probs).sum(dim=1)\n", + "assert torch.allclose(sums, torch.ones_like(sums))\n", "\n", "# Demo use\n", - "print(\"sampled:\", [agent.act(env.reset()) for _ in range(5)])\n", - "print(\"greedy:\", [agent.act(env.reset(), sample=False) for _ in range(5)])" + "print(\"sampled:\", [agent.act(env.reset()[0]) for _ in range(5)])\n", + "print(\"greedy:\", [agent.act(env.reset()[0], sample=False) for _ in range(5)])" ] }, { @@ -234,10 +193,7 @@ "outputs": [], "source": [ "def get_flat_params_from(model):\n", - " params = []\n", - " for param in model.parameters():\n", - " params.append(param.data.view(-1))\n", - "\n", + " params = [torch.ravel(param.detach()) for param in model.parameters()]\n", " flat_params = torch.cat(params)\n", " return flat_params\n", "\n", @@ -245,9 +201,10 @@ "def set_flat_params_to(model, flat_params):\n", " prev_ind = 0\n", " for param in model.parameters():\n", - " flat_size = int(np.prod(list(param.size())))\n", + " flat_size = int(np.prod(list(param.shape)))\n", " param.data.copy_(\n", - " 
flat_params[prev_ind:prev_ind + flat_size].view(param.size()))\n", + " flat_params[prev_ind:prev_ind + flat_size].reshape(param.shape)\n", + " )\n", " prev_ind += flat_size" ] }, @@ -255,7 +212,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Compute cummulative reward just like you did in vanilla REINFORCE" + "Compute cumulative reward just like you did in vanilla REINFORCE" ] }, { @@ -267,9 +224,9 @@ "import scipy.signal\n", "\n", "\n", - "def get_cummulative_returns(r, gamma=1):\n", + "def get_cumulative_returns(r, gamma=1):\n", " \"\"\"\n", - " Computes cummulative discounted rewards given immediate rewards\n", + " Computes cumulative discounted rewards given immediate rewards\n", " G_i = r_i + gamma*r_{i+1} + gamma^2*r_{i+2} + ...\n", " Also known as R(s,a).\n", " \"\"\"\n", @@ -285,7 +242,7 @@ "outputs": [], "source": [ "# simple demo on rewards [0,0,1,0,0,1]\n", - "get_cummulative_returns([0, 0, 1, 0, 0, 1], gamma=0.9)" + "get_cumulative_returns([0, 0, 1, 0, 0, 1], gamma=0.9)" ] }, { @@ -314,22 +271,23 @@ " total_timesteps = 0\n", " while total_timesteps < n_timesteps:\n", " obervations, actions, rewards, action_probs = [], [], [], []\n", - " obervation = env.reset()\n", + " obervation, _ = env.reset()\n", " for _ in range(max_pathlength):\n", " action, policy = agent.act(obervation)\n", " obervations.append(obervation)\n", " actions.append(action)\n", " action_probs.append(policy)\n", - " obervation, reward, done, _ = env.step(action)\n", + " obervation, reward, terminated, truncated, _ = env.step(action)\n", " rewards.append(reward)\n", " total_timesteps += 1\n", - " if done or total_timesteps == n_timesteps:\n", - " path = {\"observations\": np.array(obervations),\n", - " \"policy\": np.array(action_probs),\n", - " \"actions\": np.array(actions),\n", - " \"rewards\": np.array(rewards),\n", - " \"cumulative_returns\": get_cummulative_returns(rewards),\n", - " }\n", + " if terminated or truncated or total_timesteps >= n_timesteps:\n", + " path = 
{\n", + " \"observations\": np.array(obervations),\n", + " \"policy\": np.array(action_probs),\n", + " \"actions\": np.array(actions),\n", + " \"rewards\": np.array(rewards),\n", + " \"cumulative_returns\": get_cumulative_returns(rewards),\n", + " }\n", " paths.append(path)\n", " break\n", " return paths" @@ -341,14 +299,18 @@ "metadata": {}, "outputs": [], "source": [ + "from pprint import pprint\n", + "\n", "paths = rollout(env, agent, max_pathlength=5, n_timesteps=100)\n", - "print(paths[-1])\n", + "pprint(paths[-1])\n", + "\n", "assert (paths[0]['policy'].shape == (5, n_actions))\n", "assert (paths[0]['cumulative_returns'].shape == (5,))\n", "assert (paths[0]['rewards'].shape == (5,))\n", - "assert (paths[0]['observations'].shape == (5,)+observation_shape)\n", + "assert (paths[0]['observations'].shape == (5,) + observation_shape)\n", "assert (paths[0]['actions'].shape == (5,))\n", - "print('It\\'s ok')" + "\n", + "print(\"It's ok\")" ] }, { @@ -364,14 +326,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The surrogate reward should be\n", - "$$J_{surr}= {1 \\over N} \\sum\\limits_{i=0}^N \\frac{\\pi_{\\theta}(s_i, a_i)}{\\pi_{\\theta_{old}}(s_i, a_i)}A_{\\theta_{old}(s_i, a_i)}$$\n", + "The surrogate reward should be:\n", + "$$J_{surr}= {1 \\over N} \\sum\\limits_{i=1}^N \\frac{\\pi_{\\theta}(s_i, a_i)}{\\pi_{\\theta_{old}}(s_i, a_i)}A_{\\theta_{old}}(s_i, a_i)$$\n", "\n", - "For simplicity, let's use cummulative returns instead of advantage for now:\n", - "$$J'_{surr}= {1 \\over N} \\sum\\limits_{i=0}^N \\frac{\\pi_{\\theta}(s_i, a_i)}{\\pi_{\\theta_{old}}(s_i, a_i)}G_{\\theta_{old}(s_i, a_i)}$$\n", + "For simplicity, in this assignment we are going to use cumulative returns instead of advantage:\n", + "$$J'_{surr}= {1 \\over N} \\sum\\limits_{i=1}^N \\frac{\\pi_{\\theta}(s_i, a_i)}{\\pi_{\\theta_{old}}(s_i, a_i)}G_{\\theta_{old}}(s_i, a_i)$$\n", "\n", - "Or alternatively, minimize the surrogate loss:\n", - "$$ L_{surr} = - J'_{surr} $$ \n", +
"Since we want to maximize the reward, we are going to minimize the corresponding surrogate loss:\n", + "$$ L_{surr} = - J'_{surr} $$\n" ] }, { @@ -380,42 +342,39 @@ "metadata": {}, "outputs": [], "source": [ - "def get_loss(agent, observations, actions, cummulative_returns, old_probs):\n", + "def get_loss(agent, observations, actions, cumulative_returns, old_probs):\n", " \"\"\"\n", " Computes TRPO objective\n", - " :param: observations - batch of observations\n", - " :param: actions - batch of actions\n", - " :param: cummulative_returns - batch of cummulative returns\n", - " :param: old_probs - batch of probabilities computed by old network\n", + " :param: observations - batch of observations [timesteps x state_shape]\n", + " :param: actions - batch of actions [timesteps]\n", + " :param: cumulative_returns - batch of cumulative returns [timesteps]\n", + " :param: old_probs - batch of probabilities computed by old network [timesteps x num_actions]\n", " :returns: scalar value of the objective function\n", " \"\"\"\n", " batch_size = observations.shape[0]\n", - " log_probs_all = agent.get_log_probs(observations)\n", - " probs_all = torch.exp(log_probs_all)\n", + " probs_all = agent.get_probs(observations)\n", "\n", - " probs_for_actions = probs_all[torch.arange(\n", - " 0, batch_size, out=torch.LongTensor()), actions]\n", - " old_probs_for_actions = old_probs[torch.arange(\n", - " 0, batch_size, out=torch.LongTensor()), actions]\n", + " probs_for_actions = probs_all[torch.arange(batch_size), actions]\n", + " old_probs_for_actions = old_probs[torch.arange(batch_size), actions]\n", "\n", " # Compute surrogate loss, aka importance-sampled policy gradient\n", - " Loss = \n", + " \n", "\n", - " assert Loss.shape == torch.Size([])\n", - " return Loss" + " assert loss.ndim == 0\n", + " return loss" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can ascend these gradients as long as our $pi_\\theta(a|s)$ satisfies the constraint\n", - 
"$$E_{s,\\pi_{\\Theta_{t}}}\\Big[KL(\\pi(\\Theta_{t}, s) \\:||\\:\\pi(\\Theta_{t+1}, s))\\Big]< \\alpha$$\n", + "We can ascend these gradients as long as our $\\pi_\\theta(a|s)$ satisfies the constraint\n", + "$$\\mathbb{E}_{s,\\pi_{\\theta_{t}}} \\Big[ \\operatorname{KL} \\left( \\pi_{\\theta_{t}} (s) \\:\\|\\: \\pi_{\\theta_{t+1}} (s) \\right) \\Big] < \\alpha$$\n", "\n", "\n", "where\n", "\n", - "$$KL(p||q) = E _p log({p \\over q})$$" + "$$\\operatorname{KL} \\left( p \\| q \\right) = \\mathbb{E}_p \\log \\left( \\frac p q \\right)$$" ] }, { @@ -424,26 +383,27 @@ "metadata": {}, "outputs": [], "source": [ - "def get_kl(agent, observations, actions, cummulative_returns, old_probs):\n", + "def get_kl(agent, observations, actions, cumulative_returns, old_probs):\n", " \"\"\"\n", " Computes KL-divergence between network policy and old policy\n", - " :param: observations - batch of observations\n", - " :param: actions - batch of actions\n", - " :param: cummulative_returns - batch of cummulative returns (we don't need it actually)\n", - " :param: old_probs - batch of probabilities computed by old network\n", + " :param: observations - batch of observations [timesteps x state_shape]\n", + " :param: actions - batch of actions [timesteps]\n", + " :param: cumulative_returns - batch of cumulative returns [timesteps] (we don't need it actually)\n", + " :param: old_probs - batch of probabilities computed by old network [timesteps x num_actions]\n", " :returns: scalar value of the KL-divergence\n", " \"\"\"\n", " batch_size = observations.shape[0]\n", " log_probs_all = agent.get_log_probs(observations)\n", " probs_all = torch.exp(log_probs_all)\n", "\n", - " # Compute Kullback-Leibler divergence (see formula above)\n", - " # Note: you need to sum KL and entropy over all actions, not just the ones agent took\n", - " old_log_probs = torch.log(old_probs+1e-10)\n", + " # Compute Kullback-Leibler divergence (see formula above).\n", + " # Note: you need to sum KL and entropy over 
all actions, not just the ones agent took.\n", + " # You will also need to compute max KL over all timesteps.\n", + " old_log_probs = torch.log(old_probs + 1e-10)\n", "\n", - " kl = \n", + " \n", "\n", - " assert kl.shape == torch.Size([])\n", + " assert kl.ndim == 0\n", " assert (kl > -0.0001).all() and (kl < 10000).all()\n", " return kl" ] @@ -456,20 +416,19 @@ "source": [ "def get_entropy(agent, observations):\n", " \"\"\"\n", - " Computes entropy of the network policy \n", + " Computes entropy of the network policy\n", " :param: observations - batch of observations\n", " :returns: scalar value of the entropy\n", " \"\"\"\n", "\n", - " observations = Variable(torch.FloatTensor(observations))\n", + " observations = torch.tensor(observations, dtype=torch.float32)\n", "\n", - " batch_size = observations.shape[0]\n", " log_probs_all = agent.get_log_probs(observations)\n", " probs_all = torch.exp(log_probs_all)\n", "\n", - " entropy = torch.sum(-probs_all * log_probs_all) / batch_size\n", + " entropy = (-probs_all * log_probs_all).sum(dim=1).mean(dim=0)\n", "\n", - " assert entropy.shape == torch.Size([])\n", + " assert entropy.ndim == 0\n", " return entropy" ] }, @@ -479,7 +438,7 @@ "source": [ "**Linear search**\n", "\n", - "TRPO in its core involves ascending surrogate policy gradient constrained by KL divergence. \n", + "TRPO in its core involves ascending surrogate policy gradient constrained by KL divergence.\n", "\n", "In order to enforce this constraint, we're gonna use linesearch. 
You can find out more about it [here](https://en.wikipedia.org/wiki/Linear_search)" ] @@ -490,7 +449,7 @@ "metadata": {}, "outputs": [], "source": [ - "def linesearch(f, x, fullstep, max_kl):\n", + "def linesearch(f, x: torch.Tensor, fullstep: torch.Tensor, max_kl: float, max_backtracks: int = 10, backtrack_coef: float = 0.5):\n", " \"\"\"\n", " Linesearch finds the best parameters of neural networks in the direction of fullstep contrainted by KL divergence.\n", " :param: f - function that returns loss, kl and arbitrary third component.\n", @@ -499,13 +458,11 @@ " :param: max_kl - constraint of KL divergence.\n", " :returns:\n", " \"\"\"\n", - " max_backtracks = 10\n", " loss, _, = f(x)\n", - " for stepfrac in .5**np.arange(max_backtracks):\n", + " for stepfrac in backtrack_coef**np.arange(max_backtracks):\n", " xnew = x + stepfrac * fullstep\n", " new_loss, kl = f(xnew)\n", - " actual_improve = new_loss - loss\n", - " if kl.data.numpy() <= max_kl and actual_improve.data.numpy() < 0:\n", + " if kl <= max_kl and new_loss < loss:\n", " x = xnew\n", " loss = new_loss\n", " return x" @@ -517,9 +474,9 @@ "source": [ "**Conjugate gradients**\n", "\n", - "Since TRPO includes contrainted optimization, we will need to solve Ax=b using conjugate gradients.\n", + "Since TRPO includes constrained optimization, we will need to solve $A x = b$ using conjugate gradients.\n", "\n", - "In general, CG is an algorithm that solves Ax=b where A is positive-defined. A is Hessian matrix so A is positive-defined. You can find out more about them [here](https://en.wikipedia.org/wiki/Conjugate_gradient_method)" + "In general, CG is an algorithm that solves $A x = b$ where $A$ is positive-definite. $A$ is the Hessian matrix so $A$ is positive-definite. You can find out more about CG [here](https://en.wikipedia.org/wiki/Conjugate_gradient_method)."
] }, { @@ -528,12 +485,9 @@ "metadata": {}, "outputs": [], "source": [ - "from numpy.linalg import inv\n", - "\n", - "\n", "def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):\n", " \"\"\"\n", - " This method solves system of equation Ax=b using iterative method called conjugate gradients\n", + " This method solves system of equation Ax=b using an iterative method called conjugate gradients\n", " :f_Ax: function that returns Ax\n", " :b: targets for Ax\n", " :cg_iters: how many iterations this method should do\n", @@ -541,7 +495,7 @@ " \"\"\"\n", " p = b.clone()\n", " r = b.clone()\n", - " x = torch.zeros(b.size())\n", + " x = torch.zeros_like(b)\n", " rdotr = torch.sum(r*r)\n", " for i in range(cg_iters):\n", " z = f_Ax(p)\n", @@ -565,19 +519,18 @@ "source": [ "# This code validates conjugate gradients\n", "A = np.random.rand(8, 8)\n", - "A = np.matmul(np.transpose(A), A)\n", + "A = A.T @ A\n", "\n", "\n", "def f_Ax(x):\n", - " return torch.matmul(torch.FloatTensor(A), x.view((-1, 1))).view(-1)\n", + " return torch.ravel(torch.tensor(A, dtype=torch.float32) @ x.reshape(-1, 1))\n", "\n", "\n", "b = np.random.rand(8)\n", + "w = (np.linalg.inv(A.T @ A) @ A.T @ b.reshape(-1, 1)).reshape(-1)\n", "\n", - "w = np.matmul(np.matmul(inv(np.matmul(np.transpose(A), A)),\n", - " np.transpose(A)), b.reshape((-1, 1))).reshape(-1)\n", "print(w)\n", - "print(conjugate_gradient(f_Ax, torch.FloatTensor(b)).numpy())" + "print(conjugate_gradient(f_Ax, torch.tensor(b, dtype=torch.float32)).numpy())" ] }, { @@ -594,47 +547,44 @@ "metadata": {}, "outputs": [], "source": [ - "def update_step(agent, observations, actions, cummulative_returns, old_probs, max_kl):\n", + "def update_step(agent, observations, actions, cumulative_returns, old_probs, max_kl):\n", " \"\"\"\n", " This function does the TRPO update step\n", " :param: observations - batch of observations\n", " :param: actions - batch of actions\n", - " :param: cummulative_returns - batch of cummulative returns\n", + " 
:param: cumulative_returns - batch of cumulative returns\n", " :param: old_probs - batch of probabilities computed by old network\n", " :param: max_kl - controls how big KL divergence may be between old and new policy every step.\n", " :returns: KL between new and old policies and the value of the loss function.\n", " \"\"\"\n", "\n", " # Here we prepare the information\n", - " observations = Variable(torch.FloatTensor(observations))\n", - " actions = torch.LongTensor(actions)\n", - " cummulative_returns = Variable(torch.FloatTensor(cummulative_returns))\n", - " old_probs = Variable(torch.FloatTensor(old_probs))\n", + " observations = torch.tensor(observations, dtype=torch.float32)\n", + " actions = torch.tensor(actions, dtype=torch.int64)\n", + " cumulative_returns = torch.tensor(cumulative_returns, dtype=torch.float32)\n", + " old_probs = torch.tensor(old_probs, dtype=torch.float32)\n", "\n", " # Here we compute gradient of the loss function\n", - " loss = get_loss(agent, observations, actions,\n", - " cummulative_returns, old_probs)\n", + " loss = get_loss(agent, observations, actions, cumulative_returns, old_probs)\n", " grads = torch.autograd.grad(loss, agent.parameters())\n", - " loss_grad = torch.cat([grad.view(-1) for grad in grads]).data\n", + " loss_grad = torch.cat([torch.ravel(grad.detach()) for grad in grads])\n", "\n", " def Fvp(v):\n", " # Here we compute Fx to do solve Fx = g using conjugate gradients\n", " # We actually do here a couple of tricks to compute it efficiently\n", "\n", - " kl = get_kl(agent, observations, actions,\n", - " cummulative_returns, old_probs)\n", + " kl = get_kl(agent, observations, actions, cumulative_returns, old_probs)\n", "\n", " grads = torch.autograd.grad(kl, agent.parameters(), create_graph=True)\n", - " flat_grad_kl = torch.cat([grad.view(-1) for grad in grads])\n", + " flat_grad_kl = torch.cat([grad.reshape(-1) for grad in grads])\n", "\n", - " kl_v = (flat_grad_kl * Variable(v)).sum()\n", + " kl_v = (flat_grad_kl * 
v).sum()\n", " grads = torch.autograd.grad(kl_v, agent.parameters())\n", - " flat_grad_grad_kl = torch.cat(\n", - " [grad.contiguous().view(-1) for grad in grads]).data\n", + " flat_grad_grad_kl = torch.cat([torch.ravel(grad) for grad in grads]).detach()\n", "\n", " return flat_grad_grad_kl + v * 0.1\n", "\n", - " # Here we solveolve Fx = g system using conjugate gradients\n", + " # Here we solve Fx = g system using conjugate gradients\n", " stepdir = conjugate_gradient(Fvp, -loss_grad, 10)\n", "\n", " # Here we compute the initial vector to do linear search\n", @@ -651,8 +601,10 @@ " def get_loss_kl(params):\n", " # Helper for linear search\n", " set_flat_params_to(agent, params)\n", - " return [get_loss(agent, observations, actions, cummulative_returns, old_probs),\n", - " get_kl(agent, observations, actions, cummulative_returns, old_probs)]\n", + " return [\n", + " get_loss(agent, observations, actions, cumulative_returns, old_probs),\n", + " get_kl(agent, observations, actions, cumulative_returns, old_probs),\n", + " ]\n", "\n", " # Here we find our new parameters\n", " new_params = linesearch(get_loss_kl, prev_params, fullstep, max_kl)\n", @@ -667,7 +619,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "##### Step 5: Main TRPO loop\n", + "### Step 5: Main TRPO loop\n", "\n", "Here we will train our network!" ] @@ -680,16 +632,14 @@ "source": [ "import time\n", "from itertools import count\n", - "from collections import OrderedDict\n", "\n", - "# this is hyperparameter of TRPO. 
It controls how big KL divergence may be between old and new policy every step.\n", + "# TRPO hyperparameter; controls how big KL divergence may be between the old and the new policy at every step.\n", "max_kl = 0.01\n", - "numeptotal = 0 # this is number of episodes that we played.\n", + "numeptotal = 0 # Number of episodes we have completed so far.\n", "\n", "start_time = time.time()\n", "\n", "for i in count(1):\n", - "\n", " print(\"\\n********** Iteration %i ************\" % i)\n", "\n", " # Generating paths.\n", @@ -703,13 +653,12 @@ " returns = np.concatenate([path[\"cumulative_returns\"] for path in paths])\n", " old_probs = np.concatenate([path[\"policy\"] for path in paths])\n", "\n", - " loss, kl = update_step(agent, observations, actions,\n", - " returns, old_probs, max_kl)\n", + " loss, kl = update_step(agent, observations, actions, returns, old_probs, max_kl)\n", "\n", " # Report current progress\n", " episode_rewards = np.array([path[\"rewards\"].sum() for path in paths])\n", "\n", - " stats = OrderedDict()\n", + " stats = {}\n", " numeptotal += len(episode_rewards)\n", " stats[\"Total number of episodes\"] = numeptotal\n", " stats[\"Average sum of rewards per episode\"] = episode_rewards.mean()\n", @@ -719,8 +668,7 @@ " stats[\"Entropy\"] = get_entropy(agent, observations).data.numpy()\n", " stats[\"Surrogate loss\"] = loss.data.numpy()\n", " for k, v in stats.items():\n", - " print(k + \": \" + \" \" * (40 - len(k)) + str(v))\n", - " i += 1" + " print(k + \": \" + \" \" * (40 - len(k)) + str(v))" ] }, { @@ -733,12 +681,12 @@ "\n", "![img](https://s17.postimg.cc/i90chxgvj/vine.png)\n", "\n", - "In most gym environments, you can actually backtrack by using states. You can find a wrapper that saves/loads states in [the mcts seminar](https://github.com/yandexdataschool/Practical_RL/blob/master/week10_planning/seminar_MCTS.ipynb).\n", + "In most gym environments, you can actually backtrack by using states. 
You can find a wrapper that saves/loads states in [the MCTS seminar](https://github.com/yandexdataschool/Practical_RL/blob/master/week10_planning/seminar_MCTS.ipynb).\n", "\n", - "You can read more about in the [TRPO article](https://arxiv.org/abs/1502.05477) in section 5.2.\n", + "You can read more about this sampling scheme in the [original TRPO paper](https://arxiv.org/abs/1502.05477), section 5.2.\n", "\n", "The goal here is to implement such rollout policy (we recommend using tree data structure like in the seminar above).\n", - "Then you can assign cummulative rewards similar to `get_cummulative_rewards`, but for a tree.\n", + "Then you can assign cumulative returns similar to `get_cumulative_returns`, but for a tree.\n", "\n", "__bonus task__ - parallelize samples using multiple cores" ] @@ -750,14 +698,14 @@ "# Homework option II (10+pts)\n", "\n", "Let's use TRPO to train evil robots! (pick any of two)\n", - "* [MuJoCo robots](https://gym.openai.com/envs#mujoco)\n", - "* [Box2d robot](https://gym.openai.com/envs/BipedalWalker-v2)\n", + "* [MuJoCo robots](https://gymnasium.farama.org/environments/mujoco/#mujoco)\n", + "* [Box2d robot](https://gymnasium.farama.org/environments/box2d/bipedal_walker/)\n", "\n", - "The catch here is that those environments have continuous action spaces. \n", + "The catch here is that those environments have continuous action spaces.\n", "\n", "Luckily, TRPO is a policy gradient method, so it's gonna work for any parametric $\\pi_\\theta(a|s)$.
We recommend starting with gaussian policy:\n", "\n", - "$$\\pi_\\theta(a|s) = N(\\mu_\\theta(s),\\sigma^2_\\theta(s)) = {1 \\over \\sqrt { 2 \\pi {\\sigma^2}_\\theta(s) } } e^{ (a - \n", + "$$\\pi_\\theta(a|s) = N(\\mu_\\theta(s),\\sigma^2_\\theta(s)) = {1 \\over \\sqrt { 2 \\pi {\\sigma^2}_\\theta(s) } } e^{ -(a -\n", "\\mu_\\theta(s))^2 \\over 2 {\\sigma^2}_\\theta(s) } $$\n", "\n", "In the $\\sqrt { 2 \\pi {\\sigma^2}_\\theta(s) }$ clause, $\\pi$ means ~3.1415926, not agent's policy.\n", @@ -766,17 +714,10 @@ "* $\\mu_\\theta(s)$, a dense layer with linear activation\n", "* ${\\sigma^2}_\\theta(s)$, a dense layer with activation tf.exp (to make it positive; like rho from bandits)\n", "\n", - "For multidimensional actions, you can use fully factorized gaussian (basically a vector of gaussians).\n", + "For multidimensional actions, you can use a fully factorized gaussian (basically a vector of gaussians).\n", "\n", - "__bonus task__: compare performance of continuous action space method to action space discretization" + "__Bonus task__: compare the performance of the continuous action space method to action space discretization."
] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -786,5 +727,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 1 } diff --git a/week09_policy_II/seminar_TRPO_theano.ipynb b/week09_policy_II/seminar_TRPO_theano.ipynb deleted file mode 100644 index 15e0b367d..000000000 --- a/week09_policy_II/seminar_TRPO_theano.ipynb +++ /dev/null @@ -1,733 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# launch XVFB if you run on a server\n", - "import os\n", - "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", - " !bash ../xvfb start\n", - " os.environ['DISPLAY'] = ':1'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Let's make a TRPO!\n", - "\n", - "In this notebook we will write the code of the one Trust Region Policy Optimization.\n", - "As usually, it contains a few different parts which we are going to reproduce.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import theano\n", - "import theano.tensor as T" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Observation Space Box(6,)\n", - "Action Space Discrete(3)\n" - ] - } - ], - "source": [ - "import gym\n", - "\n", - "env = gym.make(\"Acrobot-v1\")\n", - "env.reset()\n", - "\n", - "observation_shape = env.observation_space.shape\n", - "n_actions = env.action_space.n\n", - "print(\"Observation Space\", env.observation_space)\n", - "print(\"Action Space\", env.action_space)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": null, - "metadata": 
{}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAQsAAAD8CAYAAABgtYFHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADktJREFUeJzt3W2opGd9x/Hvr5sHbRXXJMew7K5sxKXoizZmD3ElpdhE\nS0zFzYsEIlKXsLDQWlAs2E0LLUJfaF+YIBR1aaRrUZPUB7KEtDZsEkpfGHPWPLuNOUpqDhvclTzY\nItpG/30x19Fx9yTn2j0zZ2Z2vx8Y5rqv+7pn/hNmf7nue64zk6pCklbzG5MuQNJsMCwkdTEsJHUx\nLCR1MSwkdTEsJHUZS1gkuTrJk0kWk+wbx3NIWl8Z9TqLJBuA7wLvBpaAB4H3V9V3RvpEktbVOGYW\nlwOLVfX9qvpf4DZg1xieR9I6OmcMj7kZeGZoewl4+ysdcNFFF9W2bdvGUIqkZYcPH/5RVc2d7vHj\nCIus0HfSuU6SvcBegDe+8Y0sLCyMoRRJy5L811qOH8dpyBKwdWh7C3D0xEFVtb+q5qtqfm7utMNO\n0joZR1g8CGxPckmS84AbgINjeB5J62jkpyFV9VKSPwO+AWwAPl9VT4z6eSStr3Fcs6Cq7gbuHsdj\nS5oMV3BK6mJYSOpiWEjqYlhI6mJYSOpiWEjqYlhI6mJYSOpiWEjqYlhI6mJYSOpiWEjqYlhI6mJY\nSOpiWEjqYlhI6mJYSOpiWEjqYlhI6mJYSOpiWEjqYlhI6mJYSOpiWEjqYlhI6mJYSOpiWEjqYlhI\n6mJYSOpiWEjqYlhI6mJYSOpiWEjqYlhI6mJYSOqyalgk+XySY0keH+q7IMk9SZ5q969v/Uny6SSL\nSR5Nctk4i5e0fnpmFv8IXH1C3z7gUFVtBw61bYD3ANvbbS/wmdGUKWnSVg2Lqvp34LkTuncBB1r7\nAHDtUP8XauCbwMYkm0ZVrKTJOd1rFhdX1bMA7f4NrX8z8MzQuKXWd5Ike5MsJFk4fvz4aZYhab2M\n+gJnVuirlQZW1f6qmq+q+bm5uRGXIWnUTjcsfrh8etHuj7X+JWDr0LgtwNHTL0/StDjdsDgI7G7t\n3cCdQ/0fbJ+K7AReXD5dkTTbzlltQJIvA+8ELkqyBPwN8AngjiR7gB8A17fhdwPXAIvAT4Abx1Cz\npAlYNSyq6v0vs+uqFcYW8KG1FiVp+riCU1IXw0JSF8NCUhfDQlIXw0JSF8NCUhfDQlIXw0JSF8NC\nUpcMFl1OuIhk8kVIZ77DVTV/ugevutx7PezYsYOFhYVJlyGd0ZKVvkGin6chkroYFpK6GBaSuhgW\nkroYFpK6GBaSuhgWkroYFpK6GBaSuhgWkroYFpK6GBaSuhgWkroYFpK6GBaSuhgWkroYFpK6GBaS\nuhgWkroYFpK6GBaSuhgWkroYFpK6rBoWSbYmuS/JkSRPJPlw678gyT1Jnmr3r2/9SfLpJItJHk1y\n2bhfhKTx65lZvAT8eVW9BdgJfCjJW4F9wKGq2g4catsA7wG2t9te4DMjr1rSuls1LKrq2ar6dmv/\nN3AE2AzsAg60YQeAa1t7F/CFGvgmsDHJppFXLmldndI1iyTbgLcBDwAXV9WzMAgU4A1t2GbgmaHD\nllqfpBnWHRZJXgN8FfhIVf34lYau0HfSDx8n2ZtkIcnC8ePHe8uQNCFdYZHkXAZB8cWq+lrr/uHy\n6UW7P9b6l4CtQ4dvAY6e+JhVtb+q5qtqfm5u7nTrl7ROej4NCXArcKSqPjW06yCwu7V3A3cO9X+w\nfSqyE3hx+XRF0uw6p2PMFcAfA48lebj1/SXwCeCOJHuAHwDXt313A9cAi8BPgBtHWrGkiVg1LKrq\nP1j5OgTAVSuML+BDa6xL0pRxBaekLoaFpC6GhaQuhoWkLoaFpC6Gh
aQuhoWkLoaFpC6GhaQuhoWk\nLoaFpC49f0gm/dLhw7/+Z0I7dpz0VSU6QzmzULcTg+Ll+nRmMizU5ZVCwcA4OxgWWlVPGBgYZz7D\nQlIXw0JSF8NCUhfDQquaZ2EkYzTbDAt1eaUwMCjODoaFuq0UCgbF2cMVnDolhsPZy5mFpC6GhaQu\nhoWkLoaFpC6GhaQuhoWkLoaFpC6GhaQuhoWkLoaFpC6GhaQuhoWkLoaFpC6rhkWSVyX5VpJHkjyR\n5OOt/5IkDyR5KsntSc5r/ee37cW2f9t4X4Kk9dAzs/gZcGVV/S5wKXB1kp3AJ4Gbq2o78Dywp43f\nAzxfVW8Gbm7jJM24VcOiBv6nbZ7bbgVcCXyl9R8Arm3tXW2btv+qJH5PvDTjuq5ZJNmQ5GHgGHAP\n8D3ghap6qQ1ZAja39mbgGYC2/0XgwhUec2+ShSQLx48fX9urkDR2XWFRVT+vqkuBLcDlwFtWGtbu\nV5pFnPSDmFW1v6rmq2p+bm6ut15JE3JKn4ZU1QvA/cBOYGOS5a/l2wIcbe0lYCtA2/864LlRFCtp\ncno+DZlLsrG1Xw28CzgC3Adc14btBu5s7YNtm7b/3qryp7alGdfzhb2bgANJNjAIlzuq6q4k3wFu\nS/K3wEPArW38rcA/JVlkMKO4YQx1S1pnq4ZFVT0KvG2F/u8zuH5xYv9PgetHUp2kqeEKTkldDAtJ\nXQwLrap27Jh0CZoChoWkLoaFpC6GhaQuhoWkLoaFpC6GhaQuhoWkLoaFpC6GhUYmhw9PugSNkWEh\nqYthIamLYSGpi2EhqYthodOywDwLzE+6DK0jw0Kn5MSQMDDOHoaFur1cMBgYZwfDQl1WCwQD48xn\nWGhkDIwzm2EhqYthIamLYaGRmWdh0iVojAwLdVktCAyKM59hoTUzKM4OPb91KgGGwtnOmYWkLoaF\npC6GhaQuhoWkLoaFpC6GhaQuhoWkLt1hkWRDkoeS3NW2L0nyQJKnktye5LzWf37bXmz7t42ndEnr\n6VRmFh8GjgxtfxK4uaq2A88De1r/HuD5qnozcHMbJ2nGdYVFki3AHwH/0LYDXAl8pQ05AFzb2rva\nNm3/VW28pBnWO7O4BfgY8Iu2fSHwQlW91LaXgM2tvRl4BqDtf7GNlzTDVg2LJO8FjlXV8G/TrTRT\nqI59w4+7N8lCkoXjx493FStpcnpmFlcA70vyNHAbg9OPW4CNSZb/EG0LcLS1l4CtAG3/64DnTnzQ\nqtpfVfNVNT83N7emFyFp/FYNi6q6qaq2VNU24Abg3qr6AHAfcF0bthu4s7UPtm3a/nur6qSZhaTZ\nspZ1Fn8BfDTJIoNrEre2/luBC1v/R4F9aytR0jQ4pe+zqKr7gftb+/vA5SuM+Slw/QhqkzRFXMEp\nqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGpi2EhqYthIamLYSGp\ni2EhqYthIamLYSGpi2GhLrVjx6RL0IQZFpK6GBaSuhgWkroYFpK6GBaSuhgWkroYFpK6GBYaqRw+\nPOkSNCaGhaQup/TDyDq7uYrz7ObMQlIXw0JSF8NCUhfDQlIXw0JSF8NCUpeusEjydJLHkjycZKH1\nXZDkniRPtfvXt/4k+XSSxSSPJrlsnC9A0vo4lZnFH1TVpVU137b3AYeqajtwqG0DvAfY3m57gc+M\nqlhJk7OW05BdwIHWPgBcO9T/hRr4JrAxyaY1PI+kKdC7grOAf0tSwOeqaj9wcVU9C1BVzyZ5Qxu7\nGXhm6Nil1vfs8AMm2ctg5gHwsySPn+ZrmISLgB9NuohOs1QrzFa9s1QrwG+v5eDesLiiqo62QLgn\nyX++wtis0FcndQwCZz9AkoWh05upN0v1zlKtMFv1zlKtMKh3Lcd3nYZU1dF2fwz4OnA58MPl04t2\nf6wNXwK2Dh2+BTi6liIlTd6qY
ZHkt5K8drkN/CHwOHAQ2N2G7QbubO2DwAfbpyI7gReXT1ckza6e\n05CLga8nWR7/par61yQPAnck2QP8ALi+jb8buAZYBH4C3NjxHPtPtfAJm6V6Z6lWmK16Z6lWWGO9\nqTrpcoIkncQVnJK6TDwsklyd5Mm24nPf6keMvZ7PJzk2/FHuNK9WTbI1yX1JjiR5IsmHp7XmJK9K\n8q0kj7RaP976L0nyQKv19iTntf7z2/Zi279tvWodqnlDkoeS3DUDtY53pXVVTewGbAC+B7wJOA94\nBHjrhGv6feAy4PGhvr8D9rX2PuCTrX0N8C8MPi7eCTwwgXo3AZe19muB7wJvncaa23O+prXPBR5o\nNdwB3ND6Pwv8SWv/KfDZ1r4BuH0C/30/CnwJuKttT3OtTwMXndA3svfBur6YFV7cO4BvDG3fBNw0\nyZpaHdtOCIsngU2tvQl4srU/B7x/pXETrP1O4N3TXjPwm8C3gbczWNh0zonvCeAbwDta+5w2LutY\n4xYGf8pwJXBX+4c1lbW2510pLEb2Ppj0acjLrfacNr+2WhVYbbXqRLSp79sY/B97Kmtu0/qHGazL\nuYfBzPKFqnpphXp+WWvb/yJw4XrVCtwCfAz4Rdu+kOmtFX610vpwWyENI3wfTPoLe7tWe06xqak/\nyWuArwIfqaoft4+6Vxy6Qt+61VxVPwcuTbKRwQK/t7xCPROrNcl7gWNVdTjJOzvqmYb3wshXWg+b\n9MxiVlZ7TvVq1STnMgiKL1bV11r3VNdcVS8A9zM4X96YZPl/XMP1/LLWtv91wHPrVOIVwPuSPA3c\nxuBU5JYprRUY/0rrSYfFg8D2doX5PAYXhg5OuKaVTO1q1QymELcCR6rqU0O7pq7mJHNtRkGSVwPv\nAo4A9wHXvUyty6/hOuDeaifY41ZVN1XVlqraxuB9eW9VfWAaa4V1Wmm9nhdgXuaizDUMruB/D/ir\nKajnywz+Qvb/GKTvHgbnnoeAp9r9BW1sgL9vtT8GzE+g3t9jMH18FHi43a6ZxpqB3wEearU+Dvx1\n638T8C0Gq37/GTi/9b+qbS+2/W+a0Hvinfzq05CprLXV9Ui7PbH8b2mU7wNXcErqMunTEEkzwrCQ\n1MWwkNTFsJDUxbCQ1MWwkNTFsJDUxbCQ1OX/ASKF9tx4Ki+cAAAAAElFTkSuQmCC\n", - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "plt.imshow(env.render('rgb_array'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 1: Defining a network\n", - "\n", - "With all it's complexity, at it's core TRPO is yet another policy gradient method. \n", - "\n", - "This essentially means we're actually training a stochastic policy $ \\pi_\\theta(a|s) $. \n", - "\n", - "And yes, it's gonna be a neural network. So let's start by defining one." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# input tensors\n", - "observations = T.matrix(name=\"obs\")\n", - "actions = T.ivector(name=\"action\")\n", - "cummulative_returns = T.vector(name=\"G = r + gamma*r' + gamma^2*r'' + ...\")\n", - "old_probs = T.matrix(name=\"action probabilities from previous iteration\")\n", - "\n", - "all_inputs = [observations, actions, cummulative_returns, old_probs]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create neural network.\n", - "from lasagne.layers import *\n", - "\n", - "nn = InputLayer((None,)+observation_shape, input_var=observations)\n", - "\n", - "\n", - "\n", - "policy = \n", - "\n", - "probs = get_output(policy)\n", - "\n", - "weights = get_all_params(policy, trainable=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 2: Actions and rollouts\n", - "\n", - "In this section, we'll define functions that take actions $ a \\sim \\pi_\\theta(a|s) $ and rollouts $ $." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# compile function\n", - "get_policy = theano.function([observations], probs, allow_input_downcast=True)\n", - "\n", - "\n", - "def act(obs, sample=True):\n", - " \"\"\"\n", - " Samples action from policy distribution (sample = True) or takes most likely action (sample = False)\n", - " :param: obs - single observation vector\n", - " :param sample: if True, samples from \\pi, otherwise takes most likely action\n", - " :returns: action (single integer) and probabilities for all actions\n", - " \"\"\"\n", - "\n", - " policy = get_policy([obs])[0]\n", - "\n", - " if sample:\n", - " action = int(np.random.choice(n_actions, p=policy))\n", - " else:\n", - " action = int(np.argmax(policy))\n", - "\n", - " return action, policy" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# demo\n", - "print(\"sampled:\", [act(env.reset()) for _ in range(100)])\n", - "print(\"greedy:\", [act(env.reset(), sample=False) for _ in range(100)])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Compute cummulative reward just like you did in vanilla REINFORCE" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import scipy.signal\n", - "\n", - "\n", - "def get_cummulative_returns(r, gamma=1):\n", - " \"\"\"\n", - " Computes cummulative discounted rewards given immediate rewards\n", - " G_i = r_i + gamma*r_{i+1} + gamma^2*r_{i+2} + ...\n", - " Also known as R(s,a).\n", - " \"\"\"\n", - " r = np.array(r)\n", - " assert r.ndim >= 1\n", - " return scipy.signal.lfilter([1], [1, -gamma], r[::-1], axis=0)[::-1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# simple demo on rewards [0,0,1,0,0,1]\n", - "get_cummulative_returns([0, 0, 1, 0, 0, 1], gamma=0.9)" - ] - 
}, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Rollout**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def rollout(env, act, max_pathlength=2500, n_timesteps=50000):\n", - " \"\"\"\n", - " Generate rollouts for training.\n", - " :param: env - environment in which we will make actions to generate rollouts.\n", - " :param: act - the function that can return policy and action given observation.\n", - " :param: max_pathlength - maximum size of one path that we generate.\n", - " :param: n_timesteps - total sum of sizes of all pathes we generate.\n", - " \"\"\"\n", - " paths = []\n", - "\n", - " total_timesteps = 0\n", - " while total_timesteps < n_timesteps:\n", - " obervations, actions, rewards, action_probs = [], [], [], []\n", - " obervation = env.reset()\n", - " for _ in range(max_pathlength):\n", - " action, policy = act(obervation)\n", - " obervations.append(obervation)\n", - " actions.append(action)\n", - " action_probs.append(policy)\n", - " obervation, reward, done, _ = env.step(action)\n", - " rewards.append(reward)\n", - " total_timesteps += 1\n", - " if done or total_timesteps == n_timesteps:\n", - " path = {\"observations\": np.array(obervations),\n", - " \"policy\": np.array(action_probs),\n", - " \"actions\": np.array(actions),\n", - " \"rewards\": np.array(rewards),\n", - " \"cumulative_returns\": get_cummulative_returns(rewards),\n", - " }\n", - " paths.append(path)\n", - " break\n", - " return paths" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "paths = rollout(env, act, max_pathlength=5, n_timesteps=100)\n", - "print(paths[-1])\n", - "assert (paths[0]['policy'].shape == (5, n_actions))\n", - "assert (paths[0]['cumulative_returns'].shape == (5,))\n", - "assert (paths[0]['rewards'].shape == (5,))\n", - "assert (paths[0]['observations'].shape == (5,)+observation_shape)\n", - "assert 
(paths[0]['actions'].shape == (5,))\n", - "print('It\\'s ok')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 3: loss functions\n", - "\n", - "Now let's define the loss functions and constraints for actual TRPO training." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The surrogate reward should be\n", - "$$J_{surr}= {1 \\over N} \\sum\\limits_{i=0}^N \\frac{\\pi_{\\theta}(s_i, a_i)}{\\pi_{\\theta_{old}}(s_i, a_i)}A_{\\theta_{old}(s_i, a_i)}$$\n", - "\n", - "For simplicity, let's use cummulative returns instead of advantage for now:\n", - "$$J'_{surr}= {1 \\over N} \\sum\\limits_{i=0}^N \\frac{\\pi_{\\theta}(s_i, a_i)}{\\pi_{\\theta_{old}}(s_i, a_i)}G_{\\theta_{old}(s_i, a_i)}$$\n", - "\n", - "Or alternatively, minimize the surrogate loss:\n", - "$$ L_{surr} = - J'_{surr} $$" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# select probabilities of chosen actions\n", - "batch_size = actions.shape[0]\n", - "\n", - "probs_for_actions = probs[T.arange(batch_size), actions]\n", - "old_probs_for_actions = old_probs[T.arange(batch_size), actions]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Compute surrogate loss: negative importance-sampled policy gradient\n", - "\n", - "L_surr = " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# compute and return surrogate policy gradient\n", - "\n", - "\n", - "def get_flat_gradient(loss, var_list):\n", - " \"\"\"gradient of loss wrt var_list flattened into a large vector\"\"\"\n", - " grads = T.grad(loss, var_list)\n", - " return T.concatenate([grad.ravel() for grad in grads])\n", - "\n", - "\n", - "get_surrogate_gradients = theano.function(all_inputs, get_flat_gradient(L_surr, weights),\n", - " allow_input_downcast=True)" - ] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "We can ascend these gradients as long as our $pi_\\theta(a|s)$ satisfies the constraint\n", - "$$E_{s,\\pi_{\\Theta_{t}}}\\Big[KL(\\pi(\\Theta_{t}, s) \\:||\\:\\pi(\\Theta_{t+1}, s))\\Big] < \\alpha$$\n", - "\n", - "\n", - "where\n", - "\n", - "$$KL(p||q) = E _p log({p \\over q})$$" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Compute Kullback-Leibler divergence (see formula above)\n", - "# Note: you need to sum KL and entropy over all actions, not just the ones agent took\n", - "old_log_probs = T.log(old_probs + 1e-10)\n", - "\n", - "kl = \n", - "\n", - "# Compute policy entropy\n", - "entropy = \n", - "\n", - "compute_losses = theano.function(all_inputs, [L_surr, kl, entropy],\n", - " allow_input_downcast=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Linear search**\n", - "\n", - "TRPO in its core involves ascending surrogate policy gradient constrained by KL divergence. \n", - "\n", - "In order to enforce this constraint, we're gonna use linesearch. 
You can find out more about it [here](https://en.wikipedia.org/wiki/Linear_search)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def linesearch(f, x, fullstep, max_kl):\n", - " \"\"\"\n", - " Linesearch finds the best parameters of neural networks in the direction of fullstep contrainted by KL divergence.\n", - " :param: f - function that returns loss, kl and arbitrary third component.\n", - " :param: x - old parameters of neural network.\n", - " :param: fullstep - direction in which we make search.\n", - " :param: max_kl - constraint of KL divergence.\n", - " :returns:\n", - " \"\"\"\n", - " max_backtracks = 10\n", - " loss, _, _ = f(x)\n", - " for stepfrac in .5 ** np.arange(max_backtracks):\n", - " xnew = x + stepfrac * fullstep\n", - " new_loss, kl, _ = f(xnew)\n", - " actual_improve = new_loss - loss\n", - " if kl <= max_kl and actual_improve < 0:\n", - " x = xnew\n", - " loss = new_loss\n", - " return x" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 4: training\n", - "In this section we construct rest parts of our computational graph" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def slice_vector(vector, shapes):\n", - " \"\"\"\n", - " Slices symbolic vector into several symbolic tensors of given shapes.\n", - " Auxilary function used to un-flatten gradients, tangents etc.\n", - " :param vector: 1-dimensional symbolic vector\n", - " :param shapes: list or tuple of shapes (list, tuple or symbolic)\n", - " :returns: list of symbolic tensors of given shapes\n", - " \"\"\"\n", - " assert vector.ndim == 1, \"vector must be 1-dimensional\"\n", - " start = 0\n", - " tensors = []\n", - " for shape in shapes:\n", - " size = T.prod(shape)\n", - " tensor = vector[start:(start + size)].reshape(shape)\n", - " tensors.append(tensor)\n", - " start += size\n", - " return tensors" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "conjugate_grad_intermediate_vector = T.vector(\n", - " \"intermediate grad in conjugate_gradient\")\n", - "\n", - "# slice flat_tangent into chunks for each weight\n", - "weight_shapes = [var.get_value().shape for var in weights]\n", - "tangents = slice_vector(conjugate_grad_intermediate_vector, weight_shapes)\n", - "\n", - "# KL divergence where first arg is fixed\n", - "from theano.gradient import disconnected_grad as const\n", - "kl_firstfixed = (const(probs) * (const(T.log(probs)) -\n", - " T.log(probs))).sum(axis=-1).mean()\n", - "\n", - "# compute fisher information matrix (used for conjugate gradients and to estimate KL)\n", - "gradients = T.grad(kl_firstfixed, weights)\n", - "gradient_vector_product = [T.sum(g * t) for (g, t) in zip(gradients, tangents)]\n", - "\n", - "fisher_vector_product = get_flat_gradient(\n", - " sum(gradient_vector_product), weights)\n", - "\n", - "compute_fisher_vector_product = theano.function(\n", - " [observations, conjugate_grad_intermediate_vector], fisher_vector_product)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### TRPO helpers\n", - "\n", - "Here we define a few helper functions used in the main TRPO loop" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Conjugate gradients**\n", - "\n", - "Since TRPO includes contrainted optimization, we will need to solve Ax=b using conjugate gradients.\n", - "\n", - "In general, CG is an algorithm that solves Ax=b where A is positive-defined. A is Hessian matrix so A is positive-defined. 
You can find out more about them [here](https://en.wikipedia.org/wiki/Conjugate_gradient_method)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from numpy.linalg import inv\n", - "\n", - "\n", - "def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):\n", - " \"\"\"\n", - " This method solves system of equation Ax=b using iterative method called conjugate gradients\n", - " :f_Ax: function that returns Ax\n", - " :b: targets for Ax\n", - " :cg_iters: how many iterations this method should do\n", - " :residual_tol: epsilon for stability\n", - " \"\"\"\n", - " p = b.copy()\n", - " r = b.copy()\n", - " x = np.zeros_like(b)\n", - " rdotr = r.dot(r)\n", - " for i in range(cg_iters):\n", - " z = f_Ax(p)\n", - " v = rdotr / (p.dot(z) + 1e-8)\n", - " x += v * p\n", - " r -= v * z\n", - " newrdotr = r.dot(r)\n", - " mu = newrdotr / (rdotr + 1e-8)\n", - " p = r + mu * p\n", - " rdotr = newrdotr\n", - " if rdotr < residual_tol:\n", - " break\n", - " return x" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# This code validates conjugate gradients\n", - "A = np.random.rand(8, 8)\n", - "A = np.matmul(np.transpose(A), A)\n", - "\n", - "\n", - "def f_Ax(x):\n", - " return np.matmul(A, x.reshape(-1, 1)).reshape(-1)\n", - "\n", - "\n", - "b = np.random.rand(8)\n", - "\n", - "w = np.matmul(np.matmul(inv(np.matmul(np.transpose(A), A)),\n", - " np.transpose(A)), b.reshape((-1, 1))).reshape(-1)\n", - "print(w)\n", - "print(conjugate_gradient(f_Ax, b))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Compile a function that exports network weights as a vector\n", - "flat_weights = T.concatenate([var.ravel() for var in weights])\n", - "get_flat_weights = theano.function([], flat_weights)\n", - "\n", - "# ... 
and another function that imports vector back into network weights\n", - "flat_weights_placeholder = T.vector(\"flattened weights\")\n", - "assigns = slice_vector(flat_weights_placeholder, weight_shapes)\n", - "\n", - "load_flat_weights = theano.function(\n", - " [flat_weights_placeholder], updates=dict(zip(weights, assigns)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Step 5: Main TRPO loop\n", - "\n", - "Here we will train our network!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "from itertools import count\n", - "from collections import OrderedDict\n", - "\n", - "# this is hyperparameter of TRPO. It controls how big KL divergence may be between old and new policy every step.\n", - "max_kl = 0.01\n", - "cg_damping = 0.1 # This parameters regularize addition to\n", - "numeptotal = 0 # this is number of episodes that we played.\n", - "\n", - "start_time = time.time()\n", - "\n", - "for i in count(1):\n", - "\n", - " print(\"\\n********** Iteration %i ************\" % i)\n", - "\n", - " # Generating paths.\n", - " print(\"Rollout\")\n", - " paths = rollout(env, act)\n", - " print(\"Made rollout\")\n", - "\n", - " # Updating policy.\n", - " observations = np.concatenate([path[\"observations\"] for path in paths])\n", - " actions = np.concatenate([path[\"actions\"] for path in paths])\n", - " returns = np.concatenate([path[\"cumulative_returns\"] for path in paths])\n", - " old_probs = np.concatenate([path[\"policy\"] for path in paths])\n", - " inputs_batch = [observations, actions, returns, old_probs]\n", - "\n", - " old_weights = get_flat_weights()\n", - "\n", - " def fisher_vector_product(p):\n", - " \"\"\"gets intermediate grads (p) and computes fisher*vector \"\"\"\n", - " return compute_fisher_vector_product(observations, p) + cg_damping * p\n", - "\n", - " flat_grad = get_surrogate_gradients(*inputs_batch)\n", - "\n", - " stepdir = 
conjugate_gradient(fisher_vector_product, -flat_grad)\n", - " shs = .5 * stepdir.dot(fisher_vector_product(stepdir))\n", - " lm = np.sqrt(shs / max_kl)\n", - " fullstep = stepdir / lm\n", - "\n", - " # Compute new weights with linesearch in the direction we found with CG\n", - "\n", - " def losses_f(flat_weights):\n", - " load_flat_weights(flat_weights)\n", - " return compute_losses(*inputs_batch)\n", - "\n", - " new_weights = linesearch(losses_f, old_weights, fullstep, max_kl)\n", - "\n", - " load_flat_weights(new_weights)\n", - "\n", - " # Report current progress\n", - " L_surr, kl, entropy = compute_losses(*inputs_batch)\n", - " episode_rewards = np.array([path[\"rewards\"].sum() for path in paths])\n", - "\n", - " stats = OrderedDict()\n", - " numeptotal += len(episode_rewards)\n", - " stats[\"Total number of episodes\"] = numeptotal\n", - " stats[\"Average sum of rewards per episode\"] = episode_rewards.mean()\n", - " stats[\"Std of rewards per episode\"] = episode_rewards.std()\n", - " stats[\"Entropy\"] = entropy\n", - " stats[\"Time elapsed\"] = \"%.2f mins\" % ((time.time() - start_time)/60.)\n", - " stats[\"KL between old and new distribution\"] = kl\n", - " stats[\"Surrogate loss\"] = L_surr\n", - " for k, v in stats.items():\n", - " print(k + \": \" + \" \" * (40 - len(k)) + str(v))\n", - " i += 1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Homework option I: better sampling (10+pts)\n", - "\n", - "In this section, you're invited to implement a better rollout strategy called _vine_.\n", - "\n", - "![img](https://s17.postimg.cc/i90chxgvj/vine.png)\n", - "\n", - "In most gym environments, you can actually backtrack by using states. 
You can find a wrapper that saves/loads states in [the mcts seminar](https://github.com/yandexdataschool/Practical_RL/blob/master/week10_planning/seminar_MCTS.ipynb).\n", - "\n", - "You can read more about in the [TRPO article](https://arxiv.org/abs/1502.05477) in section 5.2.\n", - "\n", - "The goal here is to implement such rollout policy (we recommend using tree data structure like in the seminar above).\n", - "Then you can assign cummulative rewards similar to `get_cummulative_rewards`, but for a tree.\n", - "\n", - "__bonus task__ - parallelize samples using multiple cores" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Homework option II (10+pts)\n", - "\n", - "Let's use TRPO to train evil robots! (pick any of two)\n", - "* [MuJoCo robots](https://gym.openai.com/envs#mujoco)\n", - "* [Box2d robot](https://gym.openai.com/envs/BipedalWalker-v2)\n", - "\n", - "The catch here is that those environments have continuous action spaces. \n", - "\n", - "Luckily, TRPO is a policy gradient method, so it's gonna work for any parametric $\\pi_\\theta(a|s)$. 
We recommend starting with gaussian policy:\n", - "\n", - "$$\\pi_\\theta(a|s) = N(\\mu_\\theta(s),\\sigma^2_\\theta(s)) = {1 \\over \\sqrt { 2 \\pi {\\sigma^2}_\\theta(s) } } e^{ (a - \n", - "\\mu_\\theta(s))^2 \\over 2 {\\sigma^2}_\\theta(s) } $$\n", - "\n", - "In the $\\sqrt { 2 \\pi {\\sigma^2}_\\theta(s) }$ clause, $\\pi$ means ~3.1415926, not agent's policy.\n", - "\n", - "This essentially means that you will need two output layers:\n", - "* $\\mu_\\theta(s)$, a dense layer with linear activation\n", - "* ${\\sigma^2}_\\theta(s)$, a dense layer with activation T.exp (to make it positive; like rho from bandits)\n", - "\n", - "For multidimensional actions, you can use fully factorized gaussian (basically a vector of gaussians).\n", - "\n", - "__bonus task__: compare performance of continuous action space method to action space discretization" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "language_info": { - "name": "python", - "pygments_lexer": "ipython3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/week09_policy_II/td3_and_sac/hw-continuous-control_pytorch.ipynb b/week09_policy_II/td3_and_sac/hw-continuous-control_pytorch.ipynb new file mode 100644 index 000000000..714df58e0 --- /dev/null +++ b/week09_policy_II/td3_and_sac/hw-continuous-control_pytorch.ipynb @@ -0,0 +1,1066 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys, os\n", + "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", + "\n", + " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week09_policy_II/td3_and_sac/logger.py\n", + "\n", + " !pip -q install gymnasium[mujoco]\n", + " !pip -q install tensorboardX\n", + "\n", + " !touch 
.setup_complete\n", + "\n", + "# This code creates a virtual display to draw game images on.\n", + "# It will have no effect if your machine has a monitor.\n", + "if type(os.environ.get(\"DISPLAY\")) is not str or len(os.environ.get(\"DISPLAY\")) == 0:\n", + " !bash ../xvfb start\n", + " os.environ['DISPLAY'] = ':1'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Continuous Control\n", + "\n", + "\n", + "In this notebook you will solve continuous control environment using either [Twin Delayed DDPG (TD3)](https://arxiv.org/pdf/1802.09477.pdf) or [Soft Actor-Critic (SAC)](https://arxiv.org/pdf/1801.01290.pdf). Both are off-policy algorithms that are current go-to algorithms for continuous control tasks.\n", + "\n", + "**Select one** of these two algorithms (TD3 or SAC) to implement. Both algorithms are extensions of basic [Deep Deterministic Policy Gradient (DDPG)](https://arxiv.org/abs/1509.02971) algorithm, and DDPG is kind of \"DQN with another neural net approximating greedy policy\", and all that differs is a set of stabilization tricks:\n", + "* TD3 trains deterministic policy, while SAC uses *stochastic policy*. This means that for SAC you can solve exploration-exploitation trade-off by simple sampling from policy, while in TD3 you will have to add noise to your actions.\n", + "* TD3 proposes to stabilize targets by adding a *clipped noise* to actions, which slightly prevents overestimation. In SAC, we formally switch to formalism of Maximum Entropy RL and add *entropy bonus* into our value function.\n", + "\n", + "Also both algorithms utilize a *twin trick*: train two critics and use pessimistic targets by taking minimum from two proposals. Standard trick with target networks is also necessary. We will go through all these tricks step-by-step.\n", + "\n", + "SAC is probably less clumsy scheme than TD3, but requires a bit more code to implement. 
A more detailed description of the algorithms can be found in the Spinning Up documentation:\n", + "* on [DDPG](https://spinningup.openai.com/en/latest/algorithms/ddpg.html)\n", + "* on [TD3](https://spinningup.openai.com/en/latest/algorithms/td3.html)\n", + "* on [SAC](https://spinningup.openai.com/en/latest/algorithms/sac.html)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import gymnasium as gym\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we will create an instance of the environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-09-16T18:41:00.003174Z", + "start_time": "2020-09-16T18:40:59.921640Z" + } + }, + "outputs": [], + "source": [ + "env = gym.make(\"Ant-v4\", render_mode=\"rgb_array\")\n", + "\n", + "# we want to look inside\n", + "env.reset()\n", + "\n", + "# examples of states and actions\n", + "print(\"observation space: \", env.observation_space,\n", + " \"\\nobservations:\", env.reset()[0])\n", + "print(\"action space: \", env.action_space,\n", + " \"\\naction_sample: \", env.action_space.sample())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "plt.imshow(env.render())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's run a random policy and see how it looks."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class RandomActor():\n", + " def get_action(self, states):\n", + " assert len(states.shape) == 1, \"can't work with batches\"\n", + " return env.action_space.sample()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s, _ = env.reset()\n", + "rewards_per_step = []\n", + "actor = RandomActor()\n", + "\n", + "for i in range(10000):\n", + " a = actor.get_action(s)\n", + " s, r, terminated, truncated, _ = env.step(a)\n", + "\n", + " rewards_per_step.append(r)\n", + "\n", + " if terminated or truncated:\n", + " s, _ = env.reset()\n", + " print(\"done: \", i)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So, most episodes are 1000 steps long (after which the episode is truncated by the time limit), though sometimes an episode is terminated earlier if the simulation finds an obvious reason to think that we crashed our ant. An important property of continuous control tasks like this one is that we receive a non-trivial reward signal at each step: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "rewards_per_step[100:110]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This dense signal will guide our optimizations. It also partially explains why off-policy algorithms are more effective and sample-efficient than on-policy algorithms like PPO: 1-step targets are already quite informative." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "env.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will add a single wrapper to our environment that simply writes summaries, mainly the total reward during an episode."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from logger import TensorboardSummaries as Summaries\n", + "\n", + "env = gym.make(\"Ant-v4\", render_mode=\"rgb_array\")\n", + "env = Summaries(env, \"MyFirstWalkingAnt\");\n", + "\n", + "state_dim = env.observation_space.shape[0] # dimension of state space (27 numbers)\n", + "action_dim = env.action_space.shape[0] # dimension of action space (8 numbers)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Models" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start with *critic* model. On the one hand, it will function as an approximation of $Q^*(s, a)$, on the other hand it evaluates current actor $\\pi$ and can be viewed as $Q^{\\pi}(s, a)$. This critic will take both state $s$ and action $a$ as input and output a scalar value. Recommended architecture is 3-layered MLP.\n", + "\n", + "**Danger:** when models have a scalar output it is a good rule to squeeze it to avoid unexpected broadcasting, since [batch_size, 1] broadcasts with many tensor sizes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "\n", + "class Critic(nn.Module):\n", + " def __init__(self, state_dim, action_dim):\n", + " super().__init__() \n", + "\n", + " \n", + "\n", + " def get_qvalues(self, states, actions):\n", + " '''\n", + " input:\n", + " states - tensor, (batch_size x features)\n", + " actions - tensor, (batch_size x actions_dim)\n", + " output:\n", + " qvalues - tensor, critic estimation, (batch_size)\n", + " '''\n", + " qvalues = \n", + "\n", + " assert len(qvalues.shape) == 1 and qvalues.shape[0] == states.shape[0]\n", + " \n", + " return qvalues" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, let's define a policy, or an actor $\\pi$. Use architecture, similar to critic (3-layered MLP). The output depends on algorithm:\n", + "\n", + "For **TD3**, model *deterministic policy*. You should output `action_dim` numbers in range $[-1, 1]$. Unfortunately, deterministic policies lead to problems with stability and exploration, so we will need three \"modes\" of how this policy can be operating:\n", + "* First one - greedy - is a simple feedforward pass through network that will be used to train the actor.\n", + "* Second one - exploration mode - is when we need to add noise (e.g. Gaussian) to our actions to collect more diverse data. 
\n", + "* Third mode - \"clipped noised\" - will be used when we will require a target for critic, where we need to somehow \"noise\" our actor output, but not too much, so we add *clipped noise* to our output:\n", + "$$\\pi_{\\theta}(s) + \\varepsilon, \\quad \\varepsilon = \\operatorname{clip}(\\epsilon, -0.5, 0.5), \\epsilon \\sim \\mathcal{N}(0, \\sigma^2 I)$$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-09-16T18:41:06.246418Z", + "start_time": "2020-09-16T18:41:05.841255Z" + } + }, + "outputs": [], + "source": [ + "# template for TD3; template for SAC is below\n", + "class TD3_Actor(nn.Module):\n", + " def __init__(self, state_dim, action_dim):\n", + " super().__init__() \n", + "\n", + " \n", + "\n", + " def get_action(self, states, std_noise=0.1):\n", + " '''\n", + " Used to collect data by interacting with environment,\n", + " so your have to add some noise to actions.\n", + " input:\n", + " states - numpy, (batch_size x features)\n", + " output:\n", + " actions - numpy, (batch_size x actions_dim)\n", + " '''\n", + " # no gradient computation is required here since we will use this only for interaction\n", + " with torch.no_grad():\n", + " actions = \n", + " \n", + " assert isinstance(actions, (list,np.ndarray)), \"convert actions to numpy to send into env\"\n", + " assert actions.max() <= 1. and actions.min() >= -1, \"actions must be in the range [-1, 1]\"\n", + " return actions\n", + " \n", + " def get_best_action(self, states):\n", + " '''\n", + " Will be used to optimize actor. Requires differentiable w.r.t. 
parameters actions.\n", + " input:\n", + " states - PyTorch tensor, (batch_size x features)\n", + " output:\n", + " actions - PyTorch tensor, (batch_size x actions_dim)\n", + " '''\n", + " actions = \n", + " \n", + " assert actions.requires_grad, \"you must be able to compute gradients through actions\"\n", + " return actions\n", + " \n", + " def get_target_action(self, states, std_noise=0.2, clip_eta=0.5):\n", + " '''\n", + " Will be used to create target for critic optimization.\n", + " Returns actions with added \"clipped noise\".\n", + " input:\n", + " states - PyTorch tensor, (batch_size x features)\n", + " output:\n", + " actions - PyTorch tensor, (batch_size x actions_dim)\n", + " '''\n", + " # no gradient computation is required here since these actions are only used to build targets for the critic\n", + " with torch.no_grad():\n", + " actions = \n", + " \n", + " # actions can fly out of [-1, 1] range after added noise\n", + " return actions.clamp(-1, 1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For **SAC**, model *gaussian policy*. This means policy distribution is going to be multivariate normal with diagonal covariance. The policy head will predict the mean and covariance, and it should be guaranteed that covariance is positive. **Important:** the way you model covariance strongly influences optimization procedure, so here are some options: let $f_{\\theta}$ be the output of covariance head, then:\n", + "* use exponential function $\\sigma(s) = \\exp(f_{\\theta}(s))$\n", + "* transform output to $[-1, 1]$ using `tanh`, then project output to some interval $[m, M]$, where $m = -20$, $M = 2$ and then use exponential function. This will guarantee the range of modeled covariance is adequate. So, the resulting formula is:\n", + "$$\\sigma(s) = \\exp\\left(m + 0.5(M - m)(\\tanh(f_{\\theta}(s)) + 1)\\right)$$\n", + "* `softplus` operation $\\sigma(s) = \\log(1 + \\exp(f_{\\theta}(s)))$ seems to work poorly here. 
o_O\n", + "\n", + "**Note**: `torch.distributions.Normal` already has everything you will need to work with such policy after you modeled mean and covariance, i.e. sampling via reparametrization trick (see `rsample` method) and compute log probability (see `log_prob` method).\n", + "\n", + "There is one more problem with gaussian distribution. We need to force our actions to be in $[-1, 1]$ bound. To achieve this, model unbounded gaussian $\\mathcal{N}(\\mu_{\\theta}(s), \\sigma_{\\theta}(s)^2I)$, where $\\mu$ can be arbitrary. Then every time you have samples $u$ from this gaussian policy, squash it using $\\operatorname{tanh}$ function to get a sample from $[-1, 1]$:\n", + "$$u \\sim \\mathcal{N}(\\mu, \\sigma^2I)$$\n", + "$$a = \\operatorname{tanh}(u)$$\n", + "\n", + "**Important:** after that you are required to use change of variable formula every time you compute likelihood (see appendix C in [paper on SAC](https://arxiv.org/pdf/1801.01290.pdf) for details):\n", + "$$\\log p(a \\mid \\mu, \\sigma) = \\log p(u \\mid \\mu, \\sigma) - \\sum_{i = 1}^D \\log (1 - \\operatorname{tanh}^2(u_i)),$$\n", + "where $D$ is `action_dim`. In practice, add something like 1e-6 inside logarithm to protect from computational instabilities." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-09-16T18:41:06.246418Z", + "start_time": "2020-09-16T18:41:05.841255Z" + } + }, + "outputs": [], + "source": [ + "# template for SAC\n", + "from torch.distributions import Normal\n", + "\n", + "class SAC_Actor(nn.Module):\n", + " def __init__(self, state_dim, action_dim):\n", + " super().__init__() \n", + "\n", + " \n", + " \n", + " def apply(self, states):\n", + " '''\n", + " For given batch of states samples actions and also returns its log prob.\n", + " input:\n", + " states - PyTorch tensor, (batch_size x features)\n", + " output:\n", + " actions - PyTorch tensor, (batch_size x action_dim)\n", + " log_prob - PyTorch tensor, (batch_size)\n", + " '''\n", + " \n", + " \n", + " return actions, log_prob \n", + "\n", + " def get_action(self, states):\n", + " '''\n", + " Used to interact with environment by sampling actions from policy\n", + " input:\n", + " states - numpy, (batch_size x features)\n", + " output:\n", + " actions - numpy, (batch_size x actions_dim)\n", + " '''\n", + " # no gradient computation is required here since we will use this only for interaction\n", + " with torch.no_grad():\n", + " \n", + " # hint: you can use `apply` method here\n", + " actions = \n", + " \n", + " assert isinstance(actions, (list,np.ndarray)), \"convert actions to numpy to send into env\"\n", + " assert actions.max() <= 1. and actions.min() >= -1, \"actions must be in the range [-1, 1]\"\n", + " return actions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ReplayBuffer\n", + "\n", + "The same as in DQN. You can copy code from your DQN assignment, just check that it works fine with continuous actions (probably it is). 
\n", + "\n", + "Let's recall the interface:\n", + "* `exp_replay.add(obs, act, rw, next_obs, done)` - saves (s,a,r,s',done) tuple into the buffer\n", + "* `exp_replay.sample(batch_size)` - returns observations, actions, rewards, next_observations and is_done for `batch_size` random samples.\n", + "* `len(exp_replay)` - returns number of elements stored in replay buffer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class ReplayBuffer():\n", + " def __init__(self, size):\n", + " \"\"\"\n", + " Create Replay buffer.\n", + " Parameters\n", + " ----------\n", + " size: int\n", + " Max number of transitions to store in the buffer. When the buffer\n", + " overflows the old memories are dropped.\n", + "\n", + " Note: for this assignment you can pick any data structure you want.\n", + " If you want to keep it simple, you can store a list of tuples of (s, a, r, s') in self._storage\n", + " However you may find out there are faster and/or more memory-efficient ways to do so.\n", + " \"\"\"\n", + " self._storage = []\n", + " self._maxsize = size\n", + "\n", + " # OPTIONAL: YOUR CODE\n", + "\n", + " def __len__(self):\n", + " return len(self._storage)\n", + "\n", + " def add(self, obs_t, action, reward, obs_tp1, done):\n", + " '''\n", + " Make sure, _storage will not exceed _maxsize. 
\n", + " Make sure, FIFO rule is being followed: the oldest examples has to be removed earlier\n", + " ''' \n", + " data = (obs_t, action, reward, obs_tp1, done)\n", + " storage = self._storage\n", + " maxsize = self._maxsize\n", + " \n", + " # add data to storage\n", + "\n", + " def sample(self, batch_size):\n", + " \"\"\"Sample a batch of experiences.\n", + " Parameters\n", + " ----------\n", + " batch_size: int\n", + " How many transitions to sample.\n", + " Returns\n", + " -------\n", + " obs_batch: np.array\n", + " batch of observations\n", + " act_batch: np.array\n", + " batch of actions executed given obs_batch\n", + " rew_batch: np.array\n", + " rewards received as results of executing act_batch\n", + " next_obs_batch: np.array\n", + " next set of observations seen after executing act_batch\n", + " done_mask: np.array\n", + " done_mask[i] = 1 if executing act_batch[i] resulted in\n", + " the end of an episode and 0 otherwise.\n", + " \"\"\"\n", + " storage = self._storage\n", + " \n", + " # randomly generate batch_size integers\n", + " # to be used as indexes of samples\n", + " \n", + " \n", + " # collect for each index\n", + " \n", + " return \n", + " # , , , , " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "exp_replay = ReplayBuffer(10)\n", + "\n", + "for _ in range(30):\n", + " exp_replay.add(env.reset()[0], env.action_space.sample(),\n", + " 1.0, env.reset()[0], done=False)\n", + "\n", + "obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(\n", + " 5)\n", + "\n", + "assert len(exp_replay) == 10, \"experience replay size should be 10 because that's what maximum capacity is\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def play_and_record(initial_state, agent, env, exp_replay, n_steps=1):\n", + " \"\"\"\n", + " Play the game for exactly n steps, record every (s,a,r,s', done) to 
replay buffer. \n", + " Whenever game ends, add record with done=True and reset the game.\n", + " It is guaranteed that env has done=False when passed to this function.\n", + "\n", + " :returns: return sum of rewards over time and the state in which the env stays\n", + " \"\"\"\n", + " s = initial_state\n", + " sum_rewards = 0\n", + "\n", + " # Play the game for n_steps as per instructions above\n", + " for t in range(n_steps):\n", + " \n", + " # select action using policy with exploration\n", + " a = \n", + " \n", + " ns, r, terminated, truncated, _ = env.step(a)\n", + " \n", + " exp_replay.add(s, a, r, ns, terminated)\n", + " \n", + " s = env.reset()[0] if terminated or truncated else ns\n", + " \n", + " sum_rewards += r \n", + "\n", + " return sum_rewards, s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#testing your code.\n", + "exp_replay = ReplayBuffer(2000)\n", + "actor = (state_dim, action_dim).to(DEVICE)\n", + "\n", + "state, _ = env.reset()\n", + "play_and_record(state, actor, env, exp_replay, n_steps=1000)\n", + "\n", + "# if you're using your own experience replay buffer, some of those tests may need correction.\n", + "# just make sure you know what your code does\n", + "assert len(exp_replay) == 1000, \"play_and_record should have added exactly 1000 steps, \"\\\n", + " \"but instead added %i\" % len(exp_replay)\n", + "is_dones = list(zip(*exp_replay._storage))[-1]\n", + "\n", + "for _ in range(100):\n", + " obs_batch, act_batch, reward_batch, next_obs_batch, is_done_batch = exp_replay.sample(\n", + " 10)\n", + " assert obs_batch.shape == next_obs_batch.shape == (10,) + (state_dim,)\n", + " assert act_batch.shape == (\n", + " 10, action_dim), \"actions batch should have shape (10, 8) but is instead %s\" % str(act_batch.shape)\n", + " assert reward_batch.shape == (\n", + " 10,), \"rewards batch should have shape (10,) but is instead %s\" % str(reward_batch.shape)\n", + " assert 
is_done_batch.shape == (\n", + " 10,), \"is_done batch should have shape (10,) but is instead %s\" % str(is_done_batch.shape)\n", + " assert [int(i) in (0, 1)\n", + " for i in is_dones], \"is_done should be strictly True or False\"\n", + "\n", + "print(\"Well done!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialization\n", + "\n", + "Let's start initializing our algorithm. Here are our hyperparameters:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gamma=0.99 # discount factor\n", + "max_buffer_size = 10**5 # size of experience replay\n", + "start_timesteps = 5000 # size of experience replay when start training\n", + "timesteps_per_epoch=1 # steps in environment per step of network updates\n", + "batch_size=128 # batch size for all optimizations\n", + "max_grad_norm=10 # max grad norm for all optimizations\n", + "tau=0.005 # speed of updating target networks\n", + "policy_update_freq=<> # frequency of actor update; vanilla choice is 2 for TD3 or 1 for SAC\n", + "alpha=0.1 # temperature for SAC\n", + "\n", + "# iterations passed\n", + "n_iterations = 0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is our experience replay:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# experience replay\n", + "exp_replay = ReplayBuffer(max_buffer_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here are our models: *two* critics and one actor." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# models to train\n", + "actor = (state_dim, action_dim).to(DEVICE)\n", + "critic1 = Critic(state_dim, action_dim).to(DEVICE)\n", + "critic2 = Critic(state_dim, action_dim).to(DEVICE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To stabilize training, we will require **target networks** - slow updating copies of our models. In **TD3**, both critics and actor have their copies, in **SAC** it is assumed that only critics require target copies while actor is always used fresh." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# target networks: slow-updated copies of actor and two critics\n", + "target_critic1 = Critic(state_dim, action_dim).to(DEVICE)\n", + "target_critic2 = Critic(state_dim, action_dim).to(DEVICE)\n", + "target_actor = TD3_Actor(state_dim, action_dim).to(DEVICE) # comment this line if you chose SAC\n", + "\n", + "# initialize them as copies of original models\n", + "target_critic1.load_state_dict(critic1.state_dict())\n", + "target_critic2.load_state_dict(critic2.state_dict())\n", + "target_actor.load_state_dict(actor.state_dict()) # comment this line if you chose SAC " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In continuous control, target networks are usually updated using exponential smoothing:\n", + "$$\\theta^{-} \\leftarrow \\tau \\theta + (1 - \\tau) \\theta^{-},$$\n", + "where $\\theta^{-}$ are target network weights, $\\theta$ - fresh parameters, $\\tau$ - hyperparameter. 
This util function will do it:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def update_target_networks(model, target_model):\n", + " for param, target_param in zip(model.parameters(), target_model.parameters()):\n", + " target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we will have three optimization procedures to train our three models, so let's welcome our three Adams:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# optimizers: for every model we have\n", + "opt_actor = torch.optim.Adam(actor.parameters(), lr=3e-4)\n", + "opt_critic1 = torch.optim.Adam(critic1.parameters(), lr=3e-4)\n", + "opt_critic2 = torch.optim.Adam(critic2.parameters(), lr=3e-4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# just to avoid writing this code three times\n", + "def optimize(name, model, optimizer, loss):\n", + " '''\n", + " Makes one step of SGD optimization, clips norm with max_grad_norm and \n", + " logs everything into tensorboard\n", + " '''\n", + " loss = loss.mean()\n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " grad_norm = nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)\n", + " optimizer.step()\n", + "\n", + " # logging\n", + " env.writer.add_scalar(name, loss.item(), n_iterations)\n", + " env.writer.add_scalar(name + \"_grad_norm\", grad_norm.item(), n_iterations)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Critic target computation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, let's discuss our losses for critic and actor.\n", + "\n", + "To train both critics we would like to minimize MSE using 1-step targets: for one sampled transition $(s, a, r, s')$ it 
should look something like this:\n", + "$$y(s, a) = r + \\gamma V(s').$$\n", + "\n", + "How do we evaluate next state and compute $V(s')$? Well, technically Monte-Carlo estimation looks simple:\n", + "$$V(s') \\approx Q(s', a')$$\n", + "where (important!) $a'$ is a sample from our current policy $\\pi(a' \\mid s')$.\n", + "\n", + "But our actor $\\pi$ will actually be trained to search for actions $a'$ where our critic gives big estimates, and this straightforward approach leads to serious overestimation issues. We require some hacks. First, we will use target networks for $Q$ (and **TD3** also uses target network for $\\pi$). Second, we will use *two* critics and take minimum across their estimations:\n", + "$$V(s') = \\min_{i = 1,2} Q^{-}_i(s', a'),$$\n", + "where $a'$ is sampled from target policy $\\pi^{-}(a' \\mid s')$ in **TD3** and from fresh policy $\\pi(a' \\mid s')$ in **SAC**.\n", + "\n", + "###### And last but not least:\n", + "* in **TD3** to compute $a'$ use *mode with clipped noise* that will prevent our policy from exploiting narrow peaks in our critic approximation;\n", + "* in **SAC** add (estimation of) entropy bonus in next state $s'$:\n", + "$$V(s') = \\min_{i = 1,2} Q^{-}_i(s', a') - \\alpha \\log \\pi (a' \\mid s')$$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_critic_target(rewards, next_states, is_done):\n", + " '''\n", + " Important: use target networks for this method! 
Do not use \"fresh\" models except fresh policy in SAC!\n", + " input:\n", + " rewards - PyTorch tensor, (batch_size)\n", + " next_states - PyTorch tensor, (batch_size x features)\n", + " is_done - PyTorch tensor, (batch_size)\n", + " output:\n", + " critic target - PyTorch tensor, (batch_size)\n", + " '''\n", + " with torch.no_grad():\n", + " critic_target = \n", + " \n", + " assert not critic_target.requires_grad, \"target must not require grad.\"\n", + " assert len(critic_target.shape) == 1, \"dangerous extra dimension in target?\"\n", + "\n", + " return critic_target" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To train actor we simply want to maximize\n", + "$$\\mathbb{E}_{a \\sim \\pi(a \\mid s)} Q(s, a) \\to \\max_{\\pi}$$\n", + "\n", + "* in **TD3**, because of deterministic policy, the expectation reduces:\n", + "$$Q(s, \\pi(s)) \\to \\max_{\\pi}$$\n", + "* in **SAC**, use reparametrization trick to compute gradients and also do not forget to add entropy regularizer to motivate policy to be as stochastic as possible:\n", + "$$\\mathbb{E}_{a \\sim \\pi(a \\mid s)} \\left[ Q(s, a) - \\alpha \\log \\pi(a \\mid s) \\right] \\to \\max_{\\pi}$$\n", + "\n", + "**Note:** We will use (fresh) critic1 here as Q-function to \"exploit\". You can also use both critics and again take minimum across their estimations (this is done in original implementation of **SAC** and not done in **TD3**), but this seems to be not of high importance." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_actor_loss(states):\n", + " '''\n", + " Returns actor loss on batch of states\n", + " input:\n", + " states - PyTorch tensor, (batch_size x features)\n", + " output:\n", + " actor loss - PyTorch tensor, (batch_size)\n", + " '''\n", + " # make sure you have gradients w.r.t. 
actor parameters\n", + " actions = \n", + " \n", + " assert actions.requires_grad, \"actions must be differentiable with respect to policy parameters\"\n", + " \n", + " # compute actor loss\n", + " actor_loss = \n", + " return actor_loss" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally combining all together and launching our algorithm. Your goal is to reach at least 1000 average reward during evaluation after training in this ant environment (*since this is a new hometask, this threshold might be updated, so at least just see if your ant learned to walk in the rendered simulation*).\n", + "\n", + "* rewards should rise more or less steadily in this environment. There can be some drops due to instabilities of algorithm, but it should eventually start rising after 100K-200K iterations. If no progress in reward is observed after these first 100K-200K iterations, there is a bug.\n", + "* gradient norm appears to be quite big for this task, it is ok if it reaches 100-200 (we handled it with clip_grad_norm). Consider everything exploded if it starts growing exponentially, then there is a bug." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "seed = \n", + "np.random.seed(seed)\n", + "env.unwrapped.seed(seed)\n", + "torch.manual_seed(seed);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm.notebook import trange\n", + "\n", + "interaction_state = env.reset()\n", + "random_actor = RandomActor()\n", + "\n", + "for n_iterations in trange(0, 1000000, timesteps_per_epoch):\n", + " # if experience replay is small yet, no training happens\n", + " # we also collect data using random policy to collect more diverse starting data\n", + " if len(exp_replay) < start_timesteps:\n", + " _, interaction_state = play_and_record(interaction_state, random_actor, env, exp_replay, timesteps_per_epoch)\n", + " continue\n", + " \n", + " # perform a step in environment and store it in experience replay\n", + " _, interaction_state = play_and_record(interaction_state, actor, env, exp_replay, timesteps_per_epoch)\n", + " \n", + " # sample a batch from experience replay\n", + " states, actions, rewards, next_states, is_done = exp_replay.sample(batch_size)\n", + " \n", + " # move everything to PyTorch tensors\n", + " states = torch.tensor(states, device=DEVICE, dtype=torch.float)\n", + " actions = torch.tensor(actions, device=DEVICE, dtype=torch.float)\n", + " rewards = torch.tensor(rewards, device=DEVICE, dtype=torch.float)\n", + " next_states = torch.tensor(next_states, device=DEVICE, dtype=torch.float)\n", + " is_done = torch.tensor(\n", + " is_done.astype('float32'),\n", + " device=DEVICE,\n", + " dtype=torch.float\n", + " )\n", + " \n", + " # losses\n", + " critic1_loss = \n", + " optimize(\"critic1\", critic1, opt_critic1, critic1_loss)\n", + "\n", + " critic2_loss = \n", + " optimize(\"critic2\", critic2, opt_critic2, critic2_loss)\n", + "\n", + " # actor update is less frequent in TD3\n", + " if n_iterations % policy_update_freq == 
0:\n", + " actor_loss = \n", + " optimize(\"actor\", actor, opt_actor, actor_loss)\n", + "\n", + " # update target networks\n", + " update_target_networks(critic1, target_critic1)\n", + " update_target_networks(critic2, target_critic2)\n", + " update_target_networks(actor, target_actor) # comment this line if you chose SAC" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-09-16T18:41:47.560269Z", + "start_time": "2020-09-16T18:41:47.546277Z" + } + }, + "outputs": [], + "source": [ + "def evaluate(env, actor, n_games=1, t_max=1000):\n", + " '''\n", + " Plays n_games and returns rewards and rendered games\n", + " '''\n", + " rewards = []\n", + "\n", + " for _ in range(n_games):\n", + " s, _ = env.reset()\n", + "\n", + " R = 0\n", + " for _ in range(t_max):\n", + " # select action for final evaluation of your policy\n", + " action = \n", + "\n", + " assert (action.max() <= 1).all() and (action.min() >= -1).all()\n", + "\n", + " s, r, terminated, truncated, _ = env.step(action)\n", + "\n", + " R += r\n", + "\n", + " if terminated or truncated:\n", + " break\n", + "\n", + " rewards.append(R)\n", + " return np.array(rewards)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-09-16T18:38:45.130920Z", + "start_time": "2020-09-16T18:38:13.090472Z" + } + }, + "outputs": [], + "source": [ + "# evaluation will take some time!\n", + "sessions = evaluate(env, actor, n_games=20)\n", + "score = sessions.mean()\n", + "print(f\"Your score: {score}\")\n", + "\n", + "assert score >= 1000, \"Needs more training?\"\n", + "print(\"Well done!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "env.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Record" + ] + }, 
+ { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-09-16T18:43:19.559507Z", + "start_time": "2020-09-16T18:43:19.522533Z" + } + }, + "outputs": [], + "source": [ + "from gymnasium.wrappers import RecordVideo\n", + "\n", + "# let's hope this will work\n", + "# don't forget to pray\n", + "with gym.make(\"Ant-v4\", render_mode=\"rgb_array\") as env, RecordVideo(\n", + " env=env, video_folder=\"./videos\"\n", + ") as env_monitor:\n", + " # note that t_max is 300, so collected reward will be smaller than 1000\n", + " evaluate(env_monitor, actor, n_games=1, t_max=300)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show video. This may not work in some setups. If it doesn't\n", + "# work for you, you can download the videos and view them locally.\n", + "\n", + "from pathlib import Path\n", + "from base64 import b64encode\n", + "from IPython.display import HTML\n", + "import sys\n", + "\n", + "video_paths = sorted([s for s in Path('videos').iterdir() if s.suffix == '.mp4'])\n", + "video_path = video_paths[-1] # You can also try other indices\n", + "\n", + "if 'google.colab' in sys.modules:\n", + " # https://stackoverflow.com/a/57378660/1214547\n", + " with video_path.open('rb') as fp:\n", + " mp4 = fp.read()\n", + " data_url = 'data:video/mp4;base64,' + b64encode(mp4).decode()\n", + "else:\n", + " data_url = str(video_path)\n", + "\n", + "HTML(\"\"\"\n", + "\n", + "\"\"\".format(data_url))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Report\n", + "\n", + "We'd like to collect some statistics about computational resources you spent on this task. 
from collections import deque

import gymnasium as gym
import numpy as np
from tensorboardX import SummaryWriter


class TensorboardSummaries(gym.Wrapper):
    """Environment wrapper that logs per-episode statistics to tensorboard.

    The wrapper accumulates rewards and episode lengths for every
    sub-environment (``nenvs`` is read from ``env.unwrapped.nenvs`` and
    defaults to 1). Once every sub-environment has finished at least one
    episode since the previous write, it emits scalars under the
    ``Episodes/`` namespace through ``self.writer``.

    Parameters
    ----------
    env:
        The environment to wrap.
    prefix:
        Name of the log sub-directory (``logs/<prefix>``). Falls back to
        ``env.spec.id`` when empty or ``None``.
    running_mean_size:
        Maximum number of most recent episode rewards kept per
        sub-environment for the running-mean scalar.
    step_var:
        Accepted for backward compatibility but not used: the internal
        step counter always starts at 0.
    """

    def __init__(self, env, prefix=None, running_mean_size=100, step_var=None):
        super().__init__(env)
        # Initialised here but never updated by this class.
        self.episode_counter = 0
        self.prefix = prefix or self.env.spec.id
        self.writer = SummaryWriter(f"logs/{self.prefix}")
        # Global x-axis for all scalars; advanced by `nenvs` on every step().
        self.step_var = 0

        self.nenvs = getattr(self.env.unwrapped, "nenvs", 1)
        self.rewards = np.zeros(self.nenvs)
        self.had_ended_episodes = np.zeros(self.nenvs, dtype=bool)
        self.episode_lengths = np.zeros(self.nenvs)
        self.reward_queues = [
            deque([], maxlen=running_mean_size) for _ in range(self.nenvs)
        ]

    def should_write_summaries(self):
        """Return true once every sub-environment has finished an episode."""
        return np.all(self.had_ended_episodes)

    def add_summaries(self):
        """Write the episode scalars and reset the per-round bookkeeping."""
        last_rewards = [queue[-1] for queue in self.reward_queues]

        self.writer.add_scalar(
            "Episodes/total_reward",
            np.mean(last_rewards),
            self.step_var,
        )
        self.writer.add_scalar(
            f"Episodes/reward_mean_{self.reward_queues[0].maxlen}",
            np.mean([np.mean(queue) for queue in self.reward_queues]),
            self.step_var,
        )
        self.writer.add_scalar(
            "Episodes/episode_length", np.mean(self.episode_lengths), self.step_var
        )
        # Min/max only carry information when there are several environments.
        if self.had_ended_episodes.size > 1:
            self.writer.add_scalar(
                "Episodes/min_reward",
                min(last_rewards),
                self.step_var,
            )
            self.writer.add_scalar(
                "Episodes/max_reward",
                max(last_rewards),
                self.step_var,
            )
        self.episode_lengths.fill(0)
        self.had_ended_episodes.fill(False)

    def step(self, action):
        """Step the wrapped env, update statistics and forward its result.

        An episode counts as finished for sub-environment ``i`` when its info
        dict has a truthy ``"real_done"`` entry or, if that key is absent,
        when ``terminated[i] or truncated[i]`` is true.
        """
        obs, rew, terminated, truncated, info = self.env.step(action)
        self.rewards += rew
        # Only environments still inside their first episode of this round
        # keep accumulating length.
        self.episode_lengths[~self.had_ended_episodes] += 1

        info_collection = [info] if isinstance(info, dict) else info
        # np.atleast_1d handles python bools, numpy.bool_ scalars (which are
        # NOT instances of `bool`) and per-env arrays uniformly.
        terminated_collection = np.atleast_1d(terminated)
        truncated_collection = np.atleast_1d(truncated)
        done_indices = [
            i
            for i, env_info in enumerate(info_collection)
            if env_info.get(
                "real_done", terminated_collection[i] or truncated_collection[i]
            )
        ]
        for i in done_indices:
            # Record only the first finished episode per env in each round.
            if not self.had_ended_episodes[i]:
                self.had_ended_episodes[i] = True
                self.reward_queues[i].append(self.rewards[i])
                self.rewards[i] = 0

        self.step_var += self.nenvs
        if self.should_write_summaries():
            self.add_summaries()
        return obs, rew, terminated, truncated, info

    def reset(self, **kwargs):
        """Clear the running accumulators and reset the wrapped env."""
        self.rewards.fill(0)
        self.episode_lengths.fill(0)
        self.had_ended_episodes.fill(False)
        return self.env.reset(**kwargs)
b/week09_policy_II/test_ppo/log_probs.npy new file mode 100644 index 000000000..63387ab55 Binary files /dev/null and b/week09_policy_II/test_ppo/log_probs.npy differ diff --git a/week09_policy_II/test_ppo/observations.npy b/week09_policy_II/test_ppo/observations.npy new file mode 100644 index 000000000..bca704d8b Binary files /dev/null and b/week09_policy_II/test_ppo/observations.npy differ diff --git a/week09_policy_II/test_ppo/policy b/week09_policy_II/test_ppo/policy new file mode 100644 index 000000000..5cb4c8169 Binary files /dev/null and b/week09_policy_II/test_ppo/policy differ diff --git a/week09_policy_II/test_ppo/resets.npy b/week09_policy_II/test_ppo/resets.npy new file mode 100644 index 000000000..53c506758 Binary files /dev/null and b/week09_policy_II/test_ppo/resets.npy differ diff --git a/week09_policy_II/test_ppo/rewards.npy b/week09_policy_II/test_ppo/rewards.npy new file mode 100644 index 000000000..972587a7d Binary files /dev/null and b/week09_policy_II/test_ppo/rewards.npy differ diff --git a/week09_policy_II/test_ppo/state.npy b/week09_policy_II/test_ppo/state.npy new file mode 100644 index 000000000..a17063d8e Binary files /dev/null and b/week09_policy_II/test_ppo/state.npy differ diff --git a/week09_policy_II/test_ppo/value_targets.npy b/week09_policy_II/test_ppo/value_targets.npy new file mode 100644 index 000000000..0d105755a Binary files /dev/null and b/week09_policy_II/test_ppo/value_targets.npy differ diff --git a/week09_policy_II/test_ppo/values.npy b/week09_policy_II/test_ppo/values.npy new file mode 100644 index 000000000..de84548f3 Binary files /dev/null and b/week09_policy_II/test_ppo/values.npy differ diff --git a/week10_planning/README.md b/week10_planning/README.md index 042d8c513..5591f2c0f 100644 --- a/week10_planning/README.md +++ b/week10_planning/README.md @@ -1,7 +1,7 @@ ## Assignments -Just as usual, start with `seminar_MCTS.ipynb` -[![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/spring20/week10_planning/seminar_MCTS.ipynb) +Just as usual, start with `seminar_MCTS.ipynb` +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/yandexdataschool/Practical_RL/blob/master/week10_planning/seminar_MCTS.ipynb) ## Materials: planning @@ -13,9 +13,9 @@ Just as usual, start with `seminar_MCTS.ipynb` * Monte-carlo tree search * Monte-carlo tree search step-by-step by J.Levine (very intuitive) - [video](https://www.youtube.com/watch?v=UXW2yZndl7U) * Udacity video on monte-carlo tree search (first part of a chain) - [video](https://www.youtube.com/watch?v=onBYsen2_eA) - * A Survey of Monte Carlo Tree Search Methods (2011-2012) [pdf](http://mcts.ai/pubs/mcts-survey-master.pdf) + * A Survey of Monte Carlo Tree Search Methods (2011-2012) [pdf](http://ccg.doc.gold.ac.uk/ccg_old/papers/browne_tciaig12_1.pdf) * Reminder: UCB-1 - [slides](https://www.cs.bham.ac.uk/internal/courses/robotics/lectures/ucb1.pdf) - * Guide to MCTS - [url](https://jeffbradberry.com/posts/2015/09/intro-to-monte-carlo-tree-search/) + * Guide to MCTS - [url](https://jeffbradberry.com/posts/2015/09/intro-to-monte-carlo-tree-search/) or [cached](https://webcache.googleusercontent.com/search?q=cache:jeffbradberry.com/posts/2015/09/intro-to-monte-carlo-tree-search/) ## Supplementary materials diff --git a/week10_planning/seminar_MCTS.ipynb b/week10_planning/seminar_MCTS.ipynb index 864b7d86e..0b11c0607 100644 --- a/week10_planning/seminar_MCTS.ipynb +++ b/week10_planning/seminar_MCTS.ipynb @@ -70,7 +70,8 @@ "source": [ "import sys, os\n", "if 'google.colab' in sys.modules and not os.path.exists('.setup_complete'):\n", - " !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/spring20/setup_colab.sh -O- | bash\n", + " !wget -q 
https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash\n", + " !pip install -q gymnasium\n", "\n", " !touch .setup_complete\n", "\n", @@ -98,7 +99,7 @@ "source": [ "---\n", "\n", - "But before we do that, we first need to make a wrapper for Gym environments to allow saving and loading game states to facilitate backtracking." + "But before we do that, we first need to make a wrapper for Gymnasium environments to allow saving and loading game states to facilitate backtracking." ] }, { @@ -107,8 +108,7 @@ "metadata": {}, "outputs": [], "source": [ - "import gym\n", - "from gym.core import Wrapper\n", + "import gymnasium as gym\n", "from pickle import dumps, loads\n", "from collections import namedtuple\n", "\n", @@ -117,7 +117,7 @@ " \"action_result\", (\"snapshot\", \"observation\", \"reward\", \"is_done\", \"info\"))\n", "\n", "\n", - "class WithSnapshots(Wrapper):\n", + "class WithSnapshots(gym.Wrapper):\n", " \"\"\"\n", " Creates a wrapper that supports saving and loading environemnt states.\n", " Required for planning algorithms.\n", @@ -128,8 +128,8 @@ " - ...\n", "\n", " You can also use reset() and step() directly for convenience.\n", - " - s = self.reset() # same as self.env.reset()\n", - " - s, r, done, _ = self.step(action) # same as self.env.step(action)\n", + " - s, _ = self.reset() # same as self.env.reset()\n", + " - s, r, terminated, truncated, _ = self.step(action) # same as self.env.step(action)\n", " \n", " Note that while you may use self.render(), it will spawn a window that cannot be pickled.\n", " Thus, you will need to call self.close() before pickling will work again.\n", @@ -153,9 +153,10 @@ " self.render() # close popup windows since we can't pickle them\n", " self.close()\n", " \n", - " if self.unwrapped.viewer is not None:\n", - " self.unwrapped.viewer.close()\n", - " self.unwrapped.viewer = None\n", + " self.unwrapped.screen = None\n", + " self.unwrapped.clock = None\n", + " self.unwrapped.surf = 
None\n", + "\n", " return dumps(self.env)\n", "\n", " def load_snapshot(self, snapshot, render=False):\n", @@ -181,7 +182,8 @@ "\n", " :returns: next snapshot, next_observation, reward, is_done, info\n", "\n", - " Basically it returns next snapshot and everything that env.step would have returned.\n", + " Basically it returns next snapshot and almost everything that env.step would have returned.\n", + " Note that is_done = terminated or truncated\n", " \"\"\"\n", "\n", " \n", @@ -210,7 +212,9 @@ "outputs": [], "source": [ "# make env\n", - "env = WithSnapshots(gym.make(\"CartPole-v0\"))\n", + "env = WithSnapshots(gym.make(\"CartPole-v1\",\n", + " render_mode=\"rgb_array\",\n", + " max_episode_steps=200))\n", "env.reset()\n", "\n", "n_actions = env.action_space.n" @@ -223,7 +227,7 @@ "outputs": [], "source": [ "print(\"initial_state:\")\n", - "plt.imshow(env.render('rgb_array'))\n", + "plt.imshow(env.render())\n", "env.close()\n", "\n", "# create first snapshot\n", @@ -238,13 +242,16 @@ "source": [ "# play without making snapshots (faster)\n", "while True:\n", - " is_done = env.step(env.action_space.sample())[2]\n", - " if is_done:\n", + " _, _, terminated, truncated, _ = env.step(env.action_space.sample())\n", + " if terminated:\n", " print(\"Whoops! 
We died!\")\n", " break\n", + " if truncated:\n", + " print(\"Time is over!\")\n", + " break\n", "\n", "print(\"final state:\")\n", - "plt.imshow(env.render('rgb_array'))\n", + "plt.imshow(env.render())\n", "env.close()" ] }, @@ -258,7 +265,7 @@ "env.load_snapshot(snap0)\n", "\n", "print(\"\\n\\nAfter loading snapshot\")\n", - "plt.imshow(env.render('rgb_array'))\n", + "plt.imshow(env.render())\n", "env.close()" ] }, @@ -524,8 +531,8 @@ "metadata": {}, "outputs": [], "source": [ - "env = WithSnapshots(gym.make(\"CartPole-v0\"))\n", - "root_observation = env.reset()\n", + "env = WithSnapshots(gym.make(\"CartPole-v1\", render_mode=\"rgb_array\", max_episode_steps=200))\n", + "root_observation, _ = env.reset()\n", "root_snapshot = env.get_snapshot()\n", "root = Root(root_snapshot, root_observation)" ] @@ -559,7 +566,6 @@ "source": [ "from IPython.display import clear_output\n", "from itertools import count\n", - "from gym.wrappers import Monitor\n", "\n", "total_reward = 0 # sum of rewards\n", "test_env = loads(root_snapshot) # env used to show progress\n", @@ -570,16 +576,16 @@ " best_child = \n", "\n", " # take action\n", - " s, r, done, _ = test_env.step(best_child.action)\n", + " s, r, terminated, truncated, _ = test_env.step(best_child.action)\n", "\n", " # show image\n", " clear_output(True)\n", " plt.title(\"step %i\" % i)\n", - " plt.imshow(test_env.render('rgb_array'))\n", + " plt.imshow(test_env.render())\n", " plt.show()\n", "\n", " total_reward += r\n", - " if done:\n", + " if terminated or truncated:\n", " print(\"Finished with reward = \", total_reward)\n", " break\n", "\n", @@ -624,6 +630,11 @@ "\n", "\"Build this\" assignment\n", "\n", + "Don't forget to run:\n", + "``` \n", + "pip install gymnasium[atari,accept-rom-license]\n", + "``` \n", + "\n", "Apply MCTS to play Atari games. 
In particular, let's start with ```gym.make(\"MsPacman-ramDeterministic-v0\")```.\n", "\n", "This requires two things:\n", @@ -712,35 +723,9 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false + "pygments_lexer": "ipython3" } }, "nbformat": 4, diff --git a/youtube_dl_lectures.sh b/youtube_dl_lectures.sh index 313ead9ec..4d67e8538 100644 --- a/youtube_dl_lectures.sh +++ b/youtube_dl_lectures.sh @@ -25,7 +25,6 @@ youtube-dl 'https://www.youtube.com/watch?v=IL3gVyJMmhg' --output 'week03_model_ # week04_recap youtube-dl 'https://www.youtube.com/watch?v=uXt8qF2Zzfo' --output 'week04_[recap]_deep_learning/lect/Lecture_basics.mp4' youtube-dl 'https://www.youtube.com/watch?v=FmpDIaiMIeA' --output 'week04_[recap]_deep_learning/lect/Lecture_convnets.mp4' -youtube-dl 'https://www.youtube.com/watch?v=OU8I1oJ9HhI' --output 'week04_[recap]_deep_learning/lect/Tutorial_theano.mp4' # week04 youtube-dl 'https://www.youtube.com/watch?v=UoPei5o4fps' --output 'week04_approx_rl/lect/Lecture_Silver.mp4'