|
123 | 123 | "outputs": [], |
124 | 124 | "source": [ |
125 | 125 | "# call this every iteration that we need to get\n", |
126 | | - "# a batch of episodes. All environment interaction happend here \n", |
| 126 | + "# a batch of episodes. All environment interactions happend here \n", |
127 | 127 | "def iterate_batches(env, net, batch_size):\n", |
128 | 128 | " # this function is called to generate training batches\n", |
129 | 129 | " # as discussed in lecture, the algorithm will \n", |
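For reference, the body of `iterate_batches` (elided from this hunk) typically looks like the sketch below. This is a hedged reconstruction, not the notebook's exact code: it assumes the classic gym API (`env.reset()` returns only the observation, `env.step()` returns four values), that `net` outputs raw action logits, and the `Episode`/`EpisodeStep` namedtuples are illustrative helpers.

import numpy as np
import torch
import torch.nn as nn
from collections import namedtuple

# containers for one finished episode and one step within it (illustrative names)
Episode = namedtuple('Episode', field_names=['reward', 'steps'])
EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])

def iterate_batches(env, net, batch_size):
    # generator: plays episodes with the current policy network and
    # yields them in groups of batch_size for cross-entropy training
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    sm = nn.Softmax(dim=1)
    while True:
        # sample an action from the network's action distribution
        obs_v = torch.FloatTensor([obs])
        act_probs = sm(net(obs_v)).data.numpy()[0]
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, is_done, _ = env.step(action)
        episode_reward += reward
        episode_steps.append(EpisodeStep(observation=obs, action=action))
        if is_done:
            # episode finished: record it and reset the environment
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == batch_size:
                yield batch
                batch = []
        obs = next_obs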
|
379 | 379 | " # change observation space to one hot encoded version \n", |
380 | 380 | " # we do this so that our neural network can stay the same\n", |
381 | 381 | " # this defines the vector of length N, with values of 0.0 up to 1.0\n", |
382 | | - " # In the gym a box is like a tensor (ugh)\n", |
| 382 | + " # In the gym a box is like a tensor...\n", |
383 | 383 | " self.observation_space = gym.spaces.Box(0.0, 1.0, (env.observation_space.n, ), \n", |
384 | 384 | " dtype=np.float32)\n", |
385 | 385 | "\n", |
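The wrapper this hunk belongs to is typically a `gym.ObservationWrapper` whose `observation` method produces the one-hot vector matching the Box space defined above. A minimal sketch under that assumption (the class name `DiscreteOneHotWrapper` is illustrative):

import gym
import gym.spaces
import numpy as np

class DiscreteOneHotWrapper(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        assert isinstance(env.observation_space, gym.spaces.Discrete)
        # replace Discrete(N) with a length-N float Box so the same
        # network architecture used earlier can consume these states
        self.observation_space = gym.spaces.Box(
            0.0, 1.0, (env.observation_space.n,), dtype=np.float32)

    def observation(self, observation):
        # one-hot encode the integer state index
        res = np.copy(self.observation_space.low)
        res[observation] = 1.0
        return res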
|
465 | 465 | { |
466 | 466 | "cell_type": "code", |
467 | 467 | "execution_count": 20, |
468 | | - "metadata": { |
469 | | - "scrolled": false |
470 | | - }, |
| 468 | + "metadata": {}, |
471 | 469 | "outputs": [ |
472 | 470 | { |
473 | 471 | "name": "stdout", |
|
545 | 543 | "\n", |
546 | 544 | "**Why was this not working?**\n", |
547 | 545 | "\n", |
548 | | - "Firstly, the input space is sparse so its harder to learn new observations from the randomized neural network, especially for rarely occurring observations (like when we get past the first few steps). Also, the reward is only given at the end and its unlikely for us to reach the end, so we need to do alot of exploring... And most of the time, there is not percentile that actually worked, so we never learn to emulate the output. \n", |
| 546 | + "Firstly, the input space is sparse so its harder to learn new observations from the randomized neural network, especially for rarely occurring observations (like when we get past the first few steps). This is oour first insight into sample efficiency of an algorithm. The cross entropy method does not seem to be sample efficient when working with a sparse state space. Also, the reward is only given at the end and its unlikely for us to reach the end, so we need to do alot of exploring... And most of the time, there is not percentile that actually worked, so we never learn to emulate the output. \n", |
549 | 547 | "\n", |
550 | 548 | "It seems like even this simple problem is hard for cross entropy to solve. Perhaps we should go back to the basics of learning optimal policies? Yes! Let's see about value iteration.\n", |
551 | 549 | "\n", |
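Since the notebook pivots to value iteration here, a compact tabular sketch may help. It is an assumption-laden illustration, not the notebook's code: it uses FrozenLake's transition table in the `env.unwrapped.P` format (`P[s][a]` is a list of `(prob, next_state, reward, done)` tuples), and `gamma` and `theta` are hypothetical parameter names.

import numpy as np

def value_iteration(P, n_states, n_actions, gamma=0.9, theta=1e-8):
    # iterate the Bellman optimality backup until values stop changing
    V = np.zeros(n_states)
    while True:
        delta = 0.0
        for s in range(n_states):
            # action values Q(s, a) under the current estimate V
            q = [sum(p * (r + gamma * V[s2] * (not done))
                     for p, s2, r, done in P[s][a])
                 for a in range(n_actions)]
            best = max(q)
            delta = max(delta, abs(best - V[s]))
            V[s] = best
        if delta < theta:
            return V

Usage on FrozenLake would look like V = value_iteration(env.unwrapped.P, env.observation_space.n, env.action_space.n).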
|
678 | 676 | { |
679 | 677 | "cell_type": "code", |
680 | 678 | "execution_count": 20, |
681 | | - "metadata": { |
682 | | - "scrolled": false |
683 | | - }, |
| 679 | + "metadata": {}, |
684 | 680 | "outputs": [ |
685 | 681 | { |
686 | 682 | "name": "stdout", |
|
729 | 725 | { |
730 | 726 | "cell_type": "code", |
731 | 727 | "execution_count": 21, |
732 | | - "metadata": { |
733 | | - "scrolled": false |
734 | | - }, |
| 728 | + "metadata": {}, |
735 | 729 | "outputs": [ |
736 | 730 | { |
737 | 731 | "name": "stdout", |
|
924 | 918 | { |
925 | 919 | "cell_type": "code", |
926 | 920 | "execution_count": 4, |
927 | | - "metadata": { |
928 | | - "scrolled": false |
929 | | - }, |
| 921 | + "metadata": {}, |
930 | 922 | "outputs": [ |
931 | 923 | { |
932 | 924 | "name": "stdout", |
|
1176 | 1168 | { |
1177 | 1169 | "cell_type": "code", |
1178 | 1170 | "execution_count": 10, |
1179 | | - "metadata": { |
1180 | | - "scrolled": false |
1181 | | - }, |
| 1171 | + "metadata": {}, |
1182 | 1172 | "outputs": [ |
1183 | 1173 | { |
1184 | 1174 | "name": "stdout", |
|
1508 | 1498 | "cell_type": "code", |
1509 | 1499 | "execution_count": 1, |
1510 | 1500 | "metadata": { |
1511 | | - "collapsed": true |
| 1501 | + "collapsed": true, |
| 1502 | + "jupyter": { |
| 1503 | + "outputs_hidden": true |
| 1504 | + } |
1512 | 1505 | }, |
1513 | 1506 | "outputs": [], |
1514 | 1507 | "source": [ |
|
1649 | 1642 | "cell_type": "code", |
1650 | 1643 | "execution_count": 2, |
1651 | 1644 | "metadata": { |
1652 | | - "collapsed": true |
| 1645 | + "collapsed": true, |
| 1646 | + "jupyter": { |
| 1647 | + "outputs_hidden": true |
| 1648 | + } |
1653 | 1649 | }, |
1654 | 1650 | "outputs": [], |
1655 | 1651 | "source": [ |
|
1700 | 1696 | { |
1701 | 1697 | "cell_type": "code", |
1702 | 1698 | "execution_count": 4, |
1703 | | - "metadata": { |
1704 | | - "scrolled": false |
1705 | | - }, |
| 1699 | + "metadata": {}, |
1706 | 1700 | "outputs": [ |
1707 | 1701 | { |
1708 | 1702 | "name": "stdout", |
|
1861 | 1855 | "Best mean reward updated 0.540 -> 0.550, model saved\n", |
1862 | 1856 | "Best mean reward updated 0.550 -> 0.560, model saved\n", |
1863 | 1857 | "Best mean reward updated 0.560 -> 0.570, model saved\n", |
1864 | | - "Best mean reward updated 0.570 -> 0.580, model saved\n" |
| 1858 | + "Best mean reward updated 0.570 -> 0.580, model saved\n", |
| 1859 | + "Best mean reward updated 0.580 -> 0.590, model saved\n", |
| 1860 | + "Best mean reward updated 0.590 -> 0.600, model saved\n", |
| 1861 | + "Best mean reward updated 0.600 -> 0.610, model saved\n", |
| 1862 | + "Best mean reward updated 0.610 -> 0.620, model saved\n", |
| 1863 | + "103300: done 8920 iterations, mean reward 0.620, eps 0.00\n", |
| 1864 | + "Best mean reward updated 0.620 -> 0.630, model saved\n", |
| 1865 | + "Best mean reward updated 0.630 -> 0.640, model saved\n", |
| 1866 | + "Best mean reward updated 0.640 -> 0.650, model saved\n", |
| 1867 | + "Best mean reward updated 0.650 -> 0.660, model saved\n", |
| 1868 | + "Best mean reward updated 0.660 -> 0.670, model saved\n", |
| 1869 | + "Best mean reward updated 0.670 -> 0.680, model saved\n", |
| 1870 | + "Best mean reward updated 0.680 -> 0.690, model saved\n", |
| 1871 | + "Best mean reward updated 0.690 -> 0.700, model saved\n", |
| 1872 | + "Best mean reward updated 0.700 -> 0.710, model saved\n", |
| 1873 | + "Best mean reward updated 0.710 -> 0.720, model saved\n", |
| 1874 | + "Best mean reward updated 0.720 -> 0.730, model saved\n", |
| 1875 | + "Best mean reward updated 0.730 -> 0.740, model saved\n", |
| 1876 | + "108400: done 9053 iterations, mean reward 0.690, eps 0.00\n", |
| 1877 | + "112700: done 9163 iterations, mean reward 0.580, eps 0.00\n", |
| 1878 | + "113200: done 9172 iterations, mean reward 0.590, eps 0.00\n", |
| 1879 | + "113900: done 9191 iterations, mean reward 0.570, eps 0.00\n", |
| 1880 | + "121200: done 9360 iterations, mean reward 0.690, eps 0.00\n", |
| 1881 | + "123400: done 9408 iterations, mean reward 0.710, eps 0.00\n", |
| 1882 | + "123600: done 9415 iterations, mean reward 0.710, eps 0.00\n", |
| 1883 | + "123800: done 9420 iterations, mean reward 0.730, eps 0.00\n", |
| 1884 | + "125300: done 9454 iterations, mean reward 0.650, eps 0.00\n", |
| 1885 | + "126500: done 9479 iterations, mean reward 0.630, eps 0.00\n", |
| 1886 | + "130100: done 9561 iterations, mean reward 0.730, eps 0.00\n", |
| 1887 | + "Best mean reward updated 0.740 -> 0.750, model saved\n", |
| 1888 | + "Best mean reward updated 0.750 -> 0.760, model saved\n", |
| 1889 | + "Best mean reward updated 0.760 -> 0.770, model saved\n", |
| 1890 | + "131000: done 9585 iterations, mean reward 0.770, eps 0.00\n", |
| 1891 | + "Best mean reward updated 0.770 -> 0.780, model saved\n", |
| 1892 | + "Best mean reward updated 0.780 -> 0.790, model saved\n", |
| 1893 | + "Best mean reward updated 0.790 -> 0.800, model saved\n", |
| 1894 | + "Best mean reward updated 0.800 -> 0.810, model saved\n", |
| 1895 | + "Solved in 132361 frames!\n" |
1865 | 1896 | ] |
1866 | 1897 | }, |
1867 | 1898 | { |
|
2023 | 2054 | "cell_type": "code", |
2024 | 2055 | "execution_count": 5, |
2025 | 2056 | "metadata": { |
2026 | | - "collapsed": true |
| 2057 | + "collapsed": true, |
| 2058 | + "jupyter": { |
| 2059 | + "outputs_hidden": true |
| 2060 | + } |
2027 | 2061 | }, |
2028 | 2062 | "outputs": [], |
2029 | 2063 | "source": [ |
|
2138 | 2172 | { |
2139 | 2173 | "cell_type": "code", |
2140 | 2174 | "execution_count": null, |
2141 | | - "metadata": { |
2142 | | - "scrolled": false |
2143 | | - }, |
| 2175 | + "metadata": {}, |
2144 | 2176 | "outputs": [], |
2145 | 2177 | "source": [ |
2146 | 2178 | "# load up some utilities \n", |
|
2208 | 2240 | { |
2209 | 2241 | "cell_type": "code", |
2210 | 2242 | "execution_count": null, |
2211 | | - "metadata": { |
2212 | | - "scrolled": false |
2213 | | - }, |
| 2243 | + "metadata": {}, |
2214 | 2244 | "outputs": [], |
2215 | 2245 | "source": [ |
2216 | 2246 | "# training (no resets of the Agent or training values)\n", |
|
2495 | 2525 | "name": "python", |
2496 | 2526 | "nbconvert_exporter": "python", |
2497 | 2527 | "pygments_lexer": "ipython3", |
2498 | | - "version": "3.8.16" |
| 2528 | + "version": "3.11.9" |
2499 | 2529 | } |
2500 | 2530 | }, |
2501 | 2531 | "nbformat": 4, |
2502 | | - "nbformat_minor": 2 |
| 2532 | + "nbformat_minor": 4 |
2503 | 2533 | } |