Commit 31e328e

Add terminate block
1 parent 8ca3d48 commit 31e328e
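
The terminate block is a new final code cell that calls google.colab.runtime.unassign(), which releases the assigned Colab runtime once the notebook has finished. The remaining changes appear to come from re-saving the notebook JSON (cells listed before the top-level metadata, keys sorted alphabetically); the existing cell sources are unchanged.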

File tree

1 file changed (+174, -163 lines)

gpt_pytorch.ipynb

Lines changed: 174 additions & 163 deletions
@@ -1,165 +1,176 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "private_outputs": true,
-      "provenance": [],
-      "machine_shape": "hm",
-      "gpuType": "V100"
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    },
-    "accelerator": "GPU"
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "xosv4DUIlvi9"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install torchinfo"
+   ]
   },
-  "cells": [
-    {
-      "cell_type": "code",
-      "source": [
-        "!pip install torchinfo"
-      ],
-      "metadata": {
-        "id": "xosv4DUIlvi9"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "ma5VwOTFT0CX"
-      },
-      "outputs": [],
-      "source": [
-        "import torch\n",
-        "import torch.nn as nn\n",
-        "import math\n",
-        "import torch.nn.functional as F\n",
-        "from torchinfo import summary"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "class MaskedMultiSelfAttention(nn.Module):\n",
-        "    def __init__(self, h_dim, max_T, n_heads, drop_p):\n",
-        "        super().__init__()\n",
-        "        self.n_heads = n_heads\n",
-        "\n",
-        "        self.q_net = nn.Linear(h_dim, h_dim)\n",
-        "        self.k_net = nn.Linear(h_dim, h_dim)\n",
-        "        self.v_net = nn.Linear(h_dim, h_dim)\n",
-        "\n",
-        "        self.proj_net = nn.Linear(h_dim, h_dim)\n",
-        "\n",
-        "        self.attn_drop = nn.Dropout(drop_p)\n",
-        "        self.proj_drop = nn.Dropout(drop_p)\n",
-        "\n",
-        "        # Make lower triangle matrix with one\n",
-        "        ones = torch.ones((max_T, max_T))\n",
-        "        mask = torch.tril(ones).view(1, 1, max_T, max_T)\n",
-        "\n",
-        "        # mask is constant\n",
-        "        self.register_buffer('mask', mask)\n",
-        "\n",
-        "    def forward(self, x):\n",
-        "        B, T, C = x.shape\n",
-        "        N, D = self.n_heads, C // self.n_heads\n",
-        "\n",
-        "        q = self.q_net(x).view(B, T, N, D).transpose(1, 2)\n",
-        "        k = self.k_net(x).view(B, T, N, D).transpose(1, 2)\n",
-        "        v = self.v_net(x).view(B, T, N, D).transpose(1, 2)\n",
-        "\n",
-        "        weights = q @ k.transpose(2, 3) / math.sqrt(D)\n",
-        "\n",
-        "        # Masked causal weights\n",
-        "        weights.masked_fill(self.mask[..., :T, :T] == 0, float('-inf'))\n",
-        "\n",
-        "        # Normalize weights : all -inf -> 0 after softmax\n",
-        "        normalized_weights = F.softmax(weights, dim=-1)\n",
-        "\n",
-        "        # Masked causal attention (B, N, T, D)\n",
-        "        attention = self.attn_drop(normalized_weights @ v)\n",
-        "        attention = attention.transpose(1, 2).contiguous().view(B, T, N * D)\n",
-        "\n",
-        "        out = self.proj_drop(self.proj_net(attention))\n",
-        "        return out"
-      ],
-      "metadata": {
-        "id": "THxWrPdLVAd6"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "class TransformerDecoderBlock(nn.Module):\n",
-        "    def __init__(self, h_dim, max_T, n_heads, drop_p):\n",
-        "        super().__init__()\n",
-        "        self.attn = MaskedMultiSelfAttention(h_dim, max_T, n_heads, drop_p)\n",
-        "        self.mlp = nn.Sequential(\n",
-        "            nn.Linear(h_dim, 4 * h_dim),\n",
-        "            nn.GELU(),\n",
-        "            nn.Linear(4 * h_dim, h_dim),\n",
-        "            nn.Dropout(drop_p)\n",
-        "        )\n",
-        "        self.ln1 = nn.LayerNorm(h_dim)\n",
-        "        self.ln2 = nn.LayerNorm(h_dim)\n",
-        "\n",
-        "    def forward(self, x):\n",
-        "        # MaskedMultiSelfAttention -> LayerNorm -> FeedForward -> LayerNorm\n",
-        "        x = self.attn(x) + x\n",
-        "        x = self.ln1(x)\n",
-        "        x = self.mlp(x) + x\n",
-        "        x = self.ln2(x)\n",
-        "        return x"
-      ],
-      "metadata": {
-        "id": "copsi6yLUBtW"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "B, T, D = 4, 8, 64\n",
-        "n_heads = 12"
-      ],
-      "metadata": {
-        "id": "050mS7V4kyuO"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "block = TransformerDecoderBlock(h_dim=n_heads*D, max_T=T, n_heads=n_heads, drop_p=0.1)"
-      ],
-      "metadata": {
-        "id": "kZHgTOFuk6cw"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "summary(block, input_size=(B, T, n_heads * D))"
-      ],
-      "metadata": {
-        "id": "HS6ByiOPlars"
-      },
-      "execution_count": null,
-      "outputs": []
-    }
-  ]
-}
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "ma5VwOTFT0CX"
+   },
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "import math\n",
+    "import torch.nn.functional as F\n",
+    "from torchinfo import summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "THxWrPdLVAd6"
+   },
+   "outputs": [],
+   "source": [
+    "class MaskedMultiSelfAttention(nn.Module):\n",
+    "    def __init__(self, h_dim, max_T, n_heads, drop_p):\n",
+    "        super().__init__()\n",
+    "        self.n_heads = n_heads\n",
+    "\n",
+    "        self.q_net = nn.Linear(h_dim, h_dim)\n",
+    "        self.k_net = nn.Linear(h_dim, h_dim)\n",
+    "        self.v_net = nn.Linear(h_dim, h_dim)\n",
+    "\n",
+    "        self.proj_net = nn.Linear(h_dim, h_dim)\n",
+    "\n",
+    "        self.attn_drop = nn.Dropout(drop_p)\n",
+    "        self.proj_drop = nn.Dropout(drop_p)\n",
+    "\n",
+    "        # Make lower triangle matrix with one\n",
+    "        ones = torch.ones((max_T, max_T))\n",
+    "        mask = torch.tril(ones).view(1, 1, max_T, max_T)\n",
+    "\n",
+    "        # mask is constant\n",
+    "        self.register_buffer('mask', mask)\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        B, T, C = x.shape\n",
+    "        N, D = self.n_heads, C // self.n_heads\n",
+    "\n",
+    "        q = self.q_net(x).view(B, T, N, D).transpose(1, 2)\n",
+    "        k = self.k_net(x).view(B, T, N, D).transpose(1, 2)\n",
+    "        v = self.v_net(x).view(B, T, N, D).transpose(1, 2)\n",
+    "\n",
+    "        weights = q @ k.transpose(2, 3) / math.sqrt(D)\n",
+    "\n",
+    "        # Masked causal weights\n",
+    "        weights.masked_fill(self.mask[..., :T, :T] == 0, float('-inf'))\n",
+    "\n",
+    "        # Normalize weights : all -inf -> 0 after softmax\n",
+    "        normalized_weights = F.softmax(weights, dim=-1)\n",
+    "\n",
+    "        # Masked causal attention (B, N, T, D)\n",
+    "        attention = self.attn_drop(normalized_weights @ v)\n",
+    "        attention = attention.transpose(1, 2).contiguous().view(B, T, N * D)\n",
+    "\n",
+    "        out = self.proj_drop(self.proj_net(attention))\n",
+    "        return out"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "copsi6yLUBtW"
+   },
+   "outputs": [],
+   "source": [
+    "class TransformerDecoderBlock(nn.Module):\n",
+    "    def __init__(self, h_dim, max_T, n_heads, drop_p):\n",
+    "        super().__init__()\n",
+    "        self.attn = MaskedMultiSelfAttention(h_dim, max_T, n_heads, drop_p)\n",
+    "        self.mlp = nn.Sequential(\n",
+    "            nn.Linear(h_dim, 4 * h_dim),\n",
+    "            nn.GELU(),\n",
+    "            nn.Linear(4 * h_dim, h_dim),\n",
+    "            nn.Dropout(drop_p)\n",
+    "        )\n",
+    "        self.ln1 = nn.LayerNorm(h_dim)\n",
+    "        self.ln2 = nn.LayerNorm(h_dim)\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        # MaskedMultiSelfAttention -> LayerNorm -> FeedForward -> LayerNorm\n",
+    "        x = self.attn(x) + x\n",
+    "        x = self.ln1(x)\n",
+    "        x = self.mlp(x) + x\n",
+    "        x = self.ln2(x)\n",
+    "        return x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "050mS7V4kyuO"
+   },
+   "outputs": [],
+   "source": [
+    "B, T, D = 4, 8, 64\n",
+    "n_heads = 12"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "kZHgTOFuk6cw"
+   },
+   "outputs": [],
+   "source": [
+    "block = TransformerDecoderBlock(h_dim=n_heads*D, max_T=T, n_heads=n_heads, drop_p=0.1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "HS6ByiOPlars"
+   },
+   "outputs": [],
+   "source": [
+    "summary(block, input_size=(B, T, n_heads * D))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from google.colab import runtime\n",
+    "\n",
+    "runtime.unassign()"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "gpuType": "V100",
+   "machine_shape": "hm",
+   "private_outputs": true,
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
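
A side note on the attention code carried over unchanged in this diff: torch.Tensor.masked_fill returns a new tensor rather than modifying its argument, so the unassigned masked_fill(...) call above leaves the weights unmasked when softmax runs. Below is a minimal sketch of the same causal masking with the result kept; the shapes and the B, T, n_heads, D values are taken from the notebook, but the fix itself is an assumption about the intent, not part of this commit.

import math
import torch
import torch.nn.functional as F

B, T, N, D = 4, 8, 12, 64                    # batch, sequence length, heads, head dim (notebook values)
q = torch.randn(B, N, T, D)                  # per-head queries
k = torch.randn(B, N, T, D)                  # per-head keys
v = torch.randn(B, N, T, D)                  # per-head values

mask = torch.tril(torch.ones(T, T)).view(1, 1, T, T)     # lower-triangular causal mask

weights = q @ k.transpose(2, 3) / math.sqrt(D)           # (B, N, T, T) scaled dot products
# keep the result (or use the in-place masked_fill_) so the mask actually applies
weights = weights.masked_fill(mask[..., :T, :T] == 0, float('-inf'))
attention = F.softmax(weights, dim=-1) @ v               # (B, N, T, D); future positions get zero weight

Inside MaskedMultiSelfAttention.forward, the equivalent change would be reassigning weights or calling weights.masked_fill_.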
