From a0c0c765a8d914d9b508fbd5633765ff7e6dc4fd Mon Sep 17 00:00:00 2001
From: Daniel Kleine <53251018+d-kleine@users.noreply.github.com>
Date: Sun, 6 Oct 2024 16:56:55 +0200
Subject: [PATCH] fixed Llama 2 to 3.2 NBs (#388)

* updated requirements

* fixes llama2 to llama3

* fixed llama 3.2 standalone

* fixed typo

* fixed rope formula

* Update requirements-extra.txt

* Update ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb

* Update ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb

* Update ch05/07_gpt_to_llama/standalone-llama32.ipynb

---------

Co-authored-by: Sebastian Raschka

---
 .../converting-llama2-to-llama3.ipynb         | 129 ++++++------------
 ch05/07_gpt_to_llama/requirements-extra.txt   |   2 +
 ch05/07_gpt_to_llama/standalone-llama32.ipynb |  45 +++---
 3 files changed, 70 insertions(+), 106 deletions(-)

diff --git a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
index 0f13d646..79659c3b 100644
--- a/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
+++ b/ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
@@ -135,9 +135,9 @@
 "- If you are new to implementing LLM architectures, I recommend starting with [chapter 4](../../ch04/01_main-chapter-code/ch04.ipynb), which walks you through the implementation of the original GPT architecture step by step\n",
 "- The [Converting a From-Scratch GPT Architecture to Llama 2](./converting-gpt-to-llama2.ipynb) then implements the Llama-specific components, such as RMSNorm layers, SiLU and SwiGLU activations, RoPE (rotary position embeddings), and the SentencePiece tokenizer\n",
 "- This notebook takes the Llama 2 architecture and transforms it into Llama 3 architecture by\n",
- " 1. modifying the rotary embeddings\n",
- " 2. implementing grouped-query attention\n",
- " 3. and using a customized version of the GPT-4 tokenizer\n",
+ " 1. modifying the rotary embeddings\n",
+ " 2. implementing grouped-query attention\n",
+ " 3. and using a customized version of the GPT-4 tokenizer\n",
 "- Later, we then load the original Llama 3 weights shared by Meta AI into the architecture"
 ]
 },
@@ -256,7 +256,7 @@
 " - Llama 3 now supports up to 8,192 tokens, twice as many as Llama 2 (4,096)\n",
 " - The base value for the so-called RoPE $\\theta$ (see equation below) was increased from 10,000 (Llama 2) to 50,000 (Llama 3) in the following equation (adapted from the [RoPE paper](https://arxiv.org/abs/2104.09864))\n",
 "\n",
- "$$\\Theta = \\left\\{\\theta_i = \\text{base}^{\\frac{2(i-1)}{d}}, i \\in \\left[1, 2, ..., d/2\\right]\\right\\}$$\n",
+ "$$\\Theta = \\left\\{\\theta_i = \\text{base}^{\\frac{-2(i-1)}{d}}, i \\in \\left[1, 2, ..., d/2\\right]\\right\\}$$\n",
 "\n",
 "- These $\\theta$ values are a set of predefined parameters that are used to determine the rotational angles in the rotary matrix, where $d$ is the dimensionality of the embedding space\n",
 "- Increasing the base from 10,000 to 50,000 makes the frequencies (or rotation angles) decay more slowly across the dimensions, which means that higher dimensions will be associated with larger angles than before (essentially, it's a decompression of the frequencies)\n",
@@ -812,7 +812,6 @@
 " self.out_head = nn.Linear(cfg[\"emb_dim\"], cfg[\"vocab_size\"], bias=False, dtype=cfg[\"dtype\"])\n",
 "\n",
 " def forward(self, in_idx):\n",
- " batch_size, seq_len = in_idx.shape\n",
 " tok_embeds = self.tok_emb(in_idx)\n",
 " x = tok_embeds\n",
 " x = self.trf_blocks(x)\n",
@@ -1068,7 +1067,6 @@
 " def __init__(self, model_path):\n",
 " assert os.path.isfile(model_path), f\"Model file {model_path} not found\"\n",
 " mergeable_ranks = load_tiktoken_bpe(model_path)\n",
- " num_base_tokens = len(mergeable_ranks)\n",
 "\n",
 " self.special_tokens = {\n",
 " \"<|begin_of_text|>\": 128000,\n",
@@ -1189,20 +1187,7 @@
 "id": "69714ea8-b9b8-4687-8392-f3abb8f93a32",
 "outputId": "c9836ba8-5176-4dd5-b618-6cc36fdbe1f0"
 },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n",
- "The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
- "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
- "You will be able to reuse this secret in all of your notebooks.\n",
- "Please note that authentication is recommended but still optional to access public models or datasets.\n",
- " warnings.warn(\n"
- ]
- }
- ],
+ "outputs": [],
 "source": [
 "from huggingface_hub import hf_hub_download\n",
 "\n",
@@ -1275,8 +1260,8 @@
 "output_type": "stream",
 "text": [
 "Output text:\n",
- " Every effort_dead aeros Ingredients başında.extension clangmissions.esp 사진 Ek Pars til DoctorsDaoеньostivan normal Ekized � Ekized � Ek rdr tık%,orgen>',\n",
- "\n"
+ " Every effort_dead aeros Ingredients başında.extensionégor clangmissions güc như submodule.and report官方%,.Reader(\",\");\n",
+ "ामल ندار Parliamentary !!! HigginsDynamicZhgmt writeln Globalsletion 사진------\n"
 ]
 }
 ],
@@ -1394,7 +1379,7 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
- "model_id": "f3788acce34f4956b0727b58d0cf38c6",
+ "model_id": "245443330e4d40c887a5649cc1663e98",
 "version_major": 2,
 "version_minor": 0
 },
@@ -1404,48 +1389,6 @@
 },
 "metadata": {},
 "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "8ae98969541849efa356cf912ac39b1e",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "model-00002-of-00004.safetensors: 0%| | 0.00/5.00G [00:00<?, ?B/s]"
 "'<|start_header_id|>user<|end_header_id|>\\n\\nHello World!<|eot_id|>'"
 ]
 },
@@ -1984,15 +1924,12 @@
 "\n",
 "1. Grass: Llamas love to graze on grass, especially in the spring and summer months.\n",
 "2. Hay: Hay is a staple in a llama's diet. They like to eat timothy hay, alfalfa hay, and other types of hay.\n",
- "3. Grains: Llamas may also be fed grains like oats, barley, and corn. However, grains should not make up more than 10% of a llama's diet.\n",
- "4. Fruits and vegetables: Llamas may enjoy fruits and vegetables as treats, such as apples,\n"
+ "3. Grains: Llamas may also be fed grains like oats, barley, and corn. However, grains should not make up more than 10-15% of a llama's diet.\n",
+ "4. Fruits and vegetables: Llamas may enjoy fruits and vegetables as treats, such as\n"
 ]
 }
 ],
 "source": [
- "import re\n",
- "\n",
- "\n",
 "torch.manual_seed(123)\n",
 "\n",
 "token_ids = generate(\n",
@@ -2144,7 +2081,7 @@
 "tokenizer_file_path = hf_hub_download(\n",
 " repo_id=\"meta-llama/Llama-3.1-8B\",\n",
 " filename=\"original/tokenizer.model\",\n",
- " local_dir=\"llama3-files\"\n",
+ " local_dir=\"llama31-files\"\n",
 ")\n",
 "\n",
 "tokenizer = Tokenizer(tokenizer_file_path)"
@@ -2239,7 +2176,7 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
- "model_id": "5bbaa046d8934c8fae0a12c3d7bd991b",
+ "model_id": "eabfde3ef38b436ea750e6fb50a02b5c",
 "version_major": 2,
 "version_minor": 0
 },
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
- "model_id": "af985cf6fa26475eb2c4dd81e0c79ff4",
+ "model_id": "e117ad45771747ae95c16f9876e6dc19",
 "version_major": 2,
 "version_minor": 0
 },
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
- "model_id": "dffa208978f34e6a9aae94ecda92fe67",
+ "model_id": "170185f2f046437dab57c2ad23163c5c",
 "version_major": 2,
 "version_minor": 0
 },
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
- "model_id": "2ffd8dbed00e46d2887b9a2590cad297",
+ "model_id": "6e65f5d6c5af4ab78bc7b3778b98ef86",
 "version_major": 2,
 "version_minor": 0
 },
@@ -2300,7 +2237,7 @@
 " weights_file = hf_hub_download(\n",
 " repo_id=\"meta-llama/Llama-3.1-8B\",\n",
 " filename=f\"model-0000{i}-of-00004.safetensors\",\n",
- " local_dir=\"llama3-files\"\n",
+ " local_dir=\"llama31-files\"\n",
 " )\n",
 " current_weights = load_file(weights_file)\n",
 " combined_weights.update(current_weights)\n",
@@ -2410,7 +2347,7 @@
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
 " \"rope_base\": 50_000, # The base in RoPE's \"theta\"\n",
 " \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
- " \"rope_freq\": { # RoPE frequency scaling\n",
+ " \"rope_freq\": { # RoPE frequency scaling\n",
 " \"factor\": 8.0,\n",
 " \"low_freq_factor\": 1.0,\n",
 " \"high_freq_factor\": 4.0,\n",
@@ -2425,7 +2362,7 @@
 " \"emb_dim\": 2048, # NEW: Half the embedding dimension\n",
 " \"n_heads\": 32, # Number of attention heads\n",
 " \"n_layers\": 16, # NEW: Half the number of layers\n",
- " \"hidden_dim\": 8192, # NEW: Almopst half the size of the intermediate dimension in FeedForward\n",
+ " \"hidden_dim\": 8192, # NEW: Almost half the size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
 " \"rope_base\": 50_000, # The base in RoPE's \"theta\"\n",
 " \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
@@ -2489,7 +2426,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 50,
+ "execution_count": 46,
 "id": "uf8KjasmRFSt",
 "metadata": {
 "colab": {
@@ -2532,6 +2469,20 @@
 "outputId": "35588405-e2e1-4871-a1db-1d4bcb852e49"
 },
 "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c309c56a6cdf426e8ba7967b6a21864e",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "model.safetensors: 0%| | 0.00/2.47G [00:00<?, ?B/s]"
diff --git a/ch05/07_gpt_to_llama/requirements-extra.txt b/ch05/07_gpt_to_llama/requirements-extra.txt
 blobfile>=3.0.0
 huggingface_hub>=0.24.7
+ipywidgets>=8.1.2
+safetensors>=0.4.4
 sentencepiece>=0.1.99
diff --git a/ch05/07_gpt_to_llama/standalone-llama32.ipynb b/ch05/07_gpt_to_llama/standalone-llama32.ipynb
index ff743dd3..e2ac7479 100644
--- a/ch05/07_gpt_to_llama/standalone-llama32.ipynb
+++ b/ch05/07_gpt_to_llama/standalone-llama32.ipynb
@@ -69,9 +69,9 @@
 "output_type": "stream",
 "text": [
 "blobfile version: 3.0.0\n",
- "huggingface_hub version: 0.25.1\n",
+ "huggingface_hub version: 0.25.0\n",
 "tiktoken version: 0.7.0\n",
- "torch version: 2.4.0\n"
+ "torch version: 2.5.0.dev20240812+cu121\n"
 ]
@@ -349,7 +349,6 @@
 " self.out_head = nn.Linear(cfg[\"emb_dim\"], cfg[\"vocab_size\"], bias=False, dtype=cfg[\"dtype\"])\n",
 "\n",
 " def forward(self, in_idx):\n",
- " batch_size, seq_len = in_idx.shape\n",
 " tok_embeds = self.tok_emb(in_idx)\n",
 " x = tok_embeds\n",
 " x = self.trf_blocks(x)\n",
@@ -390,7 +389,7 @@
 " \"emb_dim\": 2048, # Embedding dimension\n",
 " \"n_heads\": 32, # Number of attention heads\n",
 " \"n_layers\": 16, # Number of layers\n",
- " \"hidden_dim\": 8192, # Size of the intermediate dimension in FeedForward\n",
+ " \"hidden_dim\": 8192, # Size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
 " \"rope_base\": 50_000, # The base in RoPE's \"theta\"\n",
 " \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
@@ -410,7 +409,7 @@
 "# \"emb_dim\": 3072, # Embedding dimension\n",
 "# \"n_heads\": 24, # Number of attention heads\n",
 "# \"n_layers\": 28, # Number of layers\n",
- "# \"hidden_dim\": 8192, # Size of the intermediate dimension in FeedForward\n",
+ "# \"hidden_dim\": 8192, # Size of the intermediate dimension in FeedForward\n",
 "# \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
 "# \"rope_base\": 50_000, # The base in RoPE's \"theta\"\n",
 "# \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
@@ -619,7 +618,7 @@
 "id": "b771b60c-c198-4b30-bf10-42031197ae86",
 "metadata": {},
 "source": [
- "- Please note that Meta AI requires that you accept the Llama 3,2 licensing terms before you can download the files; to do this, you have to create a Hugging Face Hub account and visit the [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) repository to accept the terms\n",
+ "- Please note that Meta AI requires that you accept the Llama 3.2 licensing terms before you can download the files; to do this, you have to create a Hugging Face Hub account and visit the [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) repository to accept the terms\n",
 "- Next, you will need to create an access token; to generate an access token with READ permissions, click on the profile picture in the upper right and click on \"Settings\"\n",
 "\n",
 "\n",
@@ -632,10 +631,25 @@
 },
 {
 "cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
 "id": "e9d96dc8-603a-4cb5-8c3e-4d2ca56862ed",
 "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "fede18d637d24f79a27220fb83bc6d2b",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "VBox(children=(HTML(value='
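
Note on the RoPE fix above ("fixed rope formula"): the corrected equation reads theta_i = base^(-2(i-1)/d) for i = 1, ..., d/2, i.e. the exponent is negative. This is easy to sanity-check numerically. The snippet below is a minimal sketch written to illustrate the corrected equation; the function name and the toy head dimension are invented for the example and do not come from the notebooks:

import torch

def rope_inv_freq(head_dim, base):
    # theta_i = base^(-2(i-1)/d) for i = 1, ..., d/2, per the corrected equation
    i = torch.arange(1, head_dim // 2 + 1, dtype=torch.float32)
    return base ** (-2.0 * (i - 1.0) / head_dim)

print(rope_inv_freq(8, 10_000))  # Llama 2 base -> tensor([1.0000, 0.1000, 0.0100, 0.0010])
print(rope_inv_freq(8, 50_000))  # Llama 3 base -> tensor([1.0000, 0.0669, 0.0045, 0.0003])

With the larger base, the higher dimensions get lower frequencies and hence longer wavelengths (2*pi/theta_i), which is consistent with the notebook's point that the increased base goes hand in hand with the longer 8,192-token context window.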
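
Likewise, the grouped-query attention referenced throughout the configs in this patch ("n_heads": 32 with "n_kv_groups": 8) comes down to storing one key/value head per group and sharing it across n_heads // n_kv_groups query heads. The following shape-level sketch uses made-up toy dimensions and is not the attention module implemented in the notebooks:

import torch

b, num_tokens, head_dim = 1, 6, 64
n_heads, n_kv_groups = 32, 8
group_size = n_heads // n_kv_groups  # 4 query heads share each K/V head

queries = torch.randn(b, n_heads, num_tokens, head_dim)
keys = torch.randn(b, n_kv_groups, num_tokens, head_dim)    # only 8 key heads stored
values = torch.randn(b, n_kv_groups, num_tokens, head_dim)  # only 8 value heads stored

# Expand keys/values so every query head has a matching head for the matmul
keys = keys.repeat_interleave(group_size, dim=1)      # -> (1, 32, 6, 64)
values = values.repeat_interleave(group_size, dim=1)  # -> (1, 32, 6, 64)

attn_scores = queries @ keys.transpose(2, 3)
print(attn_scores.shape)  # torch.Size([1, 32, 6, 6])

The saving is in the K/V projection parameters and the runtime K/V tensors (8 heads instead of 32); the attention computation itself is unchanged.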