From a6d8e93da352eeddef820188e433c5cfd9e3d8c8 Mon Sep 17 00:00:00 2001
From: rasbt
Date: Tue, 24 Sep 2024 18:49:17 -0500
Subject: [PATCH] improve formatting

---
 ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb b/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb
index 489dad70..5cb493d9 100644
--- a/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb
+++ b/ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb
@@ -39,10 +39,10 @@
     "id": "aFmxTQbwCUMl"
    },
    "source": [
-    "- In this notebook, we convert the original GPT and GPT-2 architecture into a Llama 2 model step by step\n",
+    "- In this notebook, we convert the original GPT architecture into a Llama 2 model step by step (note that GPT and GPT-2 share the same architecture)\n",
     "- Why not Llama 1 or Llama 3?\n",
-    " - The Llama 1 architecture is similar to Llama 2, except that Llama 2 has a larger context window (which is nice); the Llama 1 weights are not readily available and have more usage restrictions, so it makes more sense to focus on Llama 2\n",
-    " - Regarding Llama 3, I will share a separate notebook to convert Llama 2 to Llama 3 (there are only a few small additional changes)\n",
+    " - The Llama 1 architecture is similar to Llama 2, except that Llama 2 has a larger context window (which is nice); the Llama 1 weights are not readily available and have more usage restrictions, so it makes more sense to focus on Llama 2\n",
+    " - Regarding Llama 3, I will share a separate notebook to convert Llama 2 to Llama 3 (there are only a few small additional changes)\n",
     "- The explanations are purposefully kept minimal in this notebook not to bloat it unnecessarily and focus on the main code\n",
     "- For more information, please see the Llama 2 paper: [Llama 2: Open Foundation and Fine-Tuned Chat Models (2023)](https://arxiv.org/abs/2307.09288)"
    ]
@@ -143,7 +143,7 @@
     "- LayerNorm normalizes inputs using mean and variance, while RMSNorm uses only the root mean square, which improves computational efficiency\n",
     "- The RMSNorm operation is as follows, where $x$ is the input $\gamma$ is a trainable parameter (vector), and $\epsilon$ is a small constant to avoid zero-division errors:\n",
     "\n",
-    "$$y = \frac{x}{\sqrt{\text{RMS}[x^2]} + \epsilon} * \gamma$$\n",
+    "$$y_i = \frac{x_i}{\text{RMS}(x)} \gamma_i, \quad \text{where} \quad \text{RMS}(x) = \sqrt{\epsilon + \frac{1}{n} \sum x_i^2}$$\n",
     "\n",
     "- For more details, please see the paper [Root Mean Square Layer Normalization (2019)](https://arxiv.org/abs/1910.07467)"
    ]
@@ -1565,7 +1565,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.11"
+   "version": "3.11.4"
  }
 },
 "nbformat": 4,
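
For readers skimming the patch, here is a minimal PyTorch sketch of the RMSNorm operation as described by the corrected formula in the second hunk; the class layout and names (`RMSNorm`, `emb_dim`, `eps=1e-5`) are illustrative assumptions and not taken from the notebook's actual implementation:

```python
import torch
import torch.nn as nn


class RMSNorm(nn.Module):
    """Minimal sketch of RMSNorm following the updated formula above."""

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        # gamma in the formula: a trainable per-dimension scale vector
        self.weight = nn.Parameter(torch.ones(emb_dim))

    def forward(self, x):
        # RMS(x) = sqrt(eps + mean(x_i^2)), computed over the embedding dimension
        rms = torch.sqrt(self.eps + x.pow(2).mean(dim=-1, keepdim=True))
        return (x / rms) * self.weight


# quick sanity check on a dummy batch
x = torch.randn(2, 4, 8)
print(RMSNorm(emb_dim=8)(x).shape)  # torch.Size([2, 4, 8])
```

Unlike LayerNorm, no mean is subtracted and no bias is added, which is why the operation is computationally cheaper.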