|
4 | 4 | "cell_type": "markdown",
|
5 | 5 | "metadata": {},
|
6 | 6 | "source": [
|
7 |
| - "# 🔎🔢 Tokenizers (in Deep Learning)" |
| 7 | + "# 🔎🔢 Hands-On Tokenizers (in Deep Learning)" |
8 | 8 | ]
|
9 | 9 | },
|
10 | 10 | {
|
|
42 | 42 | "source": [
|
43 | 43 | "A tokenizer maps `string` $\\rightleftharpoons$ `list of tokens`.\n",
|
44 | 44 | "* `encode`(\"string\") $\\mapsto$ [\"list\", \"of\", \"tokens\"]\n",
|
45 |
| - "* `decode`([\"list\", \"of\", \"tokens\"]) $\\mapsto$ \"string\"" |
| 45 | + "* `decode`([\"list\", \"of\", \"tokens\"]) $\\mapsto$ \"string\"\n", |
| 46 | + "\n", |
| 47 | + "In deep learning, a tokenizer is a <u>pre-processing</u> and/or <u>post-processing</u> brick for an artificial neural network that process and/or generates text." |
46 | 48 | ]
|
47 | 49 | },
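The markdown cell above states the encode/decode contract; a minimal sketch of that contract (a toy whitespace tokenizer with hypothetical names, not the notebook's own classes):

```python
# Toy illustration of the string <-> tokens mapping described above.
def encode(string: str) -> list:
    return string.split(" ")

def decode(tokens: list) -> str:
    return " ".join(tokens)

assert encode("list of tokens") == ["list", "of", "tokens"]
assert decode(["list", "of", "tokens"]) == "list of tokens"  # round-trip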
|
48 | 50 | {
|
|
297 | 299 | },
|
298 | 300 | {
|
299 | 301 | "cell_type": "code",
|
300 |
| - "execution_count": 2, |
| 302 | + "execution_count": 19, |
301 | 303 | "metadata": {},
|
302 | 304 | "outputs": [
|
303 | 305 | {
|
304 | 306 | "name": "stdout",
|
305 | 307 | "output_type": "stream",
|
306 | 308 | "text": [
|
307 | 309 | "== CharSplitter ==\n",
|
308 |
| - "-> ['M', 'a', 'i', 's', ',', ' ', 'm', 'a', 'i', 's', '…', ' ', 'v', 'a', 's', ' ', 't', \"'\", 'e', 'n', ' ', 'l', 'à', '-', 'b', 'a', 's', ' ', '!']\n", |
309 |
| - "-> Mais, mais… vas t'en là-bas !\n", |
| 310 | + "➡️ ['M', 'a', 'i', 's', ',', ' ', 'm', 'a', 'i', 's', '…', ' ', 'v', 'a', 's', ' ', 't', \"'\", 'e', 'n', ' ', 'l', 'à', '-', 'b', 'a', 's', ' ', '!']\n", |
| 311 | + "➡️ Mais, mais… vas t'en là-bas !\n", |
310 | 312 | "\n",
|
311 | 313 | "== WordSplitter ==\n",
|
312 |
| - "-> ['Mais,', '▁mais…', '▁vas', \"▁t'en\", '▁là-bas', '▁!']\n", |
313 |
| - "-> Mais, mais… vas t'en là-bas !\n", |
| 314 | + "➡️ ['Mais,', '▁mais…', '▁vas', \"▁t'en\", '▁là-bas', '▁!']\n", |
| 315 | + "➡️ Mais, mais… vas t'en là-bas !\n", |
314 | 316 | "\n"
|
315 | 317 | ]
|
316 | 318 | }
|
|
335 | 337 | " def join(self, tokens: list) -> str:\n",
|
336 | 338 | " return \"\".join(tokens).replace(self._SPACE, \" \")\n",
|
337 | 339 | "\n",
|
338 |
| - "\n", |
339 | 340 | "input = \"Mais, mais… vas t'en là-bas !\"\n",
|
340 | 341 | "\n",
|
341 |
| - "for tokenizer in [\n", |
342 |
| - " CharSplitter(),\n", |
343 |
| - " WordSplitter(),\n", |
344 |
| - " ]:\n", |
| 342 | + "for tokenizer in [CharSplitter(), WordSplitter()]:\n", |
345 | 343 | "\n",
|
346 | 344 | " encoded = tokenizer.split(input)\n",
|
347 | 345 | "\n",
|
348 | 346 | " # Round-trip test\n",
|
349 | 347 | " encoded_decoded = tokenizer.join(encoded)\n",
|
350 | 348 | " assert encoded_decoded == input\n",
|
351 | 349 | " \n",
|
352 |
| - " print(f\"== {tokenizer.__class__.__name__} ==\\n-> {encoded}\\n-> {encoded_decoded}\\n\")" |
| 350 | + " print(f\"== {tokenizer.__class__.__name__} ==\\n➡️ {encoded}\\n➡️ {encoded_decoded}\\n\")" |
353 | 351 | ]
|
354 | 352 | },
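The hunk above shows only `join()`; here is one plausible reconstruction of the `split()` it inverts, assuming `_SPACE = "▁"` (the word-boundary marker visible in the WordSplitter output):

```python
class WordSplitter:
    _SPACE = "\u2581"  # "▁", marks a token that followed a space

    def split(self, text: str) -> list:
        # Prefix every word after a space with ▁ so that join() can undo it.
        return text.replace(" ", " " + self._SPACE).split(" ")

    def join(self, tokens: list) -> str:
        return "".join(tokens).replace(self._SPACE, " ")

assert WordSplitter().split("vas t'en") == ["vas", "▁t'en"]
```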
|
355 | 353 | {
|
|
363 | 361 | },
|
364 | 362 | {
|
365 | 363 | "cell_type": "code",
|
366 |
| - "execution_count": 209, |
| 364 | + "execution_count": 17, |
367 | 365 | "metadata": {},
|
368 | 366 | "outputs": [],
|
369 | 367 | "source": [
|
|
395 | 393 | },
|
396 | 394 | {
|
397 | 395 | "cell_type": "code",
|
398 |
| - "execution_count": 16, |
| 396 | + "execution_count": 18, |
399 | 397 | "metadata": {},
|
400 | 398 | "outputs": [
|
401 | 399 | {
|
402 | 400 | "name": "stdout",
|
403 | 401 | "output_type": "stream",
|
404 | 402 | "text": [
|
405 | 403 | "Mais, mais… vas t'en là-bas !\n",
|
406 |
| - "-> [80, 100, 108, 118, 47, 35, 112, 100, 108, 118, 0, 35, 121, 100, 118, 35, 119, 42, 104, 113, 35, 111, 0, 48, 101, 100, 118, 35, 36]\n", |
407 |
| - "-> ['M', 'a', 'i', 's', ',', ' ', 'm', 'a', 'i', 's', '<unk>', ' ', 'v', 'a', 's', ' ', 't', \"'\", 'e', 'n', ' ', 'l', '<unk>', '-', 'b', 'a', 's', ' ', '!']\n", |
408 |
| - "-> Mais, mais<unk> vas t'en l<unk>-bas !\n" |
| 404 | + "➡️ [80, 100, 108, 118, 47, 35, 112, 100, 108, 118, 0, 35, 121, 100, 118, 35, 119, 42, 104, 113, 35, 111, 0, 48, 101, 100, 118, 35, 36]\n", |
| 405 | + "➡️ ['M', 'a', 'i', 's', ',', ' ', 'm', 'a', 'i', 's', '<unk>', ' ', 'v', 'a', 's', ' ', 't', \"'\", 'e', 'n', ' ', 'l', '<unk>', '-', 'b', 'a', 's', ' ', '!']\n", |
| 406 | + "➡️ Mais, mais<unk> vas t'en l<unk>-bas !\n" |
409 | 407 | ]
|
410 | 408 | }
|
411 | 409 | ],
|
|
416 | 414 | "encoded = tokenizer.encode(input)\n",
|
417 | 415 | "encoded_str = tokenizer.encode_str(input)\n",
|
418 | 416 | "encoded_decoded = tokenizer.join(encoded_str)\n",
|
419 |
| - "print(f\"{input}\\n-> {encoded}\\n-> {encoded_str}\\n-> {encoded_decoded}\")" |
| 417 | + "print(f\"{input}\\n➡️ {encoded}\\n➡️ {encoded_str}\\n➡️ {encoded_decoded}\")" |
420 | 418 | ]
|
421 | 419 | },
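The integer IDs in this output are consistent with a printable-ASCII vocabulary offset by three reserved special-token slots, with `<unk>` at index 0 (e.g. `' '` = 32 → 35, `'M'` = 77 → 80, and `'…'`/`'à'` → 0). A hedged sketch of such an encoder; the two specials beyond `<unk>` are assumptions:

```python
# Assumed vocabulary layout: 3 special tokens, then all ASCII characters.
VOCAB = ["<unk>", "<bos>", "<eos>"] + [chr(i) for i in range(128)]
CHAR_TO_ID = {c: i for i, c in enumerate(VOCAB)}

def encode(text: str) -> list:
    return [CHAR_TO_ID.get(c, 0) for c in text]   # out-of-vocabulary -> 0

def encode_str(text: str) -> list:
    return [VOCAB[i] for i in encode(text)]       # 0 renders as "<unk>"

assert encode("M") == [80] and encode("…") == [0]
```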
|
422 | 420 | {
|
|
863 | 861 | },
|
864 | 862 | {
|
865 | 863 | "cell_type": "code",
|
866 |
| - "execution_count": 127, |
| 864 | + "execution_count": null, |
867 | 865 | "metadata": {},
|
868 | 866 | "outputs": [],
|
869 | 867 | "source": [
|
|
1031 | 1029 | "\n",
|
1032 | 1030 | " return pd.DataFrame(combined_results)\n",
|
1033 | 1031 | "\n",
|
1034 |
| - "\n", |
1035 | 1032 | "# Cache benchmark results (that are long to compute)\n",
|
1036 | 1033 | "_file_to_cache_results = \"expes/tokenizers_fertilities.json\"\n",
|
1037 | 1034 | "if \"fertilities\" not in globals():\n",
|
|
1040 | 1037 | " with open(_file_to_cache_results, \"r\") as f:\n",
|
1041 | 1038 | " fertilities = json.load(f)\n",
|
1042 | 1039 | "\n",
|
1043 |
| - "\n", |
1044 | 1040 | "def benchmark_fertility(\n",
|
1045 | 1041 | " dataset_configs, # = [\n",
|
1046 | 1042 | " # (\"wikimedia/wikipedia\", \"20231101.\" + lan)\n",
|
|
1168 | 1164 | },
|
1169 | 1165 | {
|
1170 | 1166 | "cell_type": "code",
|
1171 |
| - "execution_count": 189, |
| 1167 | + "execution_count": null, |
1172 | 1168 | "metadata": {},
|
1173 | 1169 | "outputs": [
|
1174 | 1170 | {
|
|
1225 | 1221 | "</style>\n",
|
1226 | 1222 | "<table border=\"1\" class=\"dataframe\">\n",
|
1227 | 1223 | " <thead>\n",
|
1228 |
| - " <tr style=\"text-align: left;\">\n", |
| 1224 | + " <tr style=\"text-align: right;\">\n", |
1229 | 1225 | " <th></th>\n",
|
1230 | 1226 | " <th>Original</th>\n",
|
1231 | 1227 | " <th>Display</th>\n",
|
|
1256 | 1252 | "</div>"
|
1257 | 1253 | ],
|
1258 | 1254 | "text/plain": [
|
1259 |
| - " Original Display \\\n", |
| 1255 | + " Original Display \\\n", |
1260 | 1256 | "0 مرحباً Jean-Pierre، كيف حالك؟ ، كيف حالك؟Jean-Pierre مرحباً \n",
|
1261 | 1257 | "1 Jean-Pierre، كيف حالك؟ ، كيف حالك؟Jean-Pierre \n",
|
1262 | 1258 | "2 مرحباً Jean-Pierre Jean-Pierre مرحباً \n",
|
1263 | 1259 | "\n",
|
1264 |
| - " Display of tokens \n", |
| 1260 | + " Display of tokens \n", |
1265 | 1261 | "0 ┃،┃كيف┃حالك┃؟Jean-Pierre┃مرحباً \n",
|
1266 | 1262 | "1 ┃،┃كيف┃حالك┃؟Jean-Pierre┃مرحباً \n",
|
1267 | 1263 | "2 ┃،┃كيف┃حالك┃؟Jean-Pierre┃مرحباً "
|
1268 | 1264 | ]
|
1269 | 1265 | },
|
1270 |
| - "execution_count": 189, |
| 1266 | + "execution_count": 14, |
1271 | 1267 | "metadata": {},
|
1272 | 1268 | "output_type": "execute_result"
|
1273 | 1269 | }
|
1274 | 1270 | ],
|
1275 | 1271 | "source": [
|
1276 | 1272 | "import re\n",
|
1277 | 1273 | "\n",
|
| 1274 | + "# from bidi.algorithm import get_display # Did not find a good thing in python-bidi (?)\n", |
| 1275 | + "\n", |
1278 | 1276 | "# Unicode characters for Right-to-Left Mark (RLM) and Left-to-Right Mark (LRM)\n",
|
1279 | 1277 | "_RLM = '\\u200F'\n",
|
1280 | 1278 | "_LRM = '\\u200E'\n",
|
|
1343 | 1341 | "\n",
|
1344 | 1342 | " return text\n",
|
1345 | 1343 | "\n",
|
1346 |
| - "# from bidi.algorithm import get_display # Did not find a good thing in python-bidi (?)\n", |
1347 |
| - "\n", |
1348 |
| - "\n", |
1349 | 1344 | "def is_separator(char):\n",
|
1350 | 1345 | " return char in \"┃\"\n",
|
1351 | 1346 | "\n",
|
|
1441 | 1436 | "\n",
|
1442 | 1437 | " return \"\".join(chunks_by_language)\n",
|
1443 | 1438 | "\n",
|
1444 |
| - "\n", |
1445 |
| - "if \"TEST\":\n", |
| 1439 | + "def test_arabic_codeswitching_display():\n", |
1446 | 1440 | " title = \"☪ Test: Fix of display for text with Arabic and code-switching ☪\"\n",
|
1447 | 1441 | " print(f\"{title}\\n\" + \"-\"*(len(title)+2))\n",
|
1448 | 1442 | "\n",
|
|
1491 | 1485 | " print(\"\\n😎 String for display (from right to left 👈🏽)\")\n",
|
1492 | 1486 | " print(display_input)\n",
|
1493 | 1487 | "\n",
|
1494 |
| - "pd.DataFrame({\n", |
1495 |
| - " \"Original\": inputs,\n", |
1496 |
| - " \"Display\": inputs_for_display,\n", |
1497 |
| - " \"Display of tokens\": [normalize_for_display(words, is_token=True) for input in inputs],\n", |
1498 |
| - "})" |
1499 |
| - ] |
1500 |
| - }, |
1501 |
| - { |
1502 |
| - "cell_type": "code", |
1503 |
| - "execution_count": 103, |
1504 |
| - "metadata": {}, |
1505 |
| - "outputs": [ |
1506 |
| - { |
1507 |
| - "data": { |
1508 |
| - "text/html": [ |
1509 |
| - "<div>\n", |
1510 |
| - "<style scoped>\n", |
1511 |
| - " .dataframe tbody tr th:only-of-type {\n", |
1512 |
| - " vertical-align: middle;\n", |
1513 |
| - " }\n", |
1514 |
| - "\n", |
1515 |
| - " .dataframe tbody tr th {\n", |
1516 |
| - " vertical-align: top;\n", |
1517 |
| - " }\n", |
1518 |
| - "\n", |
1519 |
| - " .dataframe thead th {\n", |
1520 |
| - " text-align: right;\n", |
1521 |
| - " }\n", |
1522 |
| - "</style>\n", |
1523 |
| - "<table border=\"1\" class=\"dataframe\">\n", |
1524 |
| - " <thead>\n", |
1525 |
| - " <tr style=\"text-align: left;\">\n", |
1526 |
| - " <th></th>\n", |
1527 |
| - " <th>Original</th>\n", |
1528 |
| - " <th>Display</th>\n", |
1529 |
| - " <th>Internal of display</th>\n", |
1530 |
| - " </tr>\n", |
1531 |
| - " </thead>\n", |
1532 |
| - " <tbody>\n", |
1533 |
| - " <tr>\n", |
1534 |
| - " <th>0</th>\n", |
1535 |
| - " <td>مرحباً Jean-Pierre، كيف حالك؟</td>\n", |
1536 |
| - " <td>، كيف حالك؟Jean-Pierre مرحباً</td>\n", |
1537 |
| - " <td>┃▁مرحباً<LRM>Jean-Pierre┃▁┃<RLM>حالك؟┃┃▁كيف┃┃▁،<RLM></td>\n", |
1538 |
| - " </tr>\n", |
1539 |
| - " <tr>\n", |
1540 |
| - " <th>1</th>\n", |
1541 |
| - " <td>Jean-Pierre، كيف حالك؟</td>\n", |
1542 |
| - " <td>، كيف حالك؟Jean-Pierre</td>\n", |
1543 |
| - " <td><LRM>Jean-Pierreحالك؟┃┃▁كيف┃┃▁،<RLM></td>\n", |
1544 |
| - " </tr>\n", |
1545 |
| - " <tr>\n", |
1546 |
| - " <th>2</th>\n", |
1547 |
| - " <td>مرحباً Jean-Pierre</td>\n", |
1548 |
| - " <td>Jean-Pierre مرحباً</td>\n", |
1549 |
| - " <td>┃▁مرحباً<LRM>Jean-Pierre┃▁┃<RLM></td>\n", |
1550 |
| - " </tr>\n", |
1551 |
| - " </tbody>\n", |
1552 |
| - "</table>\n", |
1553 |
| - "</div>" |
1554 |
| - ], |
1555 |
| - "text/plain": [ |
1556 |
| - " Original Display \\\n", |
1557 |
| - "0 مرحباً Jean-Pierre، كيف حالك؟ ، كيف حالك؟Jean-Pierre مرحباً \n", |
1558 |
| - "1 Jean-Pierre، كيف حالك؟ ، كيف حالك؟Jean-Pierre \n", |
1559 |
| - "2 مرحباً Jean-Pierre Jean-Pierre مرحباً \n", |
1560 |
| - "\n", |
1561 |
| - " Internal of display \n", |
1562 |
| - "0 ┃▁مرحباً<LRM>Jean-Pierre┃▁┃<RLM>حالك؟┃┃▁كيف┃┃▁،<RLM> \n", |
1563 |
| - "1 <LRM>Jean-Pierreحالك؟┃┃▁كيف┃┃▁،<RLM> \n", |
1564 |
| - "2 ┃▁مرحباً<LRM>Jean-Pierre┃▁┃<RLM> " |
1565 |
| - ] |
1566 |
| - }, |
1567 |
| - "execution_count": 103, |
1568 |
| - "metadata": {}, |
1569 |
| - "output_type": "execute_result" |
1570 |
| - } |
1571 |
| - ], |
1572 |
| - "source": [ |
1573 |
| - "pd.DataFrame({\n", |
1574 |
| - " \"Original\": inputs,\n", |
1575 |
| - " \"Display\": inputs_for_display,\n", |
1576 |
| - " \"Internal of display\": [normalize_for_display([w for w in re.split(r\"( |\\<\\w+\\>)\", input) if w], is_token=True) for input in inputs_for_display],\n", |
1577 |
| - "})" |
| 1488 | + " return pd.DataFrame({\n", |
| 1489 | + " \"Original\": inputs,\n", |
| 1490 | + " \"Display\": inputs_for_display,\n", |
| 1491 | + " \"Display of tokens\": [normalize_for_display(words, is_token=True) for input in inputs],\n", |
| 1492 | + " })\n", |
| 1493 | + "\n", |
| 1494 | + "test_arabic_codeswitching_display()" |
1578 | 1495 | ]
|
1579 | 1496 | },
|
1580 | 1497 | {
|
|