|
4 | 4 | "cell_type": "markdown",
|
5 | 5 | "metadata": {},
|
6 | 6 | "source": [
|
7 |
| - "# 🔎🔢 Tokenizers (in Deep Learning)" |
| 7 | + "# 🔎🔢 Hands-On Tokenizers (in Deep Learning)" |
8 | 8 | ]
|
9 | 9 | },
|
10 | 10 | {
|
|
42 | 42 | "source": [
|
43 | 43 | "A tokenizer maps `string` $\\rightleftharpoons$ `list of tokens`.\n",
|
44 | 44 | "* `encode`(\"string\") $\\mapsto$ [\"list\", \"of\", \"tokens\"]\n",
|
45 |
| - "* `decode`([\"list\", \"of\", \"tokens\"]) $\\mapsto$ \"string\"" |
| 45 | + "* `decode`([\"list\", \"of\", \"tokens\"]) $\\mapsto$ \"string\"\n", |
| 46 | + "\n", |
| 47 | + "In deep learning, a tokenizer is a <u>pre-processing</u> and/or <u>post-processing</u> brick for an artificial neural network that process and/or generates text." |
46 | 48 | ]
|
47 | 49 | },
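The markdown cell above states the encode/decode contract; a minimal sketch of that contract (a toy whitespace tokenizer with hypothetical names, not the notebook's own classes):

```python
# Toy illustration of the string <-> tokens mapping described above.
def encode(string: str) -> list:
    return string.split(" ")

def decode(tokens: list) -> str:
    return " ".join(tokens)

assert encode("list of tokens") == ["list", "of", "tokens"]
assert decode(["list", "of", "tokens"]) == "list of tokens"  # round-trip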
|
48 | 50 | {
|
|
297 | 299 | },
|
298 | 300 | {
|
299 | 301 | "cell_type": "code",
|
300 |
| - "execution_count": 2, |
| 302 | + "execution_count": 19, |
301 | 303 | "metadata": {},
|
302 | 304 | "outputs": [
|
303 | 305 | {
|
304 | 306 | "name": "stdout",
|
305 | 307 | "output_type": "stream",
|
306 | 308 | "text": [
|
307 | 309 | "== CharSplitter ==\n",
|
308 |
| - "-> ['M', 'a', 'i', 's', ',', ' ', 'm', 'a', 'i', 's', '…', ' ', 'v', 'a', 's', ' ', 't', \"'\", 'e', 'n', ' ', 'l', 'à', '-', 'b', 'a', 's', ' ', '!']\n", |
309 |
| - "-> Mais, mais… vas t'en là-bas !\n", |
| 310 | + "➡️ ['M', 'a', 'i', 's', ',', ' ', 'm', 'a', 'i', 's', '…', ' ', 'v', 'a', 's', ' ', 't', \"'\", 'e', 'n', ' ', 'l', 'à', '-', 'b', 'a', 's', ' ', '!']\n", |
| 311 | + "➡️ Mais, mais… vas t'en là-bas !\n", |
310 | 312 | "\n",
|
311 | 313 | "== WordSplitter ==\n",
|
312 |
| - "-> ['Mais,', '▁mais…', '▁vas', \"▁t'en\", '▁là-bas', '▁!']\n", |
313 |
| - "-> Mais, mais… vas t'en là-bas !\n", |
| 314 | + "➡️ ['Mais,', '▁mais…', '▁vas', \"▁t'en\", '▁là-bas', '▁!']\n", |
| 315 | + "➡️ Mais, mais… vas t'en là-bas !\n", |
314 | 316 | "\n"
|
315 | 317 | ]
|
316 | 318 | }
|
|
335 | 337 | " def join(self, tokens: list) -> str:\n",
|
336 | 338 | " return \"\".join(tokens).replace(self._SPACE, \" \")\n",
|
337 | 339 | "\n",
|
338 |
| - "\n", |
339 | 340 | "input = \"Mais, mais… vas t'en là-bas !\"\n",
|
340 | 341 | "\n",
|
341 |
| - "for tokenizer in [\n", |
342 |
| - " CharSplitter(),\n", |
343 |
| - " WordSplitter(),\n", |
344 |
| - " ]:\n", |
| 342 | + "for tokenizer in [CharSplitter(), WordSplitter()]:\n", |
345 | 343 | "\n",
|
346 | 344 | " encoded = tokenizer.split(input)\n",
|
347 | 345 | "\n",
|
348 | 346 | " # Round-trip test\n",
|
349 | 347 | " encoded_decoded = tokenizer.join(encoded)\n",
|
350 | 348 | " assert encoded_decoded == input\n",
|
351 | 349 | " \n",
|
352 |
| - " print(f\"== {tokenizer.__class__.__name__} ==\\n-> {encoded}\\n-> {encoded_decoded}\\n\")" |
| 350 | + " print(f\"== {tokenizer.__class__.__name__} ==\\n➡️ {encoded}\\n➡️ {encoded_decoded}\\n\")" |
353 | 351 | ]
|
354 | 352 | },
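The hunk above shows only `join()`; here is one plausible reconstruction of the `split()` it inverts, assuming `_SPACE = "▁"` (the word-boundary marker visible in the WordSplitter output):

```python
class WordSplitter:
    _SPACE = "\u2581"  # "▁", marks a token that followed a space

    def split(self, text: str) -> list:
        # Prefix every word after a space with ▁ so that join() can undo it.
        return text.replace(" ", " " + self._SPACE).split(" ")

    def join(self, tokens: list) -> str:
        return "".join(tokens).replace(self._SPACE, " ")

assert WordSplitter().split("vas t'en") == ["vas", "▁t'en"]
```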
|
355 | 353 | {
|
|
363 | 361 | },
|
364 | 362 | {
|
365 | 363 | "cell_type": "code",
|
366 |
| - "execution_count": 209, |
| 364 | + "execution_count": 17, |
367 | 365 | "metadata": {},
|
368 | 366 | "outputs": [],
|
369 | 367 | "source": [
|
|
395 | 393 | },
|
396 | 394 | {
|
397 | 395 | "cell_type": "code",
|
398 |
| - "execution_count": 16, |
| 396 | + "execution_count": 18, |
399 | 397 | "metadata": {},
|
400 | 398 | "outputs": [
|
401 | 399 | {
|
402 | 400 | "name": "stdout",
|
403 | 401 | "output_type": "stream",
|
404 | 402 | "text": [
|
405 | 403 | "Mais, mais… vas t'en là-bas !\n",
|
406 |
| - "-> [80, 100, 108, 118, 47, 35, 112, 100, 108, 118, 0, 35, 121, 100, 118, 35, 119, 42, 104, 113, 35, 111, 0, 48, 101, 100, 118, 35, 36]\n", |
407 |
| - "-> ['M', 'a', 'i', 's', ',', ' ', 'm', 'a', 'i', 's', '<unk>', ' ', 'v', 'a', 's', ' ', 't', \"'\", 'e', 'n', ' ', 'l', '<unk>', '-', 'b', 'a', 's', ' ', '!']\n", |
408 |
| - "-> Mais, mais<unk> vas t'en l<unk>-bas !\n" |
| 404 | + "➡️ [80, 100, 108, 118, 47, 35, 112, 100, 108, 118, 0, 35, 121, 100, 118, 35, 119, 42, 104, 113, 35, 111, 0, 48, 101, 100, 118, 35, 36]\n", |
| 405 | + "➡️ ['M', 'a', 'i', 's', ',', ' ', 'm', 'a', 'i', 's', '<unk>', ' ', 'v', 'a', 's', ' ', 't', \"'\", 'e', 'n', ' ', 'l', '<unk>', '-', 'b', 'a', 's', ' ', '!']\n", |
| 406 | + "➡️ Mais, mais<unk> vas t'en l<unk>-bas !\n" |
409 | 407 | ]
|
410 | 408 | }
|
411 | 409 | ],
|
|
416 | 414 | "encoded = tokenizer.encode(input)\n",
|
417 | 415 | "encoded_str = tokenizer.encode_str(input)\n",
|
418 | 416 | "encoded_decoded = tokenizer.join(encoded_str)\n",
|
419 |
| - "print(f\"{input}\\n-> {encoded}\\n-> {encoded_str}\\n-> {encoded_decoded}\")" |
| 417 | + "print(f\"{input}\\n➡️ {encoded}\\n➡️ {encoded_str}\\n➡️ {encoded_decoded}\")" |
420 | 418 | ]
|
421 | 419 | },
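The integer IDs in this output are consistent with a printable-ASCII vocabulary offset by three reserved special-token slots, with `<unk>` at index 0 (e.g. `' '` = 32 → 35, `'M'` = 77 → 80, and `'…'`/`'à'` → 0). A hedged sketch of such an encoder; the two specials beyond `<unk>` are assumptions:

```python
# Assumed vocabulary layout: 3 special tokens, then all ASCII characters.
VOCAB = ["<unk>", "<bos>", "<eos>"] + [chr(i) for i in range(128)]
CHAR_TO_ID = {c: i for i, c in enumerate(VOCAB)}

def encode(text: str) -> list:
    return [CHAR_TO_ID.get(c, 0) for c in text]   # out-of-vocabulary -> 0

def encode_str(text: str) -> list:
    return [VOCAB[i] for i in encode(text)]       # 0 renders as "<unk>"

assert encode("M") == [80] and encode("…") == [0]
```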
|
422 | 420 | {
|
|
863 | 861 | },
|
864 | 862 | {
|
865 | 863 | "cell_type": "code",
|
866 |
| - "execution_count": 127, |
| 864 | + "execution_count": null, |
867 | 865 | "metadata": {},
|
868 | 866 | "outputs": [],
|
869 | 867 | "source": [
|
|
1031 | 1029 | "\n",
|
1032 | 1030 | " return pd.DataFrame(combined_results)\n",
|
1033 | 1031 | "\n",
|
1034 |
| - "\n", |
1035 | 1032 | "# Cache benchmark results (that are long to compute)\n",
|
1036 | 1033 | "_file_to_cache_results = \"expes/tokenizers_fertilities.json\"\n",
|
1037 | 1034 | "if \"fertilities\" not in globals():\n",
|
|
1040 | 1037 | " with open(_file_to_cache_results, \"r\") as f:\n",
|
1041 | 1038 | " fertilities = json.load(f)\n",
|
1042 | 1039 | "\n",
|
1043 |
| - "\n", |
1044 | 1040 | "def benchmark_fertility(\n",
|
1045 | 1041 | " dataset_configs, # = [\n",
|
1046 | 1042 | " # (\"wikimedia/wikipedia\", \"20231101.\" + lan)\n",
|
|
1168 | 1164 | },
|
1169 | 1165 | {
|
1170 | 1166 | "cell_type": "code",
|
1171 |
| - "execution_count": 189, |
| 1167 | + "execution_count": null, |
1172 | 1168 | "metadata": {},
|
1173 | 1169 | "outputs": [
|
1174 | 1170 | {
|
|
1225 | 1221 | "</style>\n",
|
1226 | 1222 | "<table border=\"1\" class=\"dataframe\">\n",
|
1227 | 1223 | " <thead>\n",
|
1228 |
| - " <tr style=\"text-align: left;\">\n", |
| 1224 | + " <tr style=\"text-align: right;\">\n", |
1229 | 1225 | " <th></th>\n",
|
1230 | 1226 | " <th>Original</th>\n",
|
1231 | 1227 | " <th>Display</th>\n",
|
|
1256 | 1252 | "</div>"
|
1257 | 1253 | ],
|
1258 | 1254 | "text/plain": [
|
1259 |
| - " Original Display \\\n", |
| 1255 | + " Original Display \\\n", |
1260 | 1256 | "0 مرحباً Jean-Pierre، كيف حالك؟ ، كيف حالك؟Jean-Pierre مرحباً \n",
|
1261 | 1257 | "1 Jean-Pierre، كيف حالك؟ ، كيف حالك؟Jean-Pierre \n",
|
1262 | 1258 | "2 مرحباً Jean-Pierre Jean-Pierre مرحباً \n",
|
1263 | 1259 | "\n",
|
1264 |
| - " Display of tokens \n", |
| 1260 | + " Display of tokens \n", |
1265 | 1261 | "0 ┃،┃كيف┃حالك┃؟Jean-Pierre┃مرحباً \n",
|
1266 | 1262 | "1 ┃،┃كيف┃حالك┃؟Jean-Pierre┃مرحباً \n",
|
1267 | 1263 | "2 ┃،┃كيف┃حالك┃؟Jean-Pierre┃مرحباً "
|
1268 | 1264 | ]
|
1269 | 1265 | },
|
1270 |
| - "execution_count": 189, |
| 1266 | + "execution_count": 14, |
1271 | 1267 | "metadata": {},
|
1272 | 1268 | "output_type": "execute_result"
|
1273 | 1269 | }
|
1274 | 1270 | ],
|
1275 | 1271 | "source": [
|
1276 | 1272 | "import re\n",
|
1277 | 1273 | "\n",
|
| 1274 | + "# from bidi.algorithm import get_display # Did not find a good thing in python-bidi (?)\n", |
| 1275 | + "\n", |
1278 | 1276 | "# Unicode characters for Right-to-Left Mark (RLM) and Left-to-Right Mark (LRM)\n",
|
1279 | 1277 | "_RLM = '\\u200F'\n",
|
1280 | 1278 | "_LRM = '\\u200E'\n",
|
|
1343 | 1341 | "\n",
|
1344 | 1342 | " return text\n",
|
1345 | 1343 | "\n",
|
1346 |
| - "# from bidi.algorithm import get_display # Did not find a good thing in python-bidi (?)\n", |
1347 |
| - "\n", |
1348 |
| - "\n", |
1349 | 1344 | "def is_separator(char):\n",
|
1350 | 1345 | " return char in \"┃\"\n",
|
1351 | 1346 | "\n",
|
|
1441 | 1436 | "\n",
|
1442 | 1437 | " return \"\".join(chunks_by_language)\n",
|
1443 | 1438 | "\n",
|
1444 |
| - "\n", |
1445 |
| - "if \"TEST\":\n", |
| 1439 | + "def test_arabic_codeswitching_display():\n", |
1446 | 1440 | " title = \"☪ Test: Fix of display for text with Arabic and code-switching ☪\"\n",
|
1447 | 1441 | " print(f\"{title}\\n\" + \"-\"*(len(title)+2))\n",
|
1448 | 1442 | "\n",
|
|
1491 | 1485 | " print(\"\\n😎 String for display (from right to left 👈🏽)\")\n",
|
1492 | 1486 | " print(display_input)\n",
|
1493 | 1487 | "\n",
|
1494 |
| - "pd.DataFrame({\n", |
1495 |
| - " \"Original\": inputs,\n", |
1496 |
| - " \"Display\": inputs_for_display,\n", |
1497 |
| - " \"Display of tokens\": [normalize_for_display(words, is_token=True) for input in inputs],\n", |
1498 |
| - "})" |
1499 |
| - ] |
1500 |
| - }, |
1501 |
| - { |
1502 |
| - "cell_type": "code", |
1503 |
| - "execution_count": 103, |
1504 |
| - "metadata": {}, |
1505 |
| - "outputs": [ |
1506 |
| - { |
1507 |
| - "data": { |
1508 |
| - "text/html": [ |
1509 |
| - "<div>\n", |
1510 |
| - "<style scoped>\n", |
1511 |
| - " .dataframe tbody tr th:only-of-type {\n", |
1512 |
| - " vertical-align: middle;\n", |
1513 |
| - " }\n", |
1514 |
| - "\n", |
1515 |
| - " .dataframe tbody tr th {\n", |
1516 |
| - " vertical-align: top;\n", |
1517 |
| - " }\n", |
1518 |
| - "\n", |
1519 |
| - " .dataframe thead th {\n", |
1520 |
| - " text-align: right;\n", |
1521 |
| - " }\n", |
1522 |
| - "</style>\n", |
1523 |
| - "<table border=\"1\" class=\"dataframe\">\n", |
1524 |
| - " <thead>\n", |
1525 |
| - " <tr style=\"text-align: left;\">\n", |
1526 |
| - " <th></th>\n", |
1527 |
| - " <th>Original</th>\n", |
1528 |
| - " <th>Display</th>\n", |
1529 |
| - " <th>Internal of display</th>\n", |
1530 |
| - " </tr>\n", |
1531 |
| - " </thead>\n", |
1532 |
| - " <tbody>\n", |
1533 |
| - " <tr>\n", |
1534 |
| - " <th>0</th>\n", |
1535 |
| - " <td>مرحباً Jean-Pierre، كيف حالك؟</td>\n", |
1536 |
| - " <td>، كيف حالك؟Jean-Pierre مرحباً</td>\n", |
1537 |
| - " <td>┃▁مرحباً<LRM>Jean-Pierre┃▁┃<RLM>حالك؟┃┃▁كيف┃┃▁،<RLM></td>\n", |
1538 |
| - " </tr>\n", |
1539 |
| - " <tr>\n", |
1540 |
| - " <th>1</th>\n", |
1541 |
| - " <td>Jean-Pierre، كيف حالك؟</td>\n", |
1542 |
| - " <td>، كيف حالك؟Jean-Pierre</td>\n", |
1543 |
| - " <td><LRM>Jean-Pierreحالك؟┃┃▁كيف┃┃▁،<RLM></td>\n", |
1544 |
| - " </tr>\n", |
1545 |
| - " <tr>\n", |
1546 |
| - " <th>2</th>\n", |
1547 |
| - " <td>مرحباً Jean-Pierre</td>\n", |
1548 |
| - " <td>Jean-Pierre مرحباً</td>\n", |
1549 |
| - " <td>┃▁مرحباً<LRM>Jean-Pierre┃▁┃<RLM></td>\n", |
1550 |
| - " </tr>\n", |
1551 |
| - " </tbody>\n", |
1552 |
| - "</table>\n", |
1553 |
| - "</div>" |
1554 |
| - ], |
1555 |
| - "text/plain": [ |
1556 |
| - " Original Display \\\n", |
1557 |
| - "0 مرحباً Jean-Pierre، كيف حالك؟ ، كيف حالك؟Jean-Pierre مرحباً \n", |
1558 |
| - "1 Jean-Pierre، كيف حالك؟ ، كيف حالك؟Jean-Pierre \n", |
1559 |
| - "2 مرحباً Jean-Pierre Jean-Pierre مرحباً \n", |
1560 |
| - "\n", |
1561 |
| - " Internal of display \n", |
1562 |
| - "0 ┃▁مرحباً<LRM>Jean-Pierre┃▁┃<RLM>حالك؟┃┃▁كيف┃┃▁،<RLM> \n", |
1563 |
| - "1 <LRM>Jean-Pierreحالك؟┃┃▁كيف┃┃▁،<RLM> \n", |
1564 |
| - "2 ┃▁مرحباً<LRM>Jean-Pierre┃▁┃<RLM> " |
1565 |
| - ] |
1566 |
| - }, |
1567 |
| - "execution_count": 103, |
1568 |
| - "metadata": {}, |
1569 |
| - "output_type": "execute_result" |
1570 |
| - } |
1571 |
| - ], |
1572 |
| - "source": [ |
1573 |
| - "pd.DataFrame({\n", |
1574 |
| - " \"Original\": inputs,\n", |
1575 |
| - " \"Display\": inputs_for_display,\n", |
1576 |
| - " \"Internal of display\": [normalize_for_display([w for w in re.split(r\"( |\\<\\w+\\>)\", input) if w], is_token=True) for input in inputs_for_display],\n", |
1577 |
| - "})" |
| 1488 | + " return pd.DataFrame({\n", |
| 1489 | + " \"Original\": inputs,\n", |
| 1490 | + " \"Display\": inputs_for_display,\n", |
| 1491 | + " \"Display of tokens\": [normalize_for_display(words, is_token=True) for input in inputs],\n", |
| 1492 | + " })\n", |
| 1493 | + "\n", |
| 1494 | + "test_arabic_codeswitching_display()" |
1578 | 1495 | ]
|
1579 | 1496 | },
|
1580 | 1497 | {
|
|