
Commit 528e13b

Commit message: tuning
1 parent 6121c19 · commit 528e13b

File tree: 1 file changed (+33, −116)

tokenizers.ipynb

Lines changed: 33 additions & 116 deletions
@@ -4,7 +4,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"# 🔎🔢 Tokenizers (in Deep Learning)"
+"# 🔎🔢 Hands-On Tokenizers (in Deep Learning)"
 ]
 },
 {
@@ -42,7 +42,9 @@
 "source": [
 "A tokenizer maps `string` $\\rightleftharpoons$ `list of tokens`.\n",
 "* `encode`(\"string\") $\\mapsto$ [\"list\", \"of\", \"tokens\"]\n",
-"* `decode`([\"list\", \"of\", \"tokens\"]) $\\mapsto$ \"string\""
+"* `decode`([\"list\", \"of\", \"tokens\"]) $\\mapsto$ \"string\"\n",
+"\n",
+"In deep learning, a tokenizer is a <u>pre-processing</u> and/or <u>post-processing</u> brick for an artificial neural network that processes and/or generates text."
 ]
 },
 {
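The markdown added above states the `encode`/`decode` contract. A minimal round-trip sketch of that contract (a naive whitespace tokenizer, purely illustrative, not the notebook's own class):

```python
# A naive whitespace tokenizer: just enough to show the round-trip contract.
def encode(text: str) -> list:
    return text.split(" ")

def decode(tokens: list) -> str:
    return " ".join(tokens)

assert decode(encode("list of tokens")) == "list of tokens"
```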
@@ -297,20 +299,20 @@
 },
 {
 "cell_type": "code",
-"execution_count": 2,
+"execution_count": 19,
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
 "== CharSplitter ==\n",
-"-> ['M', 'a', 'i', 's', ',', ' ', 'm', 'a', 'i', 's', '…', ' ', 'v', 'a', 's', ' ', 't', \"'\", 'e', 'n', ' ', 'l', 'à', '-', 'b', 'a', 's', ' ', '!']\n",
-"-> Mais, mais… vas t'en là-bas !\n",
+"➡️ ['M', 'a', 'i', 's', ',', ' ', 'm', 'a', 'i', 's', '…', ' ', 'v', 'a', 's', ' ', 't', \"'\", 'e', 'n', ' ', 'l', 'à', '-', 'b', 'a', 's', ' ', '!']\n",
+"➡️ Mais, mais… vas t'en là-bas !\n",
 "\n",
 "== WordSplitter ==\n",
-"-> ['Mais,', '▁mais…', '▁vas', \"▁t'en\", '▁là-bas', '▁!']\n",
-"-> Mais, mais… vas t'en là-bas !\n",
+"➡️ ['Mais,', '▁mais…', '▁vas', \"▁t'en\", '▁là-bas', '▁!']\n",
+"➡️ Mais, mais… vas t'en là-bas !\n",
 "\n"
 ]
 }
@@ -335,21 +337,17 @@
 "    def join(self, tokens: list) -> str:\n",
 "        return \"\".join(tokens).replace(self._SPACE, \" \")\n",
 "\n",
-"\n",
 "input = \"Mais, mais… vas t'en là-bas !\"\n",
 "\n",
-"for tokenizer in [\n",
-"    CharSplitter(),\n",
-"    WordSplitter(),\n",
-"    ]:\n",
+"for tokenizer in [CharSplitter(), WordSplitter()]:\n",
 "\n",
 "    encoded = tokenizer.split(input)\n",
 "\n",
 "    # Round-trip test\n",
 "    encoded_decoded = tokenizer.join(encoded)\n",
 "    assert encoded_decoded == input\n",
 "    \n",
-"    print(f\"== {tokenizer.__class__.__name__} ==\\n-> {encoded}\\n-> {encoded_decoded}\\n\")"
+"    print(f\"== {tokenizer.__class__.__name__} ==\\n➡️ {encoded}\\n➡️ {encoded_decoded}\\n\")"
 ]
 },
 {
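Only `join` and the driver loop are visible in this hunk. A hedged reconstruction of both splitters, inferred from the printed outputs above (the `▁` marker is the SentencePiece-style visible space); the `split` bodies are assumptions, not the notebook's exact code:

```python
class CharSplitter:
    def split(self, text: str) -> list:
        return list(text)

    def join(self, tokens: list) -> str:
        return "".join(tokens)

class WordSplitter:
    _SPACE = "\u2581"  # "▁", visible space marker (SentencePiece convention)

    def split(self, text: str) -> list:
        words = text.split(" ")
        # Prefix every former space with "▁" so that join() can restore it
        return [words[0]] + [self._SPACE + w for w in words[1:]]

    def join(self, tokens: list) -> str:
        return "".join(tokens).replace(self._SPACE, " ")

input = "Mais, mais… vas t'en là-bas !"
for tokenizer in [CharSplitter(), WordSplitter()]:
    assert tokenizer.join(tokenizer.split(input)) == input  # round trip
```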
@@ -363,7 +361,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 209,
+"execution_count": 17,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -395,17 +393,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": 16,
+"execution_count": 18,
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
 "Mais, mais… vas t'en là-bas !\n",
-"-> [80, 100, 108, 118, 47, 35, 112, 100, 108, 118, 0, 35, 121, 100, 118, 35, 119, 42, 104, 113, 35, 111, 0, 48, 101, 100, 118, 35, 36]\n",
-"-> ['M', 'a', 'i', 's', ',', ' ', 'm', 'a', 'i', 's', '<unk>', ' ', 'v', 'a', 's', ' ', 't', \"'\", 'e', 'n', ' ', 'l', '<unk>', '-', 'b', 'a', 's', ' ', '!']\n",
-"-> Mais, mais<unk> vas t'en l<unk>-bas !\n"
+"➡️ [80, 100, 108, 118, 47, 35, 112, 100, 108, 118, 0, 35, 121, 100, 118, 35, 119, 42, 104, 113, 35, 111, 0, 48, 101, 100, 118, 35, 36]\n",
+"➡️ ['M', 'a', 'i', 's', ',', ' ', 'm', 'a', 'i', 's', '<unk>', ' ', 'v', 'a', 's', ' ', 't', \"'\", 'e', 'n', ' ', 'l', '<unk>', '-', 'b', 'a', 's', ' ', '!']\n",
+"➡️ Mais, mais<unk> vas t'en l<unk>-bas !\n"
 ]
 }
 ],
@@ -416,7 +414,7 @@
 "encoded = tokenizer.encode(input)\n",
 "encoded_str = tokenizer.encode_str(input)\n",
 "encoded_decoded = tokenizer.join(encoded_str)\n",
-"print(f\"{input}\\n-> {encoded}\\n-> {encoded_str}\\n-> {encoded_decoded}\")"
+"print(f\"{input}\\n➡️ {encoded}\\n➡️ {encoded_str}\\n➡️ {encoded_decoded}\")"
 ]
 },
 {
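The output above shows out-of-vocabulary characters (`…`, `à`) collapsing to `<unk>` (id 0). A minimal sketch of that fallback behaviour; the class name and vocabulary below are illustrative assumptions, not the notebook's implementation:

```python
import string

class CharVocabTokenizer:
    _UNK_ID = 0
    _UNK_TOKEN = "<unk>"

    def __init__(self, alphabet: str):
        # id 0 is reserved for <unk>; known characters start at 1
        self._char_to_id = {c: i + 1 for i, c in enumerate(alphabet)}

    def encode(self, text: str) -> list:
        # Characters outside the vocabulary collapse to the <unk> id
        return [self._char_to_id.get(c, self._UNK_ID) for c in text]

    def encode_str(self, text: str) -> list:
        return [c if c in self._char_to_id else self._UNK_TOKEN for c in text]

    def join(self, tokens: list) -> str:
        return "".join(tokens)

tokenizer = CharVocabTokenizer(string.printable)
print(tokenizer.encode_str("là…"))  # ['l', '<unk>', '<unk>']
```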
@@ -863,7 +861,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 127,
+"execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -1031,7 +1029,6 @@
 "\n",
 "    return pd.DataFrame(combined_results)\n",
 "\n",
-"\n",
 "# Cache benchmark results (that are long to compute)\n",
 "_file_to_cache_results = \"expes/tokenizers_fertilities.json\"\n",
 "if \"fertilities\" not in globals():\n",
@@ -1040,7 +1037,6 @@
 "    with open(_file_to_cache_results, \"r\") as f:\n",
 "        fertilities = json.load(f)\n",
 "\n",
-"\n",
 "def benchmark_fertility(\n",
 "    dataset_configs, # = [\n",
 "    #     (\"wikimedia/wikipedia\", \"20231101.\" + lan)\n",
@@ -1168,7 +1164,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 189,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -1225,7 +1221,7 @@
 "</style>\n",
 "<table border=\"1\" class=\"dataframe\">\n",
 "  <thead>\n",
-"    <tr style=\"text-align: left;\">\n",
+"    <tr style=\"text-align: right;\">\n",
 "      <th></th>\n",
 "      <th>Original</th>\n",
 "      <th>Display</th>\n",
@@ -1256,25 +1252,27 @@
 "</div>"
 ],
 "text/plain": [
-"  Original Display \\\n",
+"  Original Display \\\n",
 "0 مرحباً Jean-Pierre، كيف حالك؟ ‏، كيف حالك؟‎Jean-Pierre ‏مرحباً \n",
 "1 Jean-Pierre، كيف حالك؟ ‏، كيف حالك؟‎Jean-Pierre \n",
 "2 مرحباً Jean-Pierre ‎Jean-Pierre ‏مرحباً \n",
 "\n",
-"  Display of tokens \n",
+"  Display of tokens \n",
 "0 ‏┃،┃كيف┃حالك┃؟‎Jean-Pierre┃‏مرحباً \n",
 "1 ‏┃،┃كيف┃حالك┃؟‎Jean-Pierre┃‏مرحباً \n",
 "2 ‏┃،┃كيف┃حالك┃؟‎Jean-Pierre┃‏مرحباً "
 ]
 },
-"execution_count": 189,
+"execution_count": 14,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
 "source": [
 "import re\n",
 "\n",
+"# from bidi.algorithm import get_display  # Did not find a good thing in python-bidi (?)\n",
+"\n",
 "# Unicode characters for Right-to-Left Mark (RLM) and Left-to-Right Mark (LRM)\n",
 "_RLM = '\\u200F'\n",
 "_LRM = '\\u200E'\n",
@@ -1343,9 +1341,6 @@
 "\n",
 "    return text\n",
 "\n",
-"# from bidi.algorithm import get_display  # Did not find a good thing in python-bidi (?)\n",
-"\n",
-"\n",
 "def is_separator(char):\n",
 "    return char in \"\"\n",
 "\n",
@@ -1441,8 +1436,7 @@
 "\n",
 "    return \"\".join(chunks_by_language)\n",
 "\n",
-"\n",
-"if \"TEST\":\n",
+"def test_arabic_codeswitching_display():\n",
 "    title = \"☪ Test: Fix of display for text with Arabic and code-switching ☪\"\n",
 "    print(f\"{title}\\n\" + \"-\"*(len(title)+2))\n",
 "\n",
@@ -1491,90 +1485,13 @@
 "    print(\"\\n😎 String for display (from right to left 👈🏽)\")\n",
 "    print(display_input)\n",
 "\n",
-"pd.DataFrame({\n",
-"    \"Original\": inputs,\n",
-"    \"Display\": inputs_for_display,\n",
-"    \"Display of tokens\": [normalize_for_display(words, is_token=True) for input in inputs],\n",
-"})"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 103,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/html": [
-"<div>\n",
-"<style scoped>\n",
-"    .dataframe tbody tr th:only-of-type {\n",
-"        vertical-align: middle;\n",
-"    }\n",
-"\n",
-"    .dataframe tbody tr th {\n",
-"        vertical-align: top;\n",
-"    }\n",
-"\n",
-"    .dataframe thead th {\n",
-"        text-align: right;\n",
-"    }\n",
-"</style>\n",
-"<table border=\"1\" class=\"dataframe\">\n",
-"  <thead>\n",
-"    <tr style=\"text-align: left;\">\n",
-"      <th></th>\n",
-"      <th>Original</th>\n",
-"      <th>Display</th>\n",
-"      <th>Internal of display</th>\n",
-"    </tr>\n",
-"  </thead>\n",
-"  <tbody>\n",
-"    <tr>\n",
-"      <th>0</th>\n",
-"      <td>مرحباً Jean-Pierre، كيف حالك؟</td>\n",
-"      <td>‏، كيف حالك؟‎Jean-Pierre ‏مرحباً</td>\n",
-"      <td>‎┃▁‏مرحباً‎&lt;LRM&gt;Jean-Pierre┃▁┃&lt;RLM&gt;‏حالك؟┃‎┃▁‏كيف┃‎┃▁‏،‎&lt;RLM&gt;</td>\n",
-"    </tr>\n",
-"    <tr>\n",
-"      <th>1</th>\n",
-"      <td>Jean-Pierre، كيف حالك؟</td>\n",
-"      <td>‏، كيف حالك؟‎Jean-Pierre</td>\n",
-"      <td>‎&lt;LRM&gt;Jean-Pierre‏حالك؟┃‎┃▁‏كيف┃‎┃▁‏،‎&lt;RLM&gt;</td>\n",
-"    </tr>\n",
-"    <tr>\n",
-"      <th>2</th>\n",
-"      <td>مرحباً Jean-Pierre</td>\n",
-"      <td>‎Jean-Pierre ‏مرحباً</td>\n",
-"      <td>‎┃▁‏مرحباً‎&lt;LRM&gt;Jean-Pierre┃▁┃&lt;RLM&gt;</td>\n",
-"    </tr>\n",
-"  </tbody>\n",
-"</table>\n",
-"</div>"
-],
-"text/plain": [
-"  Original Display \\\n",
-"0 مرحباً Jean-Pierre، كيف حالك؟ ‏، كيف حالك؟‎Jean-Pierre ‏مرحباً \n",
-"1 Jean-Pierre، كيف حالك؟ ‏، كيف حالك؟‎Jean-Pierre \n",
-"2 مرحباً Jean-Pierre ‎Jean-Pierre ‏مرحباً \n",
-"\n",
-"  Internal of display \n",
-"0 ‎┃▁‏مرحباً‎<LRM>Jean-Pierre┃▁┃<RLM>‏حالك؟┃‎┃▁‏كيف┃‎┃▁‏،‎<RLM> \n",
-"1 ‎<LRM>Jean-Pierre‏حالك؟┃‎┃▁‏كيف┃‎┃▁‏،‎<RLM> \n",
-"2 ‎┃▁‏مرحباً‎<LRM>Jean-Pierre┃▁┃<RLM> "
-]
-},
-"execution_count": 103,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"pd.DataFrame({\n",
-"    \"Original\": inputs,\n",
-"    \"Display\": inputs_for_display,\n",
-"    \"Internal of display\": [normalize_for_display([w for w in re.split(r\"( |\\<\\w+\\>)\", input) if w], is_token=True) for input in inputs_for_display],\n",
-"})"
+"    return pd.DataFrame({\n",
+"        \"Original\": inputs,\n",
+"        \"Display\": inputs_for_display,\n",
+"        \"Display of tokens\": [normalize_for_display(words, is_token=True) for input in inputs],\n",
+"    })\n",
+"\n",
+"test_arabic_codeswitching_display()"
 ]
 },
 {
