Skip to content

Commit e3680fa

Browse files
author
sungjin.712
committed
[Python]<fix.> 7.2.2.bert_finetune_KorNLI.ipynb
1 parent eb57bb1 commit e3680fa

File tree

1 file changed

+95
-12
lines changed

1 file changed

+95
-12
lines changed

7.PRETRAIN_METHOD/7.2.2.bert_finetune_KorNLI.ipynb

Lines changed: 95 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,20 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": null,
5+
"execution_count": 1,
66
"metadata": {},
7-
"outputs": [],
7+
"outputs": [
8+
{
9+
"name": "stderr",
10+
"output_type": "stream",
11+
"text": [
12+
"WARNING:root:Limited tf.compat.v2.summary API due to missing TensorBoard installation.\n",
13+
"WARNING:root:Limited tf.compat.v2.summary API due to missing TensorBoard installation.\n",
14+
"WARNING:root:Limited tf.compat.v2.summary API due to missing TensorBoard installation.\n",
15+
"WARNING:root:Limited tf.summary API due to missing TensorBoard installation.\n"
16+
]
17+
}
18+
],
819
"source": [
920
"import os\n",
1021
"import tensorflow as tf\n",
@@ -21,7 +32,7 @@
2132
},
2233
{
2334
"cell_type": "code",
24-
"execution_count": null,
35+
"execution_count": 2,
2536
"metadata": {},
2637
"outputs": [],
2738
"source": [
@@ -38,7 +49,7 @@
3849
},
3950
{
4051
"cell_type": "code",
41-
"execution_count": null,
52+
"execution_count": 3,
4253
"metadata": {},
4354
"outputs": [],
4455
"source": [
@@ -68,9 +79,17 @@
6879
},
6980
{
7081
"cell_type": "code",
71-
"execution_count": null,
82+
"execution_count": 4,
7283
"metadata": {},
73-
"outputs": [],
84+
"outputs": [
85+
{
86+
"name": "stdout",
87+
"output_type": "stream",
88+
"text": [
89+
"Total # dataset: train - 942808, dev - 2490\n"
90+
]
91+
}
92+
],
7493
"source": [
7594
"# Load Train dataset\n",
7695
"\n",
@@ -93,9 +112,31 @@
93112
},
94113
{
95114
"cell_type": "code",
96-
"execution_count": null,
115+
"execution_count": 5,
97116
"metadata": {},
98-
"outputs": [],
117+
"outputs": [
118+
{
119+
"data": {
120+
"application/vnd.jupyter.widget-view+json": {
121+
"model_id": "15c0a56436d84bbc80567d3a2e15befb",
122+
"version_major": 2,
123+
"version_minor": 0
124+
},
125+
"text/plain": [
126+
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…"
127+
]
128+
},
129+
"metadata": {},
130+
"output_type": "display_data"
131+
},
132+
{
133+
"name": "stdout",
134+
"output_type": "stream",
135+
"text": [
136+
"\n"
137+
]
138+
}
139+
],
99140
"source": [
100141
"# Bert Tokenizer\n",
101142
"\n",
@@ -126,7 +167,7 @@
126167
},
127168
{
128169
"cell_type": "code",
129-
"execution_count": null,
170+
"execution_count": 13,
130171
"metadata": {},
131172
"outputs": [],
132173
"source": [
@@ -152,6 +193,38 @@
152193
"train_snli_xnli_inputs = (train_snli_xnli_input_ids, train_snli_xnli_attention_masks, train_snli_xnli_type_ids)"
153194
]
154195
},
196+
{
197+
"cell_type": "code",
198+
"execution_count": 18,
199+
"metadata": {},
200+
"outputs": [
201+
{
202+
"name": "stdout",
203+
"output_type": "stream",
204+
"text": [
205+
"[ 101 9251 10622 9847 97802 8888 13890 33305 9379 25549 12310 9619\n",
206+
" 11261 9150 12965 28188 66346 119 102 9405 61250 10892 9538 78705\n",
207+
" 11489 9251 10622 9845 11664 11506 119 102 0 0 0 0\n",
208+
" 0 0 0 0 0 0 0 0 0 0 0 0]\n",
209+
"[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0\n",
210+
" 0 0 0 0 0 0 0 0 0 0 0]\n",
211+
"[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0\n",
212+
" 0 0 0 0 0 0 0 0 0 0 0]\n",
213+
"[CLS] 말을 탄 사람이 고장난 비행기 위로 뛰어오른다. [SEP] 사람은 야외에서 말을 타고 있다. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]\n"
214+
]
215+
}
216+
],
217+
"source": [
218+
"input_id = train_snli_xnli_input_ids[2]\n",
219+
"attention_mask = train_snli_xnli_attention_masks[2]\n",
220+
"token_type_id = train_snli_xnli_type_ids[2]\n",
221+
"\n",
222+
"print(input_id)\n",
223+
"print(attention_mask)\n",
224+
"print(token_type_id)\n",
225+
"print(tokenizer.decode(input_id))"
226+
]
227+
},
155228
{
156229
"cell_type": "markdown",
157230
"metadata": {},
@@ -161,9 +234,19 @@
161234
},
162235
{
163236
"cell_type": "code",
164-
"execution_count": null,
237+
"execution_count": 6,
165238
"metadata": {},
166-
"outputs": [],
239+
"outputs": [
240+
{
241+
"name": "stderr",
242+
"output_type": "stream",
243+
"text": [
244+
"Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
245+
"/Users/user/anaconda3/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2016: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
246+
" warnings.warn(\n"
247+
]
248+
}
249+
],
167250
"source": [
168251
"# 토크나이저를 제외하고는 5장에서 처리한 방식과 유사하게 접근\n",
169252
"input_ids = []\n",
@@ -390,7 +473,7 @@
390473
"name": "python",
391474
"nbconvert_exporter": "python",
392475
"pygments_lexer": "ipython3",
393-
"version": "3.7.3"
476+
"version": "3.8.3"
394477
}
395478
},
396479
"nbformat": 4,

0 commit comments

Comments (0)