|
2 | 2 | "cells": [
3 | 3 | {
4 | 4 | "cell_type": "code",
5 | | - "execution_count": null,
| 5 | + "execution_count": 1,
6 | 6 | "metadata": {},
7 | | - "outputs": [],
| 7 | + "outputs": [
| 8 | + {
| 9 | + "name": "stderr",
| 10 | + "output_type": "stream",
| 11 | + "text": [
| 12 | + "WARNING:root:Limited tf.compat.v2.summary API due to missing TensorBoard installation.\n",
| 13 | + "WARNING:root:Limited tf.compat.v2.summary API due to missing TensorBoard installation.\n",
| 14 | + "WARNING:root:Limited tf.compat.v2.summary API due to missing TensorBoard installation.\n",
| 15 | + "WARNING:root:Limited tf.summary API due to missing TensorBoard installation.\n"
| 16 | + ]
| 17 | + }
| 18 | + ],
8 | 19 | "source": [
9 | 20 | "import os\n",
10 | 21 | "import tensorflow as tf\n",
|
|
21 | 32 | },
22 | 33 | {
23 | 34 | "cell_type": "code",
24 | | - "execution_count": null,
| 35 | + "execution_count": 2,
25 | 36 | "metadata": {},
26 | 37 | "outputs": [],
27 | 38 | "source": [
|
|
38 | 49 | },
39 | 50 | {
40 | 51 | "cell_type": "code",
41 | | - "execution_count": null,
| 52 | + "execution_count": 3,
42 | 53 | "metadata": {},
43 | 54 | "outputs": [],
44 | 55 | "source": [
|
|
68 | 79 | },
69 | 80 | {
70 | 81 | "cell_type": "code",
71 | | - "execution_count": null,
| 82 | + "execution_count": 4,
72 | 83 | "metadata": {},
73 | | - "outputs": [],
| 84 | + "outputs": [
| 85 | + {
| 86 | + "name": "stdout",
| 87 | + "output_type": "stream",
| 88 | + "text": [
| 89 | + "Total # dataset: train - 942808, dev - 2490\n"
| 90 | + ]
| 91 | + }
| 92 | + ],
74 | 93 | "source": [
75 | 94 | "# Load Train dataset\n",
76 | 95 | "\n",
|
|
93 | 112 | },
94 | 113 | {
95 | 114 | "cell_type": "code",
96 | | - "execution_count": null,
| 115 | + "execution_count": 5,
97 | 116 | "metadata": {},
98 | | - "outputs": [],
| 117 | + "outputs": [
| 118 | + {
| 119 | + "data": {
| 120 | + "application/vnd.jupyter.widget-view+json": {
| 121 | + "model_id": "15c0a56436d84bbc80567d3a2e15befb",
| 122 | + "version_major": 2,
| 123 | + "version_minor": 0
| 124 | + },
| 125 | + "text/plain": [
| 126 | + "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…"
| 127 | + ]
| 128 | + },
| 129 | + "metadata": {},
| 130 | + "output_type": "display_data"
| 131 | + },
| 132 | + {
| 133 | + "name": "stdout",
| 134 | + "output_type": "stream",
| 135 | + "text": [
| 136 | + "\n"
| 137 | + ]
| 138 | + }
| 139 | + ],
99 | 140 | "source": [
100 | 141 | "# Bert Tokenizer\n",
101 | 142 | "\n",
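Note: the widget output captured above is the vocabulary download that `from_pretrained` triggers the first time the tokenizer is instantiated. The cell source is truncated in this hunk, so the checkpoint name is not visible; a minimal sketch of the setup it implies, assuming the multilingual cased BERT checkpoint from Hugging Face `transformers` (the later output decodes Korean text with `[CLS]`/`[SEP]`/`[PAD]` markers, which is consistent with that vocabulary, but the actual arguments are an assumption):

```python
from transformers import BertTokenizer

# Assumed checkpoint; the real cell body is outside this hunk.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-multilingual-cased",
    do_lower_case=False,  # cased checkpoint, so keep casing
)
```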
|
|
126 | 167 | },
127 | 168 | {
128 | 169 | "cell_type": "code",
129 | | - "execution_count": null,
| 170 | + "execution_count": 13,
130 | 171 | "metadata": {},
131 | 172 | "outputs": [],
132 | 173 | "source": [
|
|
152 | 193 | "train_snli_xnli_inputs = (train_snli_xnli_input_ids, train_snli_xnli_attention_masks, train_snli_xnli_type_ids)"
|
153 | 194 | ]
|
154 | 195 | },
|
| 196 | + { |
| 197 | + "cell_type": "code", |
| 198 | + "execution_count": 18, |
| 199 | + "metadata": {}, |
| 200 | + "outputs": [ |
| 201 | + { |
| 202 | + "name": "stdout", |
| 203 | + "output_type": "stream", |
| 204 | + "text": [ |
| 205 | + "[ 101 9251 10622 9847 97802 8888 13890 33305 9379 25549 12310 9619\n", |
| 206 | + " 11261 9150 12965 28188 66346 119 102 9405 61250 10892 9538 78705\n", |
| 207 | + " 11489 9251 10622 9845 11664 11506 119 102 0 0 0 0\n", |
| 208 | + " 0 0 0 0 0 0 0 0 0 0 0 0]\n", |
| 209 | + "[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0\n", |
| 210 | + " 0 0 0 0 0 0 0 0 0 0 0]\n", |
| 211 | + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0\n", |
| 212 | + " 0 0 0 0 0 0 0 0 0 0 0]\n", |
| 213 | + "[CLS] 말을 탄 사람이 고장난 비행기 위로 뛰어오른다. [SEP] 사람은 야외에서 말을 타고 있다. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]\n" |
| 214 | + ] |
| 215 | + } |
| 216 | + ], |
| 217 | + "source": [ |
| 218 | + "input_id = train_snli_xnli_input_ids[2]\n", |
| 219 | + "attention_mask = train_snli_xnli_attention_masks[2]\n", |
| 220 | + "token_type_id = train_snli_xnli_type_ids[2]\n", |
| 221 | + "\n", |
| 222 | + "print(input_id)\n", |
| 223 | + "print(attention_mask)\n", |
| 224 | + "print(token_type_id)\n", |
| 225 | + "print(tokenizer.decode(input_id))" |
| 226 | + ] |
| 227 | + }, |
155 | 228 | {
|
156 | 229 | "cell_type": "markdown",
|
157 | 230 | "metadata": {},
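Note: the cell added above (new lines 196-227) prints one encoded training example: 32 real tokens followed by `[PAD]` ids, an attention mask that is 1 exactly over those 32 positions, and token type ids that stay 0 through the first `[SEP]` and switch to 1 for the hypothesis. A small sanity-check sketch along those lines (hypothetical follow-up cell; it assumes `input_id`, `attention_mask`, and `token_type_id` are NumPy arrays, as the printed formatting suggests):

```python
import numpy as np

sep_id = 102                                     # BERT's [SEP] id, visible twice in the output above
first_sep = int(np.where(input_id == sep_id)[0][0])

# Mask is 1 on every non-[PAD] position.
assert (attention_mask == (input_id != 0).astype(int)).all()
# Premise tokens (up to and including the first [SEP]) are segment 0.
assert token_type_id[: first_sep + 1].sum() == 0
# Hypothesis tokens are segment 1.
assert token_type_id[first_sep + 1 : attention_mask.sum()].all()
```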
|
|
161 | 234 | },
162 | 235 | {
163 | 236 | "cell_type": "code",
164 | | - "execution_count": null,
| 237 | + "execution_count": 6,
165 | 238 | "metadata": {},
166 | | - "outputs": [],
| 239 | + "outputs": [
| 240 | + {
| 241 | + "name": "stderr",
| 242 | + "output_type": "stream",
| 243 | + "text": [
| 244 | + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
| 245 | + "/Users/user/anaconda3/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2016: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
| 246 | + " warnings.warn(\n"
| 247 | + ]
| 248 | + }
| 249 | + ],
167 | 250 | "source": [
168 | 251 | "# Except for the tokenizer, this follows much the same approach as in Chapter 5\n",
169 | 252 | "input_ids = []\n",
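Note: the two stderr messages captured above come from the encoding loop that starts in this cell: `max_length` is passed without `truncation=True`, and the deprecated `pad_to_max_length` argument is used. A minimal sketch of the equivalent call using the arguments the FutureWarning recommends (hypothetical, since the rest of the cell body is outside this hunk; it reuses the notebook's `tokenizer` and the `input_ids` list defined in this cell, and the sentence pair and length are taken from the example printed earlier):

```python
encoded = tokenizer.encode_plus(
    "말을 탄 사람이 고장난 비행기 위로 뛰어오른다.",  # premise (from the decoded example above)
    "사람은 야외에서 말을 타고 있다.",               # hypothesis
    max_length=48,                # illustrative; matches the padded length printed earlier
    padding="max_length",         # replaces the deprecated pad_to_max_length=True
    truncation=True,              # silences the 'longest_first' truncation warning
    return_attention_mask=True,
    return_token_type_ids=True,
)
# Presumably appended to the per-example lists built in this cell.
input_ids.append(encoded["input_ids"])
```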
|
|
390 | 473 | "name": "python",
391 | 474 | "nbconvert_exporter": "python",
392 | 475 | "pygments_lexer": "ipython3",
393 | | - "version": "3.7.3"
| 476 | + "version": "3.8.3"
394 | 477 | }
395 | 478 | },
396 | 479 | "nbformat": 4,
|
|