Skip to content

Commit e3680fa

Browse files
author
sungjin.712
committed
[Python]<fix.> 7.2.2.bert_finetune_KorNLI.ipynb
1 parent eb57bb1 commit e3680fa

File tree

1 file changed

+95
-12
lines changed

1 file changed

+95
-12
lines changed

7.PRETRAIN_METHOD/7.2.2.bert_finetune_KorNLI.ipynb

Lines changed: 95 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,20 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": null,
5+
"execution_count": 1,
66
"metadata": {},
7-
"outputs": [],
7+
"outputs": [
8+
{
9+
"name": "stderr",
10+
"output_type": "stream",
11+
"text": [
12+
"WARNING:root:Limited tf.compat.v2.summary API due to missing TensorBoard installation.\n",
13+
"WARNING:root:Limited tf.compat.v2.summary API due to missing TensorBoard installation.\n",
14+
"WARNING:root:Limited tf.compat.v2.summary API due to missing TensorBoard installation.\n",
15+
"WARNING:root:Limited tf.summary API due to missing TensorBoard installation.\n"
16+
]
17+
}
18+
],
819
"source": [
920
"import os\n",
1021
"import tensorflow as tf\n",
@@ -21,7 +32,7 @@
2132
},
2233
{
2334
"cell_type": "code",
24-
"execution_count": null,
35+
"execution_count": 2,
2536
"metadata": {},
2637
"outputs": [],
2738
"source": [
@@ -38,7 +49,7 @@
3849
},
3950
{
4051
"cell_type": "code",
41-
"execution_count": null,
52+
"execution_count": 3,
4253
"metadata": {},
4354
"outputs": [],
4455
"source": [
@@ -68,9 +79,17 @@
6879
},
6980
{
7081
"cell_type": "code",
71-
"execution_count": null,
82+
"execution_count": 4,
7283
"metadata": {},
73-
"outputs": [],
84+
"outputs": [
85+
{
86+
"name": "stdout",
87+
"output_type": "stream",
88+
"text": [
89+
"Total # dataset: train - 942808, dev - 2490\n"
90+
]
91+
}
92+
],
7493
"source": [
7594
"# Load Train dataset\n",
7695
"\n",
@@ -93,9 +112,31 @@
93112
},
94113
{
95114
"cell_type": "code",
96-
"execution_count": null,
115+
"execution_count": 5,
97116
"metadata": {},
98-
"outputs": [],
117+
"outputs": [
118+
{
119+
"data": {
120+
"application/vnd.jupyter.widget-view+json": {
121+
"model_id": "15c0a56436d84bbc80567d3a2e15befb",
122+
"version_major": 2,
123+
"version_minor": 0
124+
},
125+
"text/plain": [
126+
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…"
127+
]
128+
},
129+
"metadata": {},
130+
"output_type": "display_data"
131+
},
132+
{
133+
"name": "stdout",
134+
"output_type": "stream",
135+
"text": [
136+
"\n"
137+
]
138+
}
139+
],
99140
"source": [
100141
"# Bert Tokenizer\n",
101142
"\n",
@@ -126,7 +167,7 @@
126167
},
127168
{
128169
"cell_type": "code",
129-
"execution_count": null,
170+
"execution_count": 13,
130171
"metadata": {},
131172
"outputs": [],
132173
"source": [
@@ -152,6 +193,38 @@
152193
"train_snli_xnli_inputs = (train_snli_xnli_input_ids, train_snli_xnli_attention_masks, train_snli_xnli_type_ids)"
153194
]
154195
},
196+
{
197+
"cell_type": "code",
198+
"execution_count": 18,
199+
"metadata": {},
200+
"outputs": [
201+
{
202+
"name": "stdout",
203+
"output_type": "stream",
204+
"text": [
205+
"[ 101 9251 10622 9847 97802 8888 13890 33305 9379 25549 12310 9619\n",
206+
" 11261 9150 12965 28188 66346 119 102 9405 61250 10892 9538 78705\n",
207+
" 11489 9251 10622 9845 11664 11506 119 102 0 0 0 0\n",
208+
" 0 0 0 0 0 0 0 0 0 0 0 0]\n",
209+
"[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0\n",
210+
" 0 0 0 0 0 0 0 0 0 0 0]\n",
211+
"[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0\n",
212+
" 0 0 0 0 0 0 0 0 0 0 0]\n",
213+
"[CLS] 말을 탄 사람이 고장난 비행기 위로 뛰어오른다. [SEP] 사람은 야외에서 말을 타고 있다. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]\n"
214+
]
215+
}
216+
],
217+
"source": [
218+
"input_id = train_snli_xnli_input_ids[2]\n",
219+
"attention_mask = train_snli_xnli_attention_masks[2]\n",
220+
"token_type_id = train_snli_xnli_type_ids[2]\n",
221+
"\n",
222+
"print(input_id)\n",
223+
"print(attention_mask)\n",
224+
"print(token_type_id)\n",
225+
"print(tokenizer.decode(input_id))"
226+
]
227+
},
155228
{
156229
"cell_type": "markdown",
157230
"metadata": {},
@@ -161,9 +234,19 @@
161234
},
162235
{
163236
"cell_type": "code",
164-
"execution_count": null,
237+
"execution_count": 6,
165238
"metadata": {},
166-
"outputs": [],
239+
"outputs": [
240+
{
241+
"name": "stderr",
242+
"output_type": "stream",
243+
"text": [
244+
"Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n",
245+
"/Users/user/anaconda3/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2016: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).\n",
246+
" warnings.warn(\n"
247+
]
248+
}
249+
],
167250
"source": [
168251
"# 토크나이저를 제외하고는 5장에서 처리한 방식과 유사하게 접근\n",
169252
"input_ids = []\n",
@@ -390,7 +473,7 @@
390473
"name": "python",
391474
"nbconvert_exporter": "python",
392475
"pygments_lexer": "ipython3",
393-
"version": "3.7.3"
476+
"version": "3.8.3"
394477
}
395478
},
396479
"nbformat": 4,

0 commit comments

Comments (0)