version1

anhthuan1999 · Dec 5, 2019 · 7793a8b · 7793a8b
1 parent 754d88a
commit 7793a8b
Show file tree

Hide file tree

Showing 9 changed files with 1,592 additions and 0 deletions.
diff --git a/images/bert.png b/images/bert.png
diff --git a/images/bilstm.png b/images/bilstm.png
diff --git a/images/lstm.png b/images/lstm.png
diff --git a/models/BERT.ipynb b/models/BERT.ipynb
diff --git a/models/BiLSTM.ipynb b/models/BiLSTM.ipynb
@@ -0,0 +1,366 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "BiLSTM.ipynb",
+      "provenance": [],
+      "collapsed_sections": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "_EAmAfts104F",
+        "colab_type": "text"
+      },
+      "source": [
+        "Library"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "GfEUUbmIYBZp",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "from pyvi import ViTokenizer, ViPosTagger\n",
+        "import numpy as np\n",
+        "import pandas as pd\n",
+        "import gensim\n",
+        "import sklearn\n",
+        "import tensorflow as tf\n",
+        "from sklearn.model_selection import train_test_split\n",
+        "from sklearn.decomposition import TruncatedSVD\n",
+        "from sklearn.metrics import classification_report\n",
+        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+        "from tensorflow.keras.layers import *\n",
+        "from keras.layers import *\n",
+        "from keras.preprocessing.text import Tokenizer\n",
+        "from keras.preprocessing.sequence import pad_sequences\n",
+        "from keras.models import Model\n",
+        "from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional\n",
+        "from tensorflow.keras.optimizers import Adam"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "8sjoCQj2YEgS",
+        "colab_type": "text"
+      },
+      "source": [
+        "Chuẩn bị data"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "D8V78zDiYB06",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "datanewscontent=pd.read_excel(r'path_to_newscontent.xlsx','Sheet1')\n",
+        "datacomment=pd.read_excel(r'path_to_comments.xlsx','Sheet1')"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "hATdRopJYGJT",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "datacmt=[]\n",
+        "for d in datacomment['comment']:\n",
+        "    e=ViTokenizer.tokenize(str(d))\n",
+        "    datacmt.append(e)\n",
+        "labelcmt=datacomment['label']"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "lDMCLRpGgjy6",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "datanews=[]\n",
+        "for d in datanewscontent['all_lower']:\n",
+        "    e=ViTokenizer.tokenize(str(d))\n",
+        "    datanews.append(e)\n",
+        "labelnews=datanewscontent['label']"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "4O9hoYc618vH",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "def truncatedvectors(data,n_components=300):\n",
+        "  svd_ngram = TruncatedSVD(n_components=n_components, random_state=42)\n",
+        "  svd_ngram.fit(data)\n",
+        "  return svd_ngram.transform(data)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "3DkRKt8jgWX6",
+        "colab_type": "text"
+      },
+      "source": [
+        "Word2vec"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "MhjtoeymgXww",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "from gensim.models import Word2Vec,KeyedVectors \n",
+        "import os\n",
+        "word2vec_model_path =\"path_to_wikivimodel.bin\"\n",
+        "model = KeyedVectors.load_word2vec_format(word2vec_model_path,binary=True, unicode_errors='ignore')\n",
+        "vocab = model.wv.vocab\n",
+        "wv = model.wv"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "fUY9bAzqgcFL",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "def get_word2vec_data(X):\n",
+        "    word2vec_data = []\n",
+        "    for x in X:\n",
+        "        sentence = []\n",
+        "        for word in x.split(\" \"):\n",
+        "            if word in vocab:\n",
+        "              sentence=sentence+wv[word].ravel().tolist()\n",
+        "        word2vec_data.append(sentence)\n",
+        "\n",
+        "    return word2vec_data\n",
+        "def change_to_word2vec(data):\n",
+        "  data2vec=get_word2vec_data(data)\n",
+        "  lengthOfdata=[len(data2vec[i]) for i,n in enumerate(data2vec)]\n",
+        "  for i,n in enumerate(data):\n",
+        "    if(len(data2vec[i])<max(lengthOfdata)):\n",
+        "      for j in range(1,(max(lengthOfdata)-len(data2vec[i]))+1):\n",
+        "        data2vec[i].append(0)\n",
+        "  return truncatedvectors(np.array(data2vec))"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "lYenN_XxgduY",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "X_data_w2v_cmt=change_to_word2vec(datacmt)\n",
+        "X_data_w2v_news=change_to_word2vec(datanews)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "bzQ6SfQ6gUu2",
+        "colab_type": "text"
+      },
+      "source": [
+        "TF-IDF"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "2nt6QoayYaAC",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "def tfidf(data):\n",
+        "  tfidf_vect_ngram = TfidfVectorizer(analyzer='word', max_features=30000, ngram_range=(1, 2))\n",
+        "  tfidf_vect_ngram.fit(data)\n",
+        "  X_data_tfidf_ngram =  tfidf_vect_ngram.transform(data)\n",
+        "  return truncatedvectors(X_data_tfidf_ngram)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "ICtCihzZ2ICZ",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "X_data_tfidf_cmt=tfidf(datacmt)\n",
+        "X_data_tfidf_news=tfidf(datanews)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "hjd_8vQtgrDJ",
+        "colab_type": "text"
+      },
+      "source": [
+        "Bag of words"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "qUXykOtEgsnc",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "def bow(data):\n",
+        "  tokenizer = Tokenizer()\n",
+        "  tokenizer.fit_on_texts(dataall)\n",
+        "  datacmtbow = tokenizer.texts_to_sequences(data)\n",
+        "  datacmtbow= pad_sequences(datacmtbow, maxlen=300)\n",
+        "  return datacmtbow"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "fsoybw4z2KDG",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "X_data_bow_cmt=bow(datacmt)\n",
+        "X_data_bow_news=bow(datanews)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "pUPnj8I02MbX",
+        "colab_type": "text"
+      },
+      "source": [
+        "BiLSTM Model"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "-OqFCpLOYgcC",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "#Change: [X_data_tfidf_cmt,  X_data_w2v_cmt, X_data_bow_cmt],labelcmt , [X_data_tfidf_news,X_data_bow_news,X_data_w2v_news], labelnews\n",
+        "X_train, X_val, y_train, y_val = train_test_split(X_data_tfidf_cmt, labelcmt, test_size=0.2, random_state=42)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "hRNGXEsVEe77",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "#learning_rate: [1e-5,5e-5,1e-4,5e-4,1e-3,5e-3]\n",
+        "opt = Adam(lr=0.001)\n",
+        "def create_lstm_model():\n",
+        "    input_layer = Input(shape=(300,))\n",
+        "    layer = Reshape((10, 30,))(input_layer)\n",
+        "    layer = Bidirectional(LSTM(128, activation='relu',dropout=0.5, recurrent_dropout=0.5))(layer)\n",
+        "    layer = Dense(64, activation='relu')(layer)\n",
+        "    layer = Dense(32, activation='relu')(layer)\n",
+        "    output_layer = Dense(3, activation='softmax')(layer)\n",
+        "    classifier = Model(input_layer, output_layer)\n",
+        "    classifier.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])\n",
+        "    return classifier"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "_qv7i9NVncbl",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "y_train_lstm_encode = to_categorical(y_train_lstm)\n",
+        "y_val_lstm_encode=to_categorical(y_val_lstm)\n",
+        "classifier = create_lstm_model()\n",
+        "classifier.fit(x_train_lstm, y_train_lstm_encode, validation_data=(x_val_lstm, y_val_lstm_encode), epochs=200, batch_size=32)\n",
+        "#change different epoch and batch_size"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "9Cbt1elLYiVB",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "y_pred = classifier.predict(x_val_lstm, batch_size=32, verbose=1)\n",
+        "y_pred_bool = np.argmax(y_pred, axis=1)\n",
+        "print(classification_report(y_val_lstm, y_pred_bool))"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    }
+  ]
+}