Skip to content

Commit

Permalink
version1
Browse files Browse the repository at this point in the history
  • Loading branch information
anhthuan1999 committed Dec 5, 2019
1 parent 754d88a commit 7793a8b
Show file tree
Hide file tree
Showing 9 changed files with 1,592 additions and 0 deletions.
Binary file added images/bert.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/bilstm.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/lstm.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
497 changes: 497 additions & 0 deletions models/BERT.ipynb

Large diffs are not rendered by default.

366 changes: 366 additions & 0 deletions models/BiLSTM.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,366 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "BiLSTM.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "_EAmAfts104F",
"colab_type": "text"
},
"source": [
"Library"
]
},
{
"cell_type": "code",
"metadata": {
"id": "GfEUUbmIYBZp",
"colab_type": "code",
"colab": {}
},
"source": [
"from pyvi import ViTokenizer, ViPosTagger\n",
"import numpy as np\n",
"import pandas as pd\n",
"import gensim\n",
"import sklearn\n",
"import tensorflow as tf\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.decomposition import TruncatedSVD\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from tensorflow.keras.layers import *\n",
"from keras.layers import *\n",
"from keras.preprocessing.text import Tokenizer\n",
"from keras.preprocessing.sequence import pad_sequences\n",
"from keras.models import Model\n",
"from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional\n",
"from tensorflow.keras.optimizers import Adam"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "8sjoCQj2YEgS",
"colab_type": "text"
},
"source": [
"Chuẩn bị data"
]
},
{
"cell_type": "code",
"metadata": {
"id": "D8V78zDiYB06",
"colab_type": "code",
"colab": {}
},
"source": [
"datanewscontent=pd.read_excel(r'path_to_newscontent.xlsx','Sheet1')\n",
"datacomment=pd.read_excel(r'path_to_comments.xlsx','Sheet1')"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "hATdRopJYGJT",
"colab_type": "code",
"colab": {}
},
"source": [
"datacmt=[]\n",
"for d in datacomment['comment']:\n",
" e=ViTokenizer.tokenize(str(d))\n",
" datacmt.append(e)\n",
"labelcmt=datacomment['label']"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "lDMCLRpGgjy6",
"colab_type": "code",
"colab": {}
},
"source": [
"datanews=[]\n",
"for d in datanewscontent['all_lower']:\n",
" e=ViTokenizer.tokenize(str(d))\n",
" datanews.append(e)\n",
"labelnews=datanewscontent['label']"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "4O9hoYc618vH",
"colab_type": "code",
"colab": {}
},
"source": [
"def truncatedvectors(data,n_components=300):\n",
" svd_ngram = TruncatedSVD(n_components=n_components, random_state=42)\n",
" svd_ngram.fit(data)\n",
" return svd_ngram.transform(data)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "3DkRKt8jgWX6",
"colab_type": "text"
},
"source": [
"Word2vec"
]
},
{
"cell_type": "code",
"metadata": {
"id": "MhjtoeymgXww",
"colab_type": "code",
"colab": {}
},
"source": [
"from gensim.models import Word2Vec,KeyedVectors \n",
"import os\n",
"word2vec_model_path =\"path_to_wikivimodel.bin\"\n",
"model = KeyedVectors.load_word2vec_format(word2vec_model_path,binary=True, unicode_errors='ignore')\n",
"vocab = model.wv.vocab\n",
"wv = model.wv"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "fUY9bAzqgcFL",
"colab_type": "code",
"colab": {}
},
"source": [
"def get_word2vec_data(X):\n",
" word2vec_data = []\n",
" for x in X:\n",
" sentence = []\n",
" for word in x.split(\" \"):\n",
" if word in vocab:\n",
" sentence=sentence+wv[word].ravel().tolist()\n",
" word2vec_data.append(sentence)\n",
"\n",
" return word2vec_data\n",
"def change_to_word2vec(data):\n",
" data2vec=get_word2vec_data(data)\n",
" lengthOfdata=[len(data2vec[i]) for i,n in enumerate(data2vec)]\n",
" for i,n in enumerate(data):\n",
" if(len(data2vec[i])<max(lengthOfdata)):\n",
" for j in range(1,(max(lengthOfdata)-len(data2vec[i]))+1):\n",
" data2vec[i].append(0)\n",
" return truncatedvectors(np.array(data2vec))"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "lYenN_XxgduY",
"colab_type": "code",
"colab": {}
},
"source": [
"X_data_w2v_cmt=change_to_word2vec(datacmt)\n",
"X_data_w2v_news=change_to_word2vec(datanews)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "bzQ6SfQ6gUu2",
"colab_type": "text"
},
"source": [
"TF-IDF"
]
},
{
"cell_type": "code",
"metadata": {
"id": "2nt6QoayYaAC",
"colab_type": "code",
"colab": {}
},
"source": [
"def tfidf(data):\n",
" tfidf_vect_ngram = TfidfVectorizer(analyzer='word', max_features=30000, ngram_range=(1, 2))\n",
" tfidf_vect_ngram.fit(data)\n",
" X_data_tfidf_ngram = tfidf_vect_ngram.transform(data)\n",
" return truncatedvectors(X_data_tfidf_ngram)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "ICtCihzZ2ICZ",
"colab_type": "code",
"colab": {}
},
"source": [
"X_data_tfidf_cmt=tfidf(datacmt)\n",
"X_data_tfidf_news=tfidf(datanews)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "hjd_8vQtgrDJ",
"colab_type": "text"
},
"source": [
"Bag of words"
]
},
{
"cell_type": "code",
"metadata": {
"id": "qUXykOtEgsnc",
"colab_type": "code",
"colab": {}
},
"source": [
"def bow(data):\n",
" tokenizer = Tokenizer()\n",
" tokenizer.fit_on_texts(dataall)\n",
" datacmtbow = tokenizer.texts_to_sequences(data)\n",
" datacmtbow= pad_sequences(datacmtbow, maxlen=300)\n",
" return datacmtbow"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "fsoybw4z2KDG",
"colab_type": "code",
"colab": {}
},
"source": [
"X_data_bow_cmt=bow(datacmt)\n",
"X_data_bow_news=bow(datanews)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "pUPnj8I02MbX",
"colab_type": "text"
},
"source": [
"BiLSTM Model"
]
},
{
"cell_type": "code",
"metadata": {
"id": "-OqFCpLOYgcC",
"colab_type": "code",
"colab": {}
},
"source": [
"#Change: [X_data_tfidf_cmt, X_data_w2v_cmt, X_data_bow_cmt],labelcmt , [X_data_tfidf_news,X_data_bow_news,X_data_w2v_news], labelnews\n",
"X_train, X_val, y_train, y_val = train_test_split(X_data_tfidf_cmt, labelcmt, test_size=0.2, random_state=42)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "hRNGXEsVEe77",
"colab_type": "code",
"colab": {}
},
"source": [
"#learning_rate: [1e-5,5e-5,1e-4,5e-4,1e-3,5e-3]\n",
"opt = Adam(lr=0.001)\n",
"def create_lstm_model():\n",
" input_layer = Input(shape=(300,))\n",
" layer = Reshape((10, 30,))(input_layer)\n",
" layer = Bidirectional(LSTM(128, activation='relu',dropout=0.5, recurrent_dropout=0.5))(layer)\n",
" layer = Dense(64, activation='relu')(layer)\n",
" layer = Dense(32, activation='relu')(layer)\n",
" output_layer = Dense(3, activation='softmax')(layer)\n",
" classifier = Model(input_layer, output_layer)\n",
" classifier.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])\n",
" return classifier"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "_qv7i9NVncbl",
"colab_type": "code",
"colab": {}
},
"source": [
"y_train_lstm_encode = to_categorical(y_train_lstm)\n",
"y_val_lstm_encode=to_categorical(y_val_lstm)\n",
"classifier = create_lstm_model()\n",
"classifier.fit(x_train_lstm, y_train_lstm_encode, validation_data=(x_val_lstm, y_val_lstm_encode), epochs=200, batch_size=32)\n",
"#change different epoch and batch_size"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "9Cbt1elLYiVB",
"colab_type": "code",
"colab": {}
},
"source": [
"y_pred = classifier.predict(x_val_lstm, batch_size=32, verbose=1)\n",
"y_pred_bool = np.argmax(y_pred, axis=1)\n",
"print(classification_report(y_val_lstm, y_pred_bool))"
],
"execution_count": 0,
"outputs": []
}
]
}
Loading

0 comments on commit 7793a8b

Please sign in to comment.