-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
754d88a
commit 7793a8b
Showing
9 changed files
with
1,592 additions
and
0 deletions.
There are no files selected for viewing
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,366 @@ | ||
{ | ||
"nbformat": 4, | ||
"nbformat_minor": 0, | ||
"metadata": { | ||
"colab": { | ||
"name": "BiLSTM.ipynb", | ||
"provenance": [], | ||
"collapsed_sections": [] | ||
}, | ||
"kernelspec": { | ||
"name": "python3", | ||
"display_name": "Python 3" | ||
}, | ||
"accelerator": "GPU" | ||
}, | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "_EAmAfts104F", | ||
"colab_type": "text" | ||
}, | ||
"source": [ | ||
"Library" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "GfEUUbmIYBZp", | ||
"colab_type": "code", | ||
"colab": {} | ||
}, | ||
"source": [ | ||
"from pyvi import ViTokenizer, ViPosTagger\n", | ||
"import numpy as np\n", | ||
"import pandas as pd\n", | ||
"import gensim\n", | ||
"import sklearn\n", | ||
"import tensorflow as tf\n", | ||
"from sklearn.model_selection import train_test_split\n", | ||
"from sklearn.decomposition import TruncatedSVD\n", | ||
"from sklearn.metrics import classification_report\n", | ||
"from sklearn.feature_extraction.text import TfidfVectorizer\n", | ||
"from tensorflow.keras.layers import *\n", | ||
"from tensorflow.keras.utils import to_categorical\n", | ||
"from tensorflow.keras.preprocessing.text import Tokenizer\n", | ||
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n", | ||
"from tensorflow.keras.models import Model\n", | ||
"from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional\n", | ||
"from tensorflow.keras.optimizers import Adam" | ||
], | ||
"execution_count": 0, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "8sjoCQj2YEgS", | ||
"colab_type": "text" | ||
}, | ||
"source": [ | ||
"Chuẩn bị data" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "D8V78zDiYB06", | ||
"colab_type": "code", | ||
"colab": {} | ||
}, | ||
"source": [ | ||
"datanewscontent=pd.read_excel(r'path_to_newscontent.xlsx','Sheet1')\n", | ||
"datacomment=pd.read_excel(r'path_to_comments.xlsx','Sheet1')" | ||
], | ||
"execution_count": 0, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "hATdRopJYGJT", | ||
"colab_type": "code", | ||
"colab": {} | ||
}, | ||
"source": [ | ||
"datacmt=[]\n", | ||
"for d in datacomment['comment']:\n", | ||
"    e=ViTokenizer.tokenize(str(d))\n", | ||
"    datacmt.append(e)\n", | ||
"labelcmt=datacomment['label']" | ||
], | ||
"execution_count": 0, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "lDMCLRpGgjy6", | ||
"colab_type": "code", | ||
"colab": {} | ||
}, | ||
"source": [ | ||
"datanews=[]\n", | ||
"for d in datanewscontent['all_lower']:\n", | ||
"    e=ViTokenizer.tokenize(str(d))\n", | ||
"    datanews.append(e)\n", | ||
"labelnews=datanewscontent['label']" | ||
], | ||
"execution_count": 0, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "4O9hoYc618vH", | ||
"colab_type": "code", | ||
"colab": {} | ||
}, | ||
"source": [ | ||
"def truncatedvectors(data,n_components=300):\n", | ||
"    svd_ngram = TruncatedSVD(n_components=n_components, random_state=42)\n", | ||
"    svd_ngram.fit(data)\n", | ||
"    return svd_ngram.transform(data)" | ||
], | ||
"execution_count": 0, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "3DkRKt8jgWX6", | ||
"colab_type": "text" | ||
}, | ||
"source": [ | ||
"Word2vec" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "MhjtoeymgXww", | ||
"colab_type": "code", | ||
"colab": {} | ||
}, | ||
"source": [ | ||
"from gensim.models import Word2Vec,KeyedVectors \n", | ||
"import os\n", | ||
"word2vec_model_path =\"path_to_wikivimodel.bin\"\n", | ||
"model = KeyedVectors.load_word2vec_format(word2vec_model_path,binary=True, unicode_errors='ignore')\n", | ||
"vocab = model.wv.vocab\n", | ||
"wv = model.wv" | ||
], | ||
"execution_count": 0, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "fUY9bAzqgcFL", | ||
"colab_type": "code", | ||
"colab": {} | ||
}, | ||
"source": [ | ||
"def get_word2vec_data(X):\n", | ||
"    word2vec_data = []\n", | ||
"    for x in X:\n", | ||
"        sentence = []\n", | ||
"        for word in x.split(\" \"):\n", | ||
"            if word in vocab:\n", | ||
"                sentence=sentence+wv[word].ravel().tolist()\n", | ||
"        word2vec_data.append(sentence)\n", | ||
"\n", | ||
"    return word2vec_data\n", | ||
"def change_to_word2vec(data):\n", | ||
"    data2vec=get_word2vec_data(data)\n", | ||
"    lengthOfdata=[len(data2vec[i]) for i,n in enumerate(data2vec)]\n", | ||
"    for i,n in enumerate(data):\n", | ||
"        if(len(data2vec[i])<max(lengthOfdata)):\n", | ||
"            for j in range(1,(max(lengthOfdata)-len(data2vec[i]))+1):\n", | ||
"                data2vec[i].append(0)\n", | ||
"    return truncatedvectors(np.array(data2vec))" | ||
], | ||
"execution_count": 0, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "lYenN_XxgduY", | ||
"colab_type": "code", | ||
"colab": {} | ||
}, | ||
"source": [ | ||
"X_data_w2v_cmt=change_to_word2vec(datacmt)\n", | ||
"X_data_w2v_news=change_to_word2vec(datanews)" | ||
], | ||
"execution_count": 0, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "bzQ6SfQ6gUu2", | ||
"colab_type": "text" | ||
}, | ||
"source": [ | ||
"TF-IDF" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "2nt6QoayYaAC", | ||
"colab_type": "code", | ||
"colab": {} | ||
}, | ||
"source": [ | ||
"def tfidf(data):\n", | ||
"    tfidf_vect_ngram = TfidfVectorizer(analyzer='word', max_features=30000, ngram_range=(1, 2))\n", | ||
"    tfidf_vect_ngram.fit(data)\n", | ||
"    X_data_tfidf_ngram = tfidf_vect_ngram.transform(data)\n", | ||
"    return truncatedvectors(X_data_tfidf_ngram)" | ||
], | ||
"execution_count": 0, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "ICtCihzZ2ICZ", | ||
"colab_type": "code", | ||
"colab": {} | ||
}, | ||
"source": [ | ||
"X_data_tfidf_cmt=tfidf(datacmt)\n", | ||
"X_data_tfidf_news=tfidf(datanews)" | ||
], | ||
"execution_count": 0, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "hjd_8vQtgrDJ", | ||
"colab_type": "text" | ||
}, | ||
"source": [ | ||
"Bag of words" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "qUXykOtEgsnc", | ||
"colab_type": "code", | ||
"colab": {} | ||
}, | ||
"source": [ | ||
"def bow(data):\n", | ||
"    tokenizer = Tokenizer()\n", | ||
"    tokenizer.fit_on_texts(data)\n", | ||
"    datacmtbow = tokenizer.texts_to_sequences(data)\n", | ||
"    datacmtbow= pad_sequences(datacmtbow, maxlen=300)\n", | ||
"    return datacmtbow" | ||
], | ||
"execution_count": 0, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "fsoybw4z2KDG", | ||
"colab_type": "code", | ||
"colab": {} | ||
}, | ||
"source": [ | ||
"X_data_bow_cmt=bow(datacmt)\n", | ||
"X_data_bow_news=bow(datanews)" | ||
], | ||
"execution_count": 0, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "pUPnj8I02MbX", | ||
"colab_type": "text" | ||
}, | ||
"source": [ | ||
"BiLSTM Model" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "-OqFCpLOYgcC", | ||
"colab_type": "code", | ||
"colab": {} | ||
}, | ||
"source": [ | ||
"#Change: [X_data_tfidf_cmt, X_data_w2v_cmt, X_data_bow_cmt],labelcmt , [X_data_tfidf_news,X_data_bow_news,X_data_w2v_news], labelnews\n", | ||
"X_train, X_val, y_train, y_val = train_test_split(X_data_tfidf_cmt, labelcmt, test_size=0.2, random_state=42)" | ||
], | ||
"execution_count": 0, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "hRNGXEsVEe77", | ||
"colab_type": "code", | ||
"colab": {} | ||
}, | ||
"source": [ | ||
"#learning_rate: [1e-5,5e-5,1e-4,5e-4,1e-3,5e-3]\n", | ||
"opt = Adam(learning_rate=0.001)\n", | ||
"def create_lstm_model():\n", | ||
"    input_layer = Input(shape=(300,))\n", | ||
"    layer = Reshape((10, 30,))(input_layer)\n", | ||
"    layer = Bidirectional(LSTM(128, activation='relu',dropout=0.5, recurrent_dropout=0.5))(layer)\n", | ||
"    layer = Dense(64, activation='relu')(layer)\n", | ||
"    layer = Dense(32, activation='relu')(layer)\n", | ||
"    output_layer = Dense(3, activation='softmax')(layer)\n", | ||
"    classifier = Model(input_layer, output_layer)\n", | ||
"    classifier.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])\n", | ||
"    return classifier" | ||
], | ||
"execution_count": 0, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "_qv7i9NVncbl", | ||
"colab_type": "code", | ||
"colab": {} | ||
}, | ||
"source": [ | ||
"y_train_lstm_encode = to_categorical(y_train)\n", | ||
"y_val_lstm_encode=to_categorical(y_val)\n", | ||
"classifier = create_lstm_model()\n", | ||
"classifier.fit(X_train, y_train_lstm_encode, validation_data=(X_val, y_val_lstm_encode), epochs=200, batch_size=32)\n", | ||
"#change different epoch and batch_size" | ||
], | ||
"execution_count": 0, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"metadata": { | ||
"id": "9Cbt1elLYiVB", | ||
"colab_type": "code", | ||
"colab": {} | ||
}, | ||
"source": [ | ||
"y_pred = classifier.predict(X_val, batch_size=32, verbose=1)\n", | ||
"y_pred_bool = np.argmax(y_pred, axis=1)\n", | ||
"print(classification_report(y_val, y_pred_bool))" | ||
], | ||
"execution_count": 0, | ||
"outputs": [] | ||
} | ||
] | ||
} |
Oops, something went wrong.