From c248f01c3b5a2236dc93af5e65cbd64179bb0fee Mon Sep 17 00:00:00 2001
From: LeoVS09
Date: Wed, 9 Sep 2020 23:45:53 +0300
Subject: [PATCH] migrate to twitter sentiment dataset

---
 data.dvc                 |  2 +-
 metrics/training.csv     | 11 -------
 src/datasets.py          | 47 +++++++++++++++++----------
 src/libs/__init__.py     |  4 +--
 src/libs/save_metrics.py |  2 +-
 src/model.py             |  7 ++--
 src/normalize.py         | 70 +++++++++++++++++++++-------------------
 src/test.py              |  6 ++--
 src/train.py             | 25 +++++++-------
 9 files changed, 90 insertions(+), 84 deletions(-)
 delete mode 100644 metrics/training.csv

diff --git a/data.dvc b/data.dvc
index c1a7006..94e0dfd 100644
--- a/data.dvc
+++ b/data.dvc
@@ -1,3 +1,3 @@
 outs:
-- md5: 5e1e1637ef6e7039a40a0ee22dd6f7e8.dir
+- md5: 7dafc45044d12a46008aebfb6bf28435.dir
   path: data
diff --git a/metrics/training.csv b/metrics/training.csv
deleted file mode 100644
index 2762950..0000000
--- a/metrics/training.csv
+++ /dev/null
@@ -1,11 +0,0 @@
-epoch,accuracy,loss,val_accuracy,val_loss
-0,0.5713000297546387,0.6759029030799866,0.6633999943733215,0.6271519064903259
-1,0.6543499827384949,0.630271315574646,0.6636000275611877,0.6300450563430786
-2,0.7340499758720398,0.5443506836891174,0.7157999873161316,0.5586917400360107
-3,0.7401000261306763,0.5569710731506348,0.7929999828338623,0.4714183211326599
-4,0.7710999846458435,0.5034257769584656,0.8087999820709229,0.4767420291900635
-5,0.7905499935150146,0.4809580445289612,0.819599986076355,0.42892083525657654
-6,0.7063999772071838,0.5683839321136475,0.7202000021934509,0.5532671809196472
-7,0.7695500254631042,0.5051141381263733,0.7685999870300293,0.4865252375602722
-8,0.7476500272750854,0.5163740515708923,0.7983999848365784,0.4633433520793915
-9,0.7580000162124634,0.5119927525520325,0.7888000011444092,0.45342880487442017
diff --git a/src/datasets.py b/src/datasets.py
index 2324a0c..553c053 100644
--- a/src/datasets.py
+++ b/src/datasets.py
@@ -1,33 +1,46 @@
 import tensorflow_datasets as tfds
 import tensorflow as tf
+from .libs import params
+import pandas as pd
+import os
+
+# Loaded from http://help.sentiment140.com/for-students
+# kaggle copy https://www.kaggle.com/kazanova/sentiment140

-dataset_name = 'imdb_reviews'
 datasets_folder = './data'

-# Will return:
-# 20 000 train data
-# 5 000 validation data
-# 10 000 test data
-# in tuples (string, int)
-# Download dataset if it not exists locally
+train_dataset_path = os.path.join(datasets_folder, 'training.1600000.processed.noemoticon.csv')
+test_dataset_path = os.path.join(datasets_folder, 'testdata.manual.2009.06.14.csv')
+
+LABEL_COLUMN = 'target'
+TEXT_COLUMN = 'text'
+BATCH_SIZE = params['input']['batch_size']
+COLUMNS = ["target", "id", "date", "flag", "user", "text"]
+
+def get_dataset(file_path):
+    df = pd.read_csv(file_path, encoding = "ISO-8859-1", names=COLUMNS)
+
+    df[LABEL_COLUMN] = pd.Categorical(df[LABEL_COLUMN])
+    df[LABEL_COLUMN] = df[LABEL_COLUMN].cat.codes
+
+    labels = df.pop(LABEL_COLUMN)
+    texts = df.pop(TEXT_COLUMN)
+
+    return tf.data.Dataset.from_tensor_slices((texts.values, labels.values))
+
 def download():
-    train_data, validation_data, test_data = tfds.load(
-        name=dataset_name,
-        data_dir=datasets_folder,
-        split=('train[:80%]', 'train[80%:]', 'test'),
-        as_supervised=True
-    )
+    train_dataset = get_dataset(train_dataset_path)
+    test_dataset = get_dataset(test_dataset_path)

-    return train_data, validation_data, test_data
+    return train_dataset, test_dataset

 # Will print dataset sizes
 # Not use it in production,
 # size of dataset can be computed only by transformation to list
-def print_dataset_sizes(train_data, validation_data, test_data):
+def print_dataset_sizes(train_data, test_data):
     print(
         '\nLoaded dataset',
         '\ntrain size:', len(list(train_data)),
-        '\nvalidation sise:', len(list(validation_data)),
         '\ntest size:', len(list(test_data)),
         '\n'
     )
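For reference, below is a minimal, self-contained sketch of what the new get_dataset() does with the sentiment140 CSV. It is not part of the patch: the two sample rows and every value in them are made up, and only the column layout matches the COLUMNS list above. The real loader reads the on-disk file with encoding="ISO-8859-1".

import io

import pandas as pd
import tensorflow as tf

COLUMNS = ["target", "id", "date", "flag", "user", "text"]

# Two made-up rows in the sentiment140 layout: target is 0 (negative) or 4 (positive).
sample_csv = io.StringIO(
    '"0","1","Mon Apr 06 22:19:45 PDT 2009","NO_QUERY","userA","this is awful"\n'
    '"4","2","Mon Apr 06 22:19:49 PDT 2009","NO_QUERY","userB","this is great"\n'
)

df = pd.read_csv(sample_csv, names=COLUMNS)

# pd.Categorical + cat.codes maps the raw 0/4 targets onto dense codes 0/1,
# which is what the binary classifier downstream expects.
df["target"] = pd.Categorical(df["target"]).codes

dataset = tf.data.Dataset.from_tensor_slices((df["text"].values, df["target"].values))

for text, label in dataset:
    print(text.numpy(), int(label.numpy()))  # b'this is awful' 0, then b'this is great' 1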
diff --git a/src/libs/__init__.py b/src/libs/__init__.py
index 3fc652a..a5ce649 100644
--- a/src/libs/__init__.py
+++ b/src/libs/__init__.py
@@ -1,5 +1,5 @@
 from .prepare_tf import prepare
 from .params import params
-from .save_metrict import save_metrict
+from .save_metrics import save_metrics
 from .save_and_restore import save, load
-import .checkpoints as checkpoints
\ No newline at end of file
+from . import checkpoints
\ No newline at end of file
diff --git a/src/libs/save_metrics.py b/src/libs/save_metrics.py
index 461b0dc..2adfaca 100644
--- a/src/libs/save_metrics.py
+++ b/src/libs/save_metrics.py
@@ -1,6 +1,6 @@
 import json

-def save_metrict(model, results, outfile):
+def save_metrics(model, results, outfile):
     metrics = {}

     for name, value in zip(model.metrics_names, results):
diff --git a/src/model.py b/src/model.py
index 1c02234..b55dd4e 100644
--- a/src/model.py
+++ b/src/model.py
@@ -1,9 +1,10 @@
 from tensorflow.keras import Sequential, layers, losses, optimizers

-def build_model(vector_dimensions):
+def build_model(vocab_size):
     model = Sequential([
-        layers.Bidirectional(layers.LSTM(64, return_sequences=True), input_shape=(None, vector_dimensions)),
-        layers.Bidirectional(layers.LSTM(32)),
+        layers.Embedding(vocab_size, 1000),
+        # layers.Bidirectional(layers.LSTM(64, return_sequences=True), input_shape=(None, vector_dimensions)),
+        layers.Bidirectional(layers.LSTM(64)),
         layers.Dense(64, activation='relu'),
         layers.Dropout(0.5),
         # Two dense layer allow make separate predictions about each class
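A quick shape check of the new Embedding -> bidirectional LSTM model. The final Dense(1) layer, the loss and the optimizer below are assumptions (the compile() call sits outside the hunk above), so treat this as an illustrative sketch rather than the project's exact configuration.

import numpy as np
from tensorflow.keras import Sequential, layers, losses, optimizers

vocab_size = 5000  # illustrative; the real value comes from build_encoder()

model = Sequential([
    layers.Embedding(vocab_size, 1000),
    layers.Bidirectional(layers.LSTM(64)),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1),  # assumed final layer: one logit per example
])

# Assumed training configuration -- the real compile() call is not shown in the hunk.
model.compile(
    loss=losses.BinaryCrossentropy(from_logits=True),
    optimizer=optimizers.Adam(1e-4),
    metrics=['accuracy'],
)

# A padded batch of 8 sequences, 20 token ids each (0 acts as padding).
fake_batch = np.random.randint(0, vocab_size, size=(8, 20))
print(model(fake_batch).shape)  # (8, 1)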
diff --git a/src/normalize.py b/src/normalize.py
index 154d1da..0a7ae3a 100644
--- a/src/normalize.py
+++ b/src/normalize.py
@@ -1,55 +1,60 @@
 import spacy
 import tensorflow as tf
+import tensorflow_datasets as tfds
 from .datasets import download

 # Unfortunately Tensorflow doesn't allow save dataset or tensor in easy way,
 # but Spacy process mostly work fast on data processing,
 # so we can direcly load datasets and normalaise data each time

-VECTOR_SIZE = 300
+tokenizer = tfds.features.text.Tokenizer()

-nlp = spacy.load("en_core_web_lg")
+encoder=None

-def extract_sentences(text):
-    doc = nlp(text)
-    return list(doc.sents)
+def build_encoder(labeled_data):
+    global encoder  # populate the module-level encoder used by text_to_vector()
+    vocabulary_set = set()
+    for text_tensor, _ in labeled_data:
+        some_tokens = tokenizer.tokenize(text_tensor.numpy())
+        vocabulary_set.update(some_tokens)

-# text = "Peach emoji is where it has always been. Peach is the superior emoji. It's outranking eggplant 🍑 "
+    encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

-def print_sentencies(text):
-    for sentence in extract_sentences(text):
-        print(sentence)
+    vocab_size = len(vocabulary_set)
+    return vocab_size

-def token_to_vector(token):
-    return token.vector
+def print_text(labeled_data, index = 0):
+    text = next(iter(labeled_data))[index].numpy()
+    print(text)

-# normalise text vector of words (vectors of 300 dimension)
 def text_to_vector(text):
-    doc = nlp(text)
+    return encoder.encode(text)

-    # map all tokens in sentence to his vectors
-    sentence = list(map(token_to_vector, doc))
-    # TODO: filter words which out of vocalabirity
+def encode(text_tensor, label):
+    encoded_text = text_to_vector(text_tensor.numpy())

-    return sentence
-
-def bytes_to_tensor(bytes):
-    text = bytes.numpy().decode("utf-8")
-    vector = text_to_vector(text)
-
-    return tf.constant(vector)
+    return encoded_text, label

 def map_func(bytes, label):
-    [tensor, ] = tf.py_function(bytes_to_tensor, [bytes], [tf.float32])
-    tensor.set_shape([None, VECTOR_SIZE])
-    return tensor, label
+    # py_func doesn't set the shape of the returned tensors.
+    encoded_text, label = tf.py_function(encode, [bytes, label], Tout=[tf.int64, tf.int64])
+
+    # `tf.data.Datasets` work best if all components have a shape set
+    # so set the shapes manually:
+    encoded_text.set_shape([None])
+    label.set_shape([])
+
+    return encoded_text, label

-def normalize_datasets(train, validation, test):
-    norm_train = train.map(map_func)
-    norm_valid = validation.map(map_func)
-    norm_test = test.map(map_func)
-    return (norm_train, norm_valid, norm_test, VECTOR_SIZE)
+def normalize_dataset(dataset):
+    return dataset.map(map_func)

 def datasets():
-    train_data, validation_data, test_data = download()
-    return normalize_datasets(train_data, validation_data, test_data)
\ No newline at end of file
+    train_data, test_data = download()
+
+    vocab_size = build_encoder(train_data)
+
+    train_data = normalize_dataset(train_data)
+    test_data = normalize_dataset(test_data)
+
+    return train_data, test_data, vocab_size
\ No newline at end of file
diff --git a/src/test.py b/src/test.py
index 4ea9177..574c041 100644
--- a/src/test.py
+++ b/src/test.py
@@ -1,6 +1,6 @@
 import tensorflow as tf
 from .normalize import datasets
-from .libs import params, prepare, save_metrict, load
+from .libs import params, prepare, save_metrics, load

 prepare(tf)

@@ -14,7 +14,7 @@

 # For track results better save metrics
 # Load normalised datasets
-training, validation, testing, input_shape = datasets()
+training, testing, vocab_size = datasets()

 model = load()

@@ -25,4 +25,4 @@
 )

 with open(metrics_file, 'w') as outfile:
-    save_metrict(model, results, outfile)
\ No newline at end of file
+    save_metrics(model, results, outfile)
\ No newline at end of file
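A toy run of the encoding pipeline introduced in normalize.py, assuming a tensorflow_datasets release where tfds.features.text is still available (newer releases expose the same classes under tfds.deprecated.text). The sample sentences are invented; only the pattern mirrors build_encoder() and map_func() above.

import tensorflow as tf
import tensorflow_datasets as tfds

tokenizer = tfds.features.text.Tokenizer()

texts = ["good movie, loved it", "worst film ever"]
labels = tf.constant([1, 0], dtype=tf.int64)
labeled_data = tf.data.Dataset.from_tensor_slices((texts, labels))

# Same idea as build_encoder(): collect every token, then build an integer encoder.
vocabulary_set = set()
for text_tensor, _ in labeled_data:
    vocabulary_set.update(tokenizer.tokenize(text_tensor.numpy()))
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

def encode(text_tensor, label):
    # Runs eagerly inside tf.py_function, so .numpy() is available here.
    return encoder.encode(text_tensor.numpy()), label

def map_func(text, label):
    # tf.py_function lets the Python-only encoder run inside the tf.data pipeline,
    # at the cost of losing static shapes, which are restored below.
    encoded, label = tf.py_function(encode, [text, label], Tout=[tf.int64, tf.int64])
    encoded.set_shape([None])
    label.set_shape([])
    return encoded, label

for tokens, label in labeled_data.map(map_func):
    print(tokens.numpy(), int(label.numpy()))  # e.g. [5 2 7 1] 1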
diff --git a/src/train.py b/src/train.py
index 652158c..56fb658 100644
--- a/src/train.py
+++ b/src/train.py
@@ -13,28 +13,27 @@
 metrics_file='metrics/training.csv'

 # Load normalised datasets
-training, validation, testing, input_shape = datasets()
+training, testing, vocab_size = datasets()
 # Dataset data is array of tensors
 # if symplify array of tuples: (text: string, label: int)
-# where 0 mean bad, and 1 mean good,
-# text normalised to input_shape dimension embeed vector
+# where 0 means bad and 1 means good

 # Build neural network model
-model = build_model(input_shape)
+model = build_model(vocab_size=vocab_size)

 train_batches = training.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)
-validation_batches = validation.padded_batch(BATCH_SIZE)
+validation_batches = testing.padded_batch(BATCH_SIZE)

 # Train network
 model.fit(
-        train_batches,
-        epochs=EPOCHS,
-        validation_data=validation_batches,
-        callbacks=[
-            checkpoints.save_weights(),
-            CSVLogger(metrics_file)
-        ]
-    )
+    train_batches,
+    epochs=EPOCHS,
+    validation_data=validation_batches,
+    callbacks=[
+        checkpoints.save_weights(),
+        CSVLogger(metrics_file)
+    ]
+)

 # Save for restore in next time
 save(model)
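Finally, a small demonstration of why padded_batch() is applied before model.fit(): encoded tweets have different lengths, and each batch is padded to the length of its longest element. The example sequences and the BUFFER_SIZE/BATCH_SIZE values are illustrative; calling padded_batch() without padded_shapes requires TensorFlow 2.2 or newer, otherwise pass padded_shapes=([None], []) explicitly.

import tensorflow as tf

BUFFER_SIZE = 4
BATCH_SIZE = 2

# Variable-length "encoded tweets" (token id sequences) with binary labels.
examples = [([3, 1, 4, 1, 5], 1), ([9, 2], 0), ([6, 5, 3, 5], 1), ([8], 0)]

def gen():
    for tokens, label in examples:
        yield tokens, label

dataset = tf.data.Dataset.from_generator(
    gen,
    output_types=(tf.int64, tf.int64),
    output_shapes=([None], []),
)

# On TF >= 2.2 padded_shapes is inferred; on older versions pass
# padded_shapes=([None], []) explicitly.
batches = dataset.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)

for texts, labels in batches:
    print(texts.shape, labels.shape)  # e.g. (2, 5) (2,) -- shorter rows are zero-padded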