From c248f01c3b5a2236dc93af5e65cbd64179bb0fee Mon Sep 17 00:00:00 2001
From: LeoVS09
Date: Wed, 9 Sep 2020 23:45:53 +0300
Subject: [PATCH] migrate to twitter sentiment dataset

---
 data.dvc                 |  2 +-
 metrics/training.csv     | 11 -------
 src/datasets.py          | 47 +++++++++++++++++----------
 src/libs/__init__.py     |  4 +--
 src/libs/save_metrics.py |  2 +-
 src/model.py             |  7 ++--
 src/normalize.py         | 70 +++++++++++++++++++++-------------------
 src/test.py              |  6 ++--
 src/train.py             | 25 +++++++-------
 9 files changed, 90 insertions(+), 84 deletions(-)
 delete mode 100644 metrics/training.csv

diff --git a/data.dvc b/data.dvc
index c1a7006..94e0dfd 100644
--- a/data.dvc
+++ b/data.dvc
@@ -1,3 +1,3 @@
 outs:
-- md5: 5e1e1637ef6e7039a40a0ee22dd6f7e8.dir
+- md5: 7dafc45044d12a46008aebfb6bf28435.dir
   path: data
diff --git a/metrics/training.csv b/metrics/training.csv
deleted file mode 100644
index 2762950..0000000
--- a/metrics/training.csv
+++ /dev/null
@@ -1,11 +0,0 @@
-epoch,accuracy,loss,val_accuracy,val_loss
-0,0.5713000297546387,0.6759029030799866,0.6633999943733215,0.6271519064903259
-1,0.6543499827384949,0.630271315574646,0.6636000275611877,0.6300450563430786
-2,0.7340499758720398,0.5443506836891174,0.7157999873161316,0.5586917400360107
-3,0.7401000261306763,0.5569710731506348,0.7929999828338623,0.4714183211326599
-4,0.7710999846458435,0.5034257769584656,0.8087999820709229,0.4767420291900635
-5,0.7905499935150146,0.4809580445289612,0.819599986076355,0.42892083525657654
-6,0.7063999772071838,0.5683839321136475,0.7202000021934509,0.5532671809196472
-7,0.7695500254631042,0.5051141381263733,0.7685999870300293,0.4865252375602722
-8,0.7476500272750854,0.5163740515708923,0.7983999848365784,0.4633433520793915
-9,0.7580000162124634,0.5119927525520325,0.7888000011444092,0.45342880487442017
diff --git a/src/datasets.py b/src/datasets.py
index 2324a0c..553c053 100644
--- a/src/datasets.py
+++ b/src/datasets.py
@@ -1,33 +1,46 @@
 import tensorflow_datasets as tfds
 import tensorflow as tf
+from .libs import params
+import pandas as pd
+import os
+
+# Loaded from http://help.sentiment140.com/for-students
+# kaggle copy https://www.kaggle.com/kazanova/sentiment140

-dataset_name = 'imdb_reviews'
 datasets_folder = './data'

-# Will return:
-# 20 000 train data
-# 5 000 validation data
-# 10 000 test data
-# in tuples (string, int)
-# Download dataset if it not exists locally
+train_dataset_path = os.path.join(datasets_folder, 'training.1600000.processed.noemoticon.csv')
+test_dataset_path = os.path.join(datasets_folder, 'testdata.manual.2009.06.14.csv')
+
+LABEL_COLUMN = 'target'
+TEXT_COLUMN = 'text'
+BATCH_SIZE = params['input']['batch_size']
+COLUMNS = ["target", "id", "date", "flag", "user", "text"]
+
+def get_dataset(file_path):
+    df = pd.read_csv(file_path, encoding = "ISO-8859-1", names=COLUMNS)
+
+    df[LABEL_COLUMN] = pd.Categorical(df[LABEL_COLUMN])
+    df[LABEL_COLUMN] = df[LABEL_COLUMN].cat.codes
+
+    labels = df.pop(LABEL_COLUMN)
+    texts = df.pop(TEXT_COLUMN)
+
+    return tf.data.Dataset.from_tensor_slices((texts.values, labels.values))
+
 def download():
-    train_data, validation_data, test_data = tfds.load(
-        name=dataset_name,
-        data_dir=datasets_folder,
-        split=('train[:80%]', 'train[80%:]', 'test'),
-        as_supervised=True
-    )
+    train_dataset = get_dataset(train_dataset_path)
+    test_dataset = get_dataset(test_dataset_path)

-    return train_data, validation_data, test_data
+    return train_dataset, test_dataset

 # Will print dataset sizes
 # Not use it in production,
 # size of dataset can be computed only by transformation to list
-def print_dataset_sizes(train_data, validation_data, test_data):
+def print_dataset_sizes(train_data, test_data):
     print(
         '\nLoaded dataset',
         '\ntrain size:', len(list(train_data)),
-        '\nvalidation sise:', len(list(validation_data)),
         '\ntest size:', len(list(test_data)),
         '\n'
     )
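For reference, below is a minimal, self-contained sketch of what the new get_dataset() does with the sentiment140 CSV. It is not part of the patch: the two sample rows and every value in them are made up, and only the column layout matches the COLUMNS list above. The real loader reads the on-disk file with encoding="ISO-8859-1".

import io

import pandas as pd
import tensorflow as tf

COLUMNS = ["target", "id", "date", "flag", "user", "text"]

# Two made-up rows in the sentiment140 layout: target is 0 (negative) or 4 (positive).
sample_csv = io.StringIO(
    '"0","1","Mon Apr 06 22:19:45 PDT 2009","NO_QUERY","userA","this is awful"\n'
    '"4","2","Mon Apr 06 22:19:49 PDT 2009","NO_QUERY","userB","this is great"\n'
)

df = pd.read_csv(sample_csv, names=COLUMNS)

# pd.Categorical + cat.codes maps the raw 0/4 targets onto dense codes 0/1,
# which is what the binary classifier downstream expects.
df["target"] = pd.Categorical(df["target"]).codes

dataset = tf.data.Dataset.from_tensor_slices((df["text"].values, df["target"].values))

for text, label in dataset:
    print(text.numpy(), int(label.numpy()))  # b'this is awful' 0, then b'this is great' 1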
diff --git a/src/libs/__init__.py b/src/libs/__init__.py
index 3fc652a..a5ce649 100644
--- a/src/libs/__init__.py
+++ b/src/libs/__init__.py
@@ -1,5 +1,5 @@
 from .prepare_tf import prepare
 from .params import params
-from .save_metrict import save_metrict
+from .save_metrics import save_metrics
 from .save_and_restore import save, load
-import .checkpoints as checkpoints
\ No newline at end of file
+from . import checkpoints
\ No newline at end of file
diff --git a/src/libs/save_metrics.py b/src/libs/save_metrics.py
index 461b0dc..2adfaca 100644
--- a/src/libs/save_metrics.py
+++ b/src/libs/save_metrics.py
@@ -1,6 +1,6 @@
 import json

-def save_metrict(model, results, outfile):
+def save_metrics(model, results, outfile):
     metrics = {}

     for name, value in zip(model.metrics_names, results):
diff --git a/src/model.py b/src/model.py
index 1c02234..b55dd4e 100644
--- a/src/model.py
+++ b/src/model.py
@@ -1,9 +1,10 @@
 from tensorflow.keras import Sequential, layers, losses, optimizers

-def build_model(vector_dimensions):
+def build_model(vocab_size):
     model = Sequential([
-        layers.Bidirectional(layers.LSTM(64, return_sequences=True), input_shape=(None, vector_dimensions)),
-        layers.Bidirectional(layers.LSTM(32)),
+        layers.Embedding(vocab_size, 1000),
+        # layers.Bidirectional(layers.LSTM(64, return_sequences=True), input_shape=(None, vector_dimensions)),
+        layers.Bidirectional(layers.LSTM(64)),
         layers.Dense(64, activation='relu'),
         layers.Dropout(0.5),
         # Two dense layer allow make separate predictions about each class
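A quick shape check of the new Embedding -> bidirectional LSTM model. The final Dense(1) layer, the loss and the optimizer below are assumptions (the compile() call sits outside the hunk above), so treat this as an illustrative sketch rather than the project's exact configuration.

import numpy as np
from tensorflow.keras import Sequential, layers, losses, optimizers

vocab_size = 5000  # illustrative; the real value comes from build_encoder()

model = Sequential([
    layers.Embedding(vocab_size, 1000),
    layers.Bidirectional(layers.LSTM(64)),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1),  # assumed final layer: one logit per example
])

# Assumed training configuration -- the real compile() call is not shown in the hunk.
model.compile(
    loss=losses.BinaryCrossentropy(from_logits=True),
    optimizer=optimizers.Adam(1e-4),
    metrics=['accuracy'],
)

# A padded batch of 8 sequences, 20 token ids each (0 acts as padding).
fake_batch = np.random.randint(0, vocab_size, size=(8, 20))
print(model(fake_batch).shape)  # (8, 1)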
diff --git a/src/normalize.py b/src/normalize.py
index 154d1da..0a7ae3a 100644
--- a/src/normalize.py
+++ b/src/normalize.py
@@ -1,55 +1,60 @@
 import spacy
 import tensorflow as tf
+import tensorflow_datasets as tfds
 from .datasets import download

 # Unfortunately Tensorflow doesn't allow save dataset or tensor in easy way,
 # but Spacy process mostly work fast on data processing,
 # so we can direcly load datasets and normalaise data each time

-VECTOR_SIZE = 300
+tokenizer = tfds.features.text.Tokenizer()

-nlp = spacy.load("en_core_web_lg")
+encoder=None

-def extract_sentences(text):
-    doc = nlp(text)
-    return list(doc.sents)
+def build_encoder(labeled_data):
+    global encoder  # populate the module-level encoder used by text_to_vector()
+    vocabulary_set = set()
+    for text_tensor, _ in labeled_data:
+        some_tokens = tokenizer.tokenize(text_tensor.numpy())
+        vocabulary_set.update(some_tokens)

-# text = "Peach emoji is where it has always been. Peach is the superior emoji. It's outranking eggplant 🍑 "
+    encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

-def print_sentencies(text):
-    for sentence in extract_sentences(text):
-        print(sentence)
+    vocab_size = len(vocabulary_set)
+    return vocab_size

-def token_to_vector(token):
-    return token.vector
+def print_text(labeled_data, index = 0):
+    text = next(iter(labeled_data))[index].numpy()
+    print(text)

-# normalise text vector of words (vectors of 300 dimension)
 def text_to_vector(text):
-    doc = nlp(text)
+    return encoder.encode(text)

-    # map all tokens in sentence to his vectors
-    sentence = list(map(token_to_vector, doc))
-    # TODO: filter words which out of vocalabirity
+def encode(text_tensor, label):
+    encoded_text = text_to_vector(text_tensor.numpy())

-    return sentence
-
-def bytes_to_tensor(bytes):
-    text = bytes.numpy().decode("utf-8")
-    vector = text_to_vector(text)
-
-    return tf.constant(vector)
+    return encoded_text, label

 def map_func(bytes, label):
-    [tensor, ] = tf.py_function(bytes_to_tensor, [bytes], [tf.float32])
-    tensor.set_shape([None, VECTOR_SIZE])
-    return tensor, label
+    # py_func doesn't set the shape of the returned tensors.
+    encoded_text, label = tf.py_function(encode, [bytes, label], Tout=[tf.int64, tf.int64])
+
+    # `tf.data.Datasets` work best if all components have a shape set
+    # so set the shapes manually:
+    encoded_text.set_shape([None])
+    label.set_shape([])
+
+    return encoded_text, label

-def normalize_datasets(train, validation, test):
-    norm_train = train.map(map_func)
-    norm_valid = validation.map(map_func)
-    norm_test = test.map(map_func)
-    return (norm_train, norm_valid, norm_test, VECTOR_SIZE)
+def normalize_dataset(dataset):
+    return dataset.map(map_func)

 def datasets():
-    train_data, validation_data, test_data = download()
-    return normalize_datasets(train_data, validation_data, test_data)
\ No newline at end of file
+    train_data, test_data = download()
+
+    vocab_size = build_encoder(train_data)
+
+    train_data = normalize_dataset(train_data)
+    test_data = normalize_dataset(test_data)
+
+    return train_data, test_data, vocab_size
\ No newline at end of file
diff --git a/src/test.py b/src/test.py
index 4ea9177..574c041 100644
--- a/src/test.py
+++ b/src/test.py
@@ -1,6 +1,6 @@
 import tensorflow as tf
 from .normalize import datasets
-from .libs import params, prepare, save_metrict, load
+from .libs import params, prepare, save_metrics, load

 prepare(tf)

@@ -14,7 +14,7 @@

 # For track results better save metrics
 # Load normalised datasets
-training, validation, testing, input_shape = datasets()
+training, testing, vocab_size = datasets()

 model = load()

@@ -25,4 +25,4 @@
 )

 with open(metrics_file, 'w') as outfile:
-    save_metrict(model, results, outfile)
\ No newline at end of file
+    save_metrics(model, results, outfile)
\ No newline at end of file
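A toy run of the encoding pipeline introduced in normalize.py, assuming a tensorflow_datasets release where tfds.features.text is still available (newer releases expose the same classes under tfds.deprecated.text). The sample sentences are invented; only the pattern mirrors build_encoder() and map_func() above.

import tensorflow as tf
import tensorflow_datasets as tfds

tokenizer = tfds.features.text.Tokenizer()

texts = ["good movie, loved it", "worst film ever"]
labels = tf.constant([1, 0], dtype=tf.int64)
labeled_data = tf.data.Dataset.from_tensor_slices((texts, labels))

# Same idea as build_encoder(): collect every token, then build an integer encoder.
vocabulary_set = set()
for text_tensor, _ in labeled_data:
    vocabulary_set.update(tokenizer.tokenize(text_tensor.numpy()))
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

def encode(text_tensor, label):
    # Runs eagerly inside tf.py_function, so .numpy() is available here.
    return encoder.encode(text_tensor.numpy()), label

def map_func(text, label):
    # tf.py_function lets the Python-only encoder run inside the tf.data pipeline,
    # at the cost of losing static shapes, which are restored below.
    encoded, label = tf.py_function(encode, [text, label], Tout=[tf.int64, tf.int64])
    encoded.set_shape([None])
    label.set_shape([])
    return encoded, label

for tokens, label in labeled_data.map(map_func):
    print(tokens.numpy(), int(label.numpy()))  # e.g. [5 2 7 1] 1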
diff --git a/src/train.py b/src/train.py
index 652158c..56fb658 100644
--- a/src/train.py
+++ b/src/train.py
@@ -13,28 +13,27 @@
 metrics_file='metrics/training.csv'

 # Load normalised datasets
-training, validation, testing, input_shape = datasets()
+training, testing, vocab_size = datasets()
 # Dataset data is array of tensors
 # if symplify array of tuples: (text: string, label: int)
-# where 0 mean bad, and 1 mean good,
-# text normalised to input_shape dimension embeed vector
+# where 0 means bad and 1 means good

 # Build neural network model
-model = build_model(input_shape)
+model = build_model(vocab_size=vocab_size)

 train_batches = training.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)
-validation_batches = validation.padded_batch(BATCH_SIZE)
+validation_batches = testing.padded_batch(BATCH_SIZE)

 # Train network
 model.fit(
-        train_batches,
-        epochs=EPOCHS,
-        validation_data=validation_batches,
-        callbacks=[
-            checkpoints.save_weights(),
-            CSVLogger(metrics_file)
-        ]
-    )
+    train_batches,
+    epochs=EPOCHS,
+    validation_data=validation_batches,
+    callbacks=[
+        checkpoints.save_weights(),
+        CSVLogger(metrics_file)
+    ]
+)

 # Save for restore in next time
 save(model)
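Finally, a small demonstration of why padded_batch() is applied before model.fit(): encoded tweets have different lengths, and each batch is padded to the length of its longest element. The example sequences and the BUFFER_SIZE/BATCH_SIZE values are illustrative; calling padded_batch() without padded_shapes requires TensorFlow 2.2 or newer, otherwise pass padded_shapes=([None], []) explicitly.

import tensorflow as tf

BUFFER_SIZE = 4
BATCH_SIZE = 2

# Variable-length "encoded tweets" (token id sequences) with binary labels.
examples = [([3, 1, 4, 1, 5], 1), ([9, 2], 0), ([6, 5, 3, 5], 1), ([8], 0)]

def gen():
    for tokens, label in examples:
        yield tokens, label

dataset = tf.data.Dataset.from_generator(
    gen,
    output_types=(tf.int64, tf.int64),
    output_shapes=([None], []),
)

# On TF >= 2.2 padded_shapes is inferred; on older versions pass
# padded_shapes=([None], []) explicitly.
batches = dataset.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)

for texts, labels in batches:
    print(texts.shape, labels.shape)  # e.g. (2, 5) (2,) -- shorter rows are zero-padded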