migrate to twitter sentiment dataset
LeoVS09 committed Sep 9, 2020
1 parent ba9942f commit c248f01
Showing 9 changed files with 90 additions and 84 deletions.
2 changes: 1 addition & 1 deletion data.dvc
@@ -1,3 +1,3 @@
outs:
- md5: 5e1e1637ef6e7039a40a0ee22dd6f7e8.dir
- md5: 7dafc45044d12a46008aebfb6bf28435.dir
path: data
11 changes: 0 additions & 11 deletions metrics/training.csv

This file was deleted.

47 changes: 30 additions & 17 deletions src/datasets.py
@@ -1,33 +1,46 @@
import tensorflow_datasets as tfds
import tensorflow as tf
from .libs import params
import pandas as pd
import os

# Loaded from http://help.sentiment140.com/for-students
# kaggle copy https://www.kaggle.com/kazanova/sentiment140

dataset_name = 'imdb_reviews'
datasets_folder = './data'

# Will return:
# 20 000 training examples
# 5 000 validation examples
# 10 000 test examples
# as (string, int) tuples
# Download the dataset if it does not exist locally
train_dataset_path = os.path.join(datasets_folder, 'training.1600000.processed.noemoticon.csv')
test_dataset_path = os.path.join(datasets_folder, 'testdata.manual.2009.06.14.csv')

LABEL_COLUMN = 'target'
TEXT_COLUMN = 'text'
BATCH_SIZE = params['input']['batch_size']
COLUMNS = ["target", "id", "date", "flag", "user", "text"]

def get_dataset(file_path):
df = pd.read_csv(file_path, encoding = "ISO-8859-1", names=COLUMNS)

df[LABEL_COLUMN] = pd.Categorical(df[LABEL_COLUMN])
df[LABEL_COLUMN] = df[LABEL_COLUMN].cat.codes

labels = df.pop(LABEL_COLUMN)
texts = df.pop(TEXT_COLUMN)

return tf.data.Dataset.from_tensor_slices((texts.values, labels.values))

def download():
train_data, validation_data, test_data = tfds.load(
name=dataset_name,
data_dir=datasets_folder,
split=('train[:80%]', 'train[80%:]', 'test'),
as_supervised=True
)
train_dataset = get_dataset(train_dataset_path)
test_dataset = get_dataset(test_dataset_path)

return train_data, validation_data, test_data
return train_dataset, test_dataset

# Will print dataset sizes.
# Do not use it in production:
# the size of a dataset can only be computed by converting it to a list
def print_dataset_sizes(train_data, validation_data, test_data):
def print_dataset_sizes(train_data, test_data):
print(
'\nLoaded dataset',
'\ntrain size:', len(list(train_data)),
'\nvalidation size:', len(list(validation_data)),
'\ntrain size:', len(list(train_data)),
'\ntest size:', len(list(test_data)), '\n'
)

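For reference, a minimal sketch of how the new pandas-based Sentiment140 loading is meant to work, condensed from the loader above; the take(1) inspection loop at the end is illustrative only and not part of the repository:

import pandas as pd
import tensorflow as tf

COLUMNS = ["target", "id", "date", "flag", "user", "text"]

def get_dataset(file_path):
    # Sentiment140 ships as a headerless, latin-1 encoded CSV
    df = pd.read_csv(file_path, encoding="ISO-8859-1", names=COLUMNS)
    # map the raw sentiment labels (0 = negative, 4 = positive) to contiguous codes
    labels = pd.Categorical(df.pop("target")).codes
    texts = df.pop("text")
    return tf.data.Dataset.from_tensor_slices((texts.values, labels))

train = get_dataset("./data/training.1600000.processed.noemoticon.csv")
for text, label in train.take(1):
    print(text.numpy(), label.numpy())
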
4 changes: 2 additions & 2 deletions src/libs/__init__.py
@@ -1,5 +1,5 @@
from .prepare_tf import prepare
from .params import params
from .save_metrict import save_metrict
from .save_metrics import save_metrics
from .save_and_restore import save, load
import .checkpoints as checkpoints
from . import checkpoints
2 changes: 1 addition & 1 deletion src/libs/save_metrics.py
@@ -1,6 +1,6 @@
import json

def save_metrict(model, results, outfile):
def save_metrics(model, results, outfile):
metrics = {}

for name, value in zip(model.metrics_names, results):
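The rest of save_metrics is collapsed in the diff above, so here is a hedged sketch of the full helper; only the dict-building loop is from the commit, the json.dump ending is an assumption, and the usage comment mirrors src/test.py:

import json

def save_metrics(model, results, outfile):
    # pair each metric name reported by Keras with its value from model.evaluate()
    metrics = {}
    for name, value in zip(model.metrics_names, results):
        metrics[name] = float(value)

    # assumed ending: write the metrics as JSON to the given file object
    json.dump(metrics, outfile)

# usage, as in src/test.py:
#   results = model.evaluate(test_batches)
#   with open(metrics_file, 'w') as outfile:
#       save_metrics(model, results, outfile)
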
7 changes: 4 additions & 3 deletions src/model.py
@@ -1,9 +1,10 @@
from tensorflow.keras import Sequential, layers, losses, optimizers

def build_model(vector_dimensions):
def build_model(vocab_size):
model = Sequential([
layers.Bidirectional(layers.LSTM(64, return_sequences=True), input_shape=(None, vector_dimensions)),
layers.Bidirectional(layers.LSTM(32)),
layers.Embedding(vocab_size, 1000),
# layers.Bidirectional(layers.LSTM(64, return_sequences=True), input_shape=(None, vector_dimensions)),
layers.Bidirectional(layers.LSTM(64)),
layers.Dense(64, activation='relu'),
layers.Dropout(0.5),
# Two dense layers allow making separate predictions for each class
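Since the tail of build_model is collapsed in the diff above, here is a hedged sketch of how the new Embedding-based model could be assembled end to end; the final Dense head and the compile() settings are assumptions suggested by the visible comment, not the commit's actual code:

from tensorflow.keras import Sequential, layers, losses, optimizers

def build_model(vocab_size):
    model = Sequential([
        layers.Embedding(vocab_size, 1000),
        layers.Bidirectional(layers.LSTM(64)),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        # assumed two-unit head, per "separate predictions for each class"
        layers.Dense(2),
    ])
    # assumed compilation settings -- the real ones are hidden in the collapsed diff
    model.compile(
        loss=losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=optimizers.Adam(1e-4),
        metrics=['accuracy'],
    )
    return model
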
70 changes: 37 additions & 33 deletions src/normalize.py
@@ -1,55 +1,59 @@
import spacy
import tensorflow as tf
import tensorflow_datasets as tfds
from .datasets import download

# Unfortunately, TensorFlow doesn't provide an easy way to save a dataset or tensor,
# but spaCy's processing is mostly fast,
# so we can directly load the datasets and normalise the data each time

VECTOR_SIZE = 300
tokenizer = tfds.features.text.Tokenizer()

nlp = spacy.load("en_core_web_lg")
encoder=None

def extract_sentences(text):
doc = nlp(text)
return list(doc.sents)
def build_encoder(labeled_data):
vocabulary_set = set()
for text_tensor, _ in labeled_data:
some_tokens = tokenizer.tokenize(text_tensor.numpy())
vocabulary_set.update(some_tokens)

# text = "Peach emoji is where it has always been. Peach is the superior emoji. It's outranking eggplant 🍑 "
# store the encoder at module level so text_to_vector below can reuse it
global encoder
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

def print_sentencies(text):
for sentence in extract_sentences(text):
print(sentence)
vocab_size = len(vocabulary_set)
return vocab_size

def token_to_vector(token):
return token.vector
def print_text(labeled_data, index = 0):
text = next(iter(labeled_data))[index].numpy()
print(text)

# normalise the text into a vector of words (300-dimensional word vectors)
def text_to_vector(text):
doc = nlp(text)
return encoder.encode(text)

# map every token in the sentence to its vector
sentence = list(map(token_to_vector, doc))
# TODO: filter out words that are not in the vocabulary
def encode(text_tensor, label):
encoded_text = text_to_vector(text_tensor.numpy())

return sentence

def bytes_to_tensor(bytes):
text = bytes.numpy().decode("utf-8")
vector = text_to_vector(text)

return tf.constant(vector)
return encoded_text, label

def map_func(bytes, label):
[tensor, ] = tf.py_function(bytes_to_tensor, [bytes], [tf.float32])
tensor.set_shape([None, VECTOR_SIZE])
return tensor, label
# py_func doesn't set the shape of the returned tensors.
encoded_text, label = tf.py_function(encode, [bytes, label], Tout=[tf.int64, tf.int64])

# `tf.data.Datasets` work best if all components have a shape set
# so set the shapes manually:
encoded_text.set_shape([None])
label.set_shape([])

return encoded_text, label

def normalize_datasets(train, validation, test):
norm_train = train.map(map_func)
norm_valid = validation.map(map_func)
norm_test = test.map(map_func)
return (norm_train, norm_valid, norm_test, VECTOR_SIZE)
def normalize_dataset(dataset):
return dataset.map(map_func)

def datasets():
train_data, validation_data, test_data = download()
return normalize_datasets(train_data, validation_data, test_data)
train_data, test_data = download()

vocab_size = build_encoder(train_data)

train_data = normalize_dataset(train_data)
test_data = normalize_dataset(test_data)

return train_data, test_data, vocab_size
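
A small self-contained sketch (toy vocabulary, not the project's Sentiment140 one) of what the tfds text encoder used above does with raw text:

import tensorflow_datasets as tfds

tokenizer = tfds.features.text.Tokenizer()

sample = "good movie but bad ending"
vocabulary_set = set(tokenizer.tokenize(sample))
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

# each known token becomes a positive integer id; ids depend on vocabulary order
print(encoder.encode("good ending"))
# vocabulary size plus reserved padding / out-of-vocabulary ids
print(encoder.vocab_size)
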
6 changes: 3 additions & 3 deletions src/test.py
@@ -1,6 +1,6 @@
import tensorflow as tf
from .normalize import datasets
from .libs import params, prepare, save_metrict, load
from .libs import params, prepare, save_metrics, load

prepare(tf)

@@ -14,7 +14,7 @@
# To track results, it is better to save the metrics

# Load normalised datasets
training, validation, testing, input_shape = datasets()
training, testing, vocab_size = datasets()

model = load()

@@ -25,4 +25,4 @@
)

with open(metrics_file, 'w') as outfile:
save_metrict(model, results, outfile)
save_metrics(model, results, outfile)
25 changes: 12 additions & 13 deletions src/train.py
@@ -13,28 +13,27 @@
metrics_file='metrics/training.csv'

# Load normalised datasets
training, validation, testing, input_shape = datasets()
training, testing, vocab_size = datasets()
# Dataset data is an array of tensors,
# or, simplified, an array of tuples: (text: string, label: int),
# where 0 means bad and 1 means good,
# text normalised to an input_shape-dimensional embedding vector
# where 0 means bad and 1 means good

# Build neural network model
model = build_model(input_shape)
model = build_model(vocab_size=vocab_size)

train_batches = training.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)
validation_batches = validation.padded_batch(BATCH_SIZE)
validation_batches = testing.padded_batch(BATCH_SIZE)

# Train network
model.fit(
train_batches,
epochs=EPOCHS,
validation_data=validation_batches,
callbacks=[
checkpoints.save_weights(),
CSVLogger(metrics_file)
]
)
train_batches,
epochs=EPOCHS,
validation_data=validation_batches,
callbacks=[
checkpoints.save_weights(),
CSVLogger(metrics_file)
]
)

# Save for restore in next time
save(model)
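
To illustrate why padded_batch is used above, a toy sketch (hypothetical integer sequences, not the real encoded tweets) showing how variable-length examples are padded to a common length within each batch:

import tensorflow as tf

# three "encoded texts" of different lengths
toy = tf.data.Dataset.from_generator(
    lambda: iter([[3, 1, 4], [1, 5], [9, 2, 6, 5]]),
    output_types=tf.int64,
    output_shapes=[None],
)

for batch in toy.padded_batch(2, padded_shapes=[None]):
    # shorter sequences are zero-padded to the longest one in the batch
    print(batch.numpy())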
