migrate to twitter sentiment dataset
LeoVS09 committed Sep 9, 2020
1 parent ba9942f commit c248f01
Showing 9 changed files with 90 additions and 84 deletions.
2 changes: 1 addition & 1 deletion data.dvc
@@ -1,3 +1,3 @@
outs:
- md5: 5e1e1637ef6e7039a40a0ee22dd6f7e8.dir
- md5: 7dafc45044d12a46008aebfb6bf28435.dir
path: data
11 changes: 0 additions & 11 deletions metrics/training.csv

This file was deleted.

47 changes: 30 additions & 17 deletions src/datasets.py
@@ -1,33 +1,46 @@
import tensorflow_datasets as tfds
import tensorflow as tf
from .libs import params
import pandas as pd
import os

# Loaded from http://help.sentiment140.com/for-students
# kaggle copy https://www.kaggle.com/kazanova/sentiment140

dataset_name = 'imdb_reviews'
datasets_folder = './data'

# Will return:
# 20 000 training examples
# 5 000 validation examples
# 10 000 test examples
# as (string, int) tuples
# Download the dataset if it does not exist locally
train_dataset_path = os.path.join(datasets_folder, 'training.1600000.processed.noemoticon.csv')
test_dataset_path = os.path.join(datasets_folder, 'testdata.manual.2009.06.14.csv')

LABEL_COLUMN = 'target'
TEXT_COLUMN = 'text'
BATCH_SIZE = params['input']['batch_size']
COLUMNS = ["target", "id", "date", "flag", "user", "text"]

def get_dataset(file_path):
df = pd.read_csv(file_path, encoding = "ISO-8859-1", names=COLUMNS)

df[LABEL_COLUMN] = pd.Categorical(df[LABEL_COLUMN])
df[LABEL_COLUMN] = df[LABEL_COLUMN].cat.codes

labels = df.pop(LABEL_COLUMN)
texts = df.pop(TEXT_COLUMN)

return tf.data.Dataset.from_tensor_slices((texts.values, labels.values))

def download():
train_data, validation_data, test_data = tfds.load(
name=dataset_name,
data_dir=datasets_folder,
split=('train[:80%]', 'train[80%:]', 'test'),
as_supervised=True
)
train_dataset = get_dataset(train_dataset_path)
test_dataset = get_dataset(test_dataset_path)

return train_data, validation_data, test_data
return train_dataset, test_dataset

# Will print dataset sizes.
# Do not use it in production:
# the size of a dataset can only be computed by converting it to a list
def print_dataset_sizes(train_data, validation_data, test_data):
def print_dataset_sizes(train_data, test_data):
print(
'\nLoaded dataset',
'\ntrain size:', len(list(train_data)),
'\nvalidation size:', len(list(validation_data)),
'\ntrain size:', len(list(train_data)),
'\ntest size:', len(list(test_data)), '\n'
)

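For reference, a minimal sketch of how the new pandas-based Sentiment140 loading is meant to work, condensed from the loader above; the take(1) inspection loop at the end is illustrative only and not part of the repository:

import pandas as pd
import tensorflow as tf

COLUMNS = ["target", "id", "date", "flag", "user", "text"]

def get_dataset(file_path):
    # Sentiment140 ships as a headerless, latin-1 encoded CSV
    df = pd.read_csv(file_path, encoding="ISO-8859-1", names=COLUMNS)
    # map the raw sentiment labels (0 = negative, 4 = positive) to contiguous codes
    labels = pd.Categorical(df.pop("target")).codes
    texts = df.pop("text")
    return tf.data.Dataset.from_tensor_slices((texts.values, labels))

train = get_dataset("./data/training.1600000.processed.noemoticon.csv")
for text, label in train.take(1):
    print(text.numpy(), label.numpy())
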
4 changes: 2 additions & 2 deletions src/libs/__init__.py
@@ -1,5 +1,5 @@
from .prepare_tf import prepare
from .params import params
from .save_metrict import save_metrict
from .save_metrics import save_metrics
from .save_and_restore import save, load
import .checkpoints as checkpoints
from . import checkpoints
2 changes: 1 addition & 1 deletion src/libs/save_metrics.py
@@ -1,6 +1,6 @@
import json

def save_metrict(model, results, outfile):
def save_metrics(model, results, outfile):
metrics = {}

for name, value in zip(model.metrics_names, results):
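The rest of save_metrics is collapsed in the diff above, so here is a hedged sketch of the full helper; only the dict-building loop is from the commit, the json.dump ending is an assumption, and the usage comment mirrors src/test.py:

import json

def save_metrics(model, results, outfile):
    # pair each metric name reported by Keras with its value from model.evaluate()
    metrics = {}
    for name, value in zip(model.metrics_names, results):
        metrics[name] = float(value)

    # assumed ending: write the metrics as JSON to the given file object
    json.dump(metrics, outfile)

# usage, as in src/test.py:
#   results = model.evaluate(test_batches)
#   with open(metrics_file, 'w') as outfile:
#       save_metrics(model, results, outfile)
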
7 changes: 4 additions & 3 deletions src/model.py
@@ -1,9 +1,10 @@
from tensorflow.keras import Sequential, layers, losses, optimizers

def build_model(vector_dimensions):
def build_model(vocab_size):
model = Sequential([
layers.Bidirectional(layers.LSTM(64, return_sequences=True), input_shape=(None, vector_dimensions)),
layers.Bidirectional(layers.LSTM(32)),
layers.Embedding(vocab_size, 1000),
# layers.Bidirectional(layers.LSTM(64, return_sequences=True), input_shape=(None, vector_dimensions)),
layers.Bidirectional(layers.LSTM(64)),
layers.Dense(64, activation='relu'),
layers.Dropout(0.5),
# Two dense layers allow making separate predictions for each class
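Since the tail of build_model is collapsed in the diff above, here is a hedged sketch of how the new Embedding-based model could be assembled end to end; the final Dense head and the compile() settings are assumptions suggested by the visible comment, not the commit's actual code:

from tensorflow.keras import Sequential, layers, losses, optimizers

def build_model(vocab_size):
    model = Sequential([
        layers.Embedding(vocab_size, 1000),
        layers.Bidirectional(layers.LSTM(64)),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        # assumed two-unit head, per "separate predictions for each class"
        layers.Dense(2),
    ])
    # assumed compilation settings -- the real ones are hidden in the collapsed diff
    model.compile(
        loss=losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=optimizers.Adam(1e-4),
        metrics=['accuracy'],
    )
    return model
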
70 changes: 37 additions & 33 deletions src/normalize.py
@@ -1,55 +1,59 @@
import spacy
import tensorflow as tf
import tensorflow_datasets as tfds
from .datasets import download

# Unfortunately, TensorFlow doesn't provide an easy way to save a dataset or tensor,
# but spaCy's processing is mostly fast,
# so we can directly load the datasets and normalise the data each time

VECTOR_SIZE = 300
tokenizer = tfds.features.text.Tokenizer()

nlp = spacy.load("en_core_web_lg")
encoder=None

def extract_sentences(text):
doc = nlp(text)
return list(doc.sents)
def build_encoder(labeled_data):
vocabulary_set = set()
for text_tensor, _ in labeled_data:
some_tokens = tokenizer.tokenize(text_tensor.numpy())
vocabulary_set.update(some_tokens)

# text = "Peach emoji is where it has always been. Peach is the superior emoji. It's outranking eggplant 🍑 "
# store the encoder at module level so text_to_vector below can reuse it
global encoder
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

def print_sentencies(text):
for sentence in extract_sentences(text):
print(sentence)
vocab_size = len(vocabulary_set)
return vocab_size

def token_to_vector(token):
return token.vector
def print_text(labeled_data, index = 0):
text = next(iter(labeled_data))[index].numpy()
print(text)

# normalise the text into a vector of words (300-dimensional word vectors)
def text_to_vector(text):
doc = nlp(text)
return encoder.encode(text)

# map every token in the sentence to its vector
sentence = list(map(token_to_vector, doc))
# TODO: filter out words that are not in the vocabulary
def encode(text_tensor, label):
encoded_text = text_to_vector(text_tensor.numpy())

return sentence

def bytes_to_tensor(bytes):
text = bytes.numpy().decode("utf-8")
vector = text_to_vector(text)

return tf.constant(vector)
return encoded_text, label

def map_func(bytes, label):
[tensor, ] = tf.py_function(bytes_to_tensor, [bytes], [tf.float32])
tensor.set_shape([None, VECTOR_SIZE])
return tensor, label
# py_func doesn't set the shape of the returned tensors.
encoded_text, label = tf.py_function(encode, [bytes, label], Tout=[tf.int64, tf.int64])

# `tf.data.Datasets` work best if all components have a shape set
# so set the shapes manually:
encoded_text.set_shape([None])
label.set_shape([])

return encoded_text, label

def normalize_datasets(train, validation, test):
norm_train = train.map(map_func)
norm_valid = validation.map(map_func)
norm_test = test.map(map_func)
return (norm_train, norm_valid, norm_test, VECTOR_SIZE)
def normalize_dataset(dataset):
return dataset.map(map_func)

def datasets():
train_data, validation_data, test_data = download()
return normalize_datasets(train_data, validation_data, test_data)
train_data, test_data = download()

vocab_size = build_encoder(train_data)

train_data = normalize_dataset(train_data)
test_data = normalize_dataset(test_data)

return train_data, test_data, vocab_size
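
A small self-contained sketch (toy vocabulary, not the project's Sentiment140 one) of what the tfds text encoder used above does with raw text:

import tensorflow_datasets as tfds

tokenizer = tfds.features.text.Tokenizer()

sample = "good movie but bad ending"
vocabulary_set = set(tokenizer.tokenize(sample))
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

# each known token becomes a positive integer id; ids depend on vocabulary order
print(encoder.encode("good ending"))
# vocabulary size plus reserved padding / out-of-vocabulary ids
print(encoder.vocab_size)
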
6 changes: 3 additions & 3 deletions src/test.py
@@ -1,6 +1,6 @@
import tensorflow as tf
from .normalize import datasets
from .libs import params, prepare, save_metrict, load
from .libs import params, prepare, save_metrics, load

prepare(tf)

@@ -14,7 +14,7 @@
# To track results, it is better to save the metrics

# Load normalised datasets
training, validation, testing, input_shape = datasets()
training, testing, vocab_size = datasets()

model = load()

@@ -25,4 +25,4 @@
)

with open(metrics_file, 'w') as outfile:
save_metrict(model, results, outfile)
save_metrics(model, results, outfile)
25 changes: 12 additions & 13 deletions src/train.py
@@ -13,28 +13,27 @@
metrics_file='metrics/training.csv'

# Load normalised datasets
training, validation, testing, input_shape = datasets()
training, testing, vocab_size = datasets()
# Dataset data is an array of tensors,
# or, simplified, an array of tuples: (text: string, label: int),
# where 0 means bad and 1 means good,
# text normalised to an input_shape-dimensional embedding vector
# where 0 means bad and 1 means good

# Build neural network model
model = build_model(input_shape)
model = build_model(vocab_size=vocab_size)

train_batches = training.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)
validation_batches = validation.padded_batch(BATCH_SIZE)
validation_batches = testing.padded_batch(BATCH_SIZE)

# Train network
model.fit(
train_batches,
epochs=EPOCHS,
validation_data=validation_batches,
callbacks=[
checkpoints.save_weights(),
CSVLogger(metrics_file)
]
)
train_batches,
epochs=EPOCHS,
validation_data=validation_batches,
callbacks=[
checkpoints.save_weights(),
CSVLogger(metrics_file)
]
)

# Save for restore in next time
save(model)
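
To illustrate why padded_batch is used above, a toy sketch (hypothetical integer sequences, not the real encoded tweets) showing how variable-length examples are padded to a common length within each batch:

import tensorflow as tf

# three "encoded texts" of different lengths
toy = tf.data.Dataset.from_generator(
    lambda: iter([[3, 1, 4], [1, 5], [9, 2, 6, 5]]),
    output_types=tf.int64,
    output_shapes=[None],
)

for batch in toy.padded_batch(2, padded_shapes=[None]):
    # shorter sequences are zero-padded to the longest one in the batch
    print(batch.numpy())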
