
Commit

setup build encoder phase
LeoVS09 committed Sep 9, 2020
1 parent c248f01 commit 5c00f2c
Showing 7 changed files with 124 additions and 28 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -134,3 +134,6 @@ saved_models/*.h5

# Input data
data/

# Embedding encoder
encoder/
14 changes: 14 additions & 0 deletions dvc.lock
@@ -35,3 +35,17 @@ test:
  outs:
  - path: metrics/test.json
    md5: 517645e6f3a3f10e79f631fd14f88027
build_encoder:
  cmd: python -m src.build_encoder
  deps:
  - path: ./src/build_encoder.py
    md5: ec406fee035be7966fea0bfb1a47d8c3
  - path: ./src/datasets.py
    md5: f8ee9779fe8a44d60bc5ca75bca0b772
  - path: data
    md5: 7dafc45044d12a46008aebfb6bf28435.dir
  outs:
  - path: ./encoder/encoder.tokens
    md5: d4016f638364037b684447f4ea184017
  - path: ./encoder/info.json
    md5: 0bcf7a6925d930966a4355b3fac6f83f
11 changes: 10 additions & 1 deletion dvc.yaml
@@ -1,4 +1,13 @@
stages:
  build_encoder:
    cmd: python -m src.build_encoder
    deps:
    - ./src/build_encoder.py
    - ./src/datasets.py
    - data
    outs:
    - ./encoder/encoder.tokens
    - ./encoder/info.json
  train:
    cmd: python -m src.train
    deps:
@@ -23,7 +32,7 @@ stages:
    - src/normalize.py
    - src/test.py
    params:
    - input.batch_size
    - input.batch_size
    metrics:
    - metrics/test.json:
        cache: false
2 changes: 2 additions & 0 deletions requirements.txt
@@ -91,6 +91,7 @@ plac==1.1.3
plotly==4.9.0
ply==3.11
preshed==3.0.2
progressbar2==3.53.1
prometheus-client==0.8.0
promise==2.3
prompt-toolkit==3.0.7
@@ -110,6 +111,7 @@ pyparsing==2.4.7
pyrsistent==0.16.0
python-apt==1.6.5+ubuntu0.2
python-dateutil==2.8.1
python-utils==2.4.0
pytz==2020.1
pyxdg==0.25
PyYAML==5.3.1
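
Note: progressbar2 (imported as progressbar) and its helper python-utils back the new display_progress option used in src/datasets.py below. A minimal sketch of the small part of the API the commit relies on, assuming a known total of 100 steps:

from progressbar import ProgressBar  # provided by the progressbar2 package
import time

bar = ProgressBar(max_value=100).start()
for i in range(100):
    time.sleep(0.01)   # stand-in for real work
    bar.update(i)      # redraw the console bar at position i
bar.finish()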
39 changes: 39 additions & 0 deletions src/build_encoder.py
@@ -0,0 +1,39 @@
import tensorflow as tf
import tensorflow_datasets as tfds
from .datasets import get_train_dataset
from .normalize import encoder_filename, encoder_info_filename
import json

# The encoder must first be built on the training dataset,
# so that all text input can be embedded.

def build_encoder(labeled_data):
    tokenizer = tfds.features.text.Tokenizer()
    vocabulary_set = set()
    for text_tensor, _ in labeled_data:
        some_tokens = tokenizer.tokenize(text_tensor.numpy())
        vocabulary_set.update(some_tokens)

    encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

    vocab_size = len(vocabulary_set)
    return encoder, vocab_size

def save(encoder, info_data):
    encoder.save_to_file(encoder_filename)
    print("Encoder saved to", encoder_filename)

    with open(encoder_info_filename, 'w') as info:
        json.dump(info_data, info)

    print('Encoder info saved to', encoder_info_filename)


print('Start building encoder...')

train_data = get_train_dataset(display_progress=True)
encoder, vocab_size = build_encoder(train_data)

print('Encoder was built, saving...')
save(encoder, { 'vocab_size': vocab_size })
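
Note: to make the saved artefact concrete, here is a minimal sketch (not part of the commit, with a toy vocabulary assumed) of what a tfds TokenTextEncoder does with a tweet once it has been built:

import tensorflow_datasets as tfds

# Assumption: a tiny hand-written vocabulary; the real one is collected from the training set above.
vocabulary = ['the', 'movie', 'was', 'great']
encoder = tfds.features.text.TokenTextEncoder(vocabulary)

ids = encoder.encode('the movie was great')  # small integer ids per token; 0 is reserved for padding
print(ids)
print(encoder.decode(ids))                   # 'the movie was great'
print(encoder.vocab_size)                    # vocabulary plus reserved padding and out-of-vocabulary ids
# save_to_file / load_from_file round-trip the encoder, which is what
# build_encoder.py (above) and normalize.py (below) rely on.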
47 changes: 38 additions & 9 deletions src/datasets.py
@@ -3,6 +3,7 @@
from .libs import params
import pandas as pd
import os
from progressbar import ProgressBar

# Loaded from http://help.sentiment140.com/for-students
# kaggle copy https://www.kaggle.com/kazanova/sentiment140
@@ -16,21 +17,49 @@
TEXT_COLUMN = 'text'
BATCH_SIZE = params['input']['batch_size']
COLUMNS = ["target", "id", "date", "flag", "user", "text"]
CHUNK_SIZE = 10 ** 3 # Read a thousand records per chunk

def get_dataset(file_path):
    df = pd.read_csv(file_path, encoding = "ISO-8859-1", names=COLUMNS)
def get_dataset_generator(file_path, display_progress=False):
    print('Start reading dataset from', file_path)

    bar = None
    if display_progress:
        bar = ProgressBar(max_value=1600, max_error=False).start()

    df[LABEL_COLUMN] = pd.Categorical(df[LABEL_COLUMN])
    df[LABEL_COLUMN] = df[LABEL_COLUMN].cat.codes
    for i, chunk in enumerate(pd.read_csv(file_path, encoding = "ISO-8859-1", names=COLUMNS, chunksize=CHUNK_SIZE)):
        if bar is not None:
            bar.update(i)

        for item in chunk.index:
            text = chunk[TEXT_COLUMN][item]
            label = chunk[LABEL_COLUMN][item]

            # The dataset is documented as 0 = negative, 2 = neutral, 4 = positive,
            # but it actually contains only 0 and 4
            label = 0 if int(label) == 0 else 1

            yield (text, label)

    if bar is not None:
        bar.finish()

def get_dataset(file_path, display_progress=False):
    generator = lambda: get_dataset_generator(file_path, display_progress=display_progress)
    return tf.data.Dataset.from_generator(
        generator,
        (tf.string, tf.int64),
        ((), ())
    )

    labels = df.pop(LABEL_COLUMN)
    texts = df.pop(TEXT_COLUMN)
def get_train_dataset(display_progress=False):
    return get_dataset(train_dataset_path, display_progress=display_progress)

    return tf.data.Dataset.from_tensor_slices((texts.values, labels.values))
def get_test_dataset(display_progress=False):
    return get_dataset(test_dataset_path, display_progress=display_progress)

def download():
    train_dataset = get_dataset(train_dataset_path)
    test_dataset = get_dataset(test_dataset_path)
    train_dataset = get_train_dataset()
    test_dataset = get_test_dataset()

    return train_dataset, test_dataset
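
Note: with the switch to a generator-backed tf.data.Dataset, nothing is read from disk until the dataset is iterated. A small usage sketch (an assumed calling pattern, not code from the commit), run from the project root so that src resolves as a package:

from src.datasets import get_train_dataset

dataset = get_train_dataset(display_progress=True)

# Only the first CSV chunk is actually read, thanks to chunksize in the generator.
for text, label in dataset.take(3):
    print(text.numpy()[:60], int(label.numpy()))  # raw tweet bytes and a 0/1 label

# Batching still works as usual; padding only becomes necessary once texts are
# turned into variable-length id sequences in normalize.py.
batched = dataset.batch(32)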

36 changes: 18 additions & 18 deletions src/normalize.py
@@ -1,33 +1,32 @@
import spacy
import tensorflow as tf
import tensorflow_datasets as tfds
from .datasets import download
import json

# Unfortunately TensorFlow doesn't provide an easy way to save a dataset or tensor,
# but spaCy is mostly fast at data processing,
# so we can directly load the datasets and normalise the data each time
encoder_filename = 'encoder/encoder'
encoder_info_filename = 'encoder/info.json'

tokenizer = tfds.features.text.Tokenizer()

encoder=None

def build_encoder(labeled_data):
    vocabulary_set = set()
    for text_tensor, _ in labeled_data:
        some_tokens = tokenizer.tokenize(text_tensor.numpy())
        vocabulary_set.update(some_tokens)
def load_encoder():
    encoder = tfds.features.text.TokenTextEncoder.load_from_file(encoder_filename)
    vocab_size = None

    with open(encoder_info_filename) as info_file:
        info = json.load(info_file)
        vocab_size = info['vocab_size']

    encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)
    return encoder, vocab_size

    vocab_size = len(vocabulary_set)
    return vocab_size
# Link to the encoder object.
# Global references are not best practice, but they are simple enough;
# when this file grows more complex, use a closure or something similar.
CURRENT_ENCODER = None

def print_text(labeled_data, index = 0):
    text = next(iter(labeled_data))[index].numpy()
    print(text)

def text_to_vector(text):
    return encoder.encode(text)
    return CURRENT_ENCODER.encode(text)

def encode(text_tensor, label):
    encoded_text = text_to_vector(text_tensor.numpy())
@@ -50,8 +49,9 @@ def normalize_dataset(dataset):

def datasets():
    global CURRENT_ENCODER  # without this the assignment below would only bind a local name
    train_data, test_data = download()
    encoder, vocab_size = load_encoder()

    vocab_size = build_encoder(train_data)
    CURRENT_ENCODER = encoder

    train_data = normalize_dataset(train_data)
    test_data = normalize_dataset(test_data)
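
Note: the body of normalize_dataset is collapsed in this diff. For context, a common pattern for this kind of eager encode function (an assumption based on the standard TensorFlow text-loading recipe, not necessarily this project's exact code) wraps it in tf.py_function so it can run inside Dataset.map:

import tensorflow as tf

def encode_map_fn(text, label):
    # tf.py_function lets the eager encode() (and thus CURRENT_ENCODER) run inside the graph
    encoded_text, label = tf.py_function(
        encode, inp=[text, label], Tout=(tf.int64, tf.int64))

    # py_function drops static shape information, so restore it for padded_batch downstream
    encoded_text.set_shape([None])
    label.set_shape([])
    return encoded_text, label

def normalize_dataset_sketch(dataset):
    return dataset.map(encode_map_fn).padded_batch(32, padded_shapes=([None], []))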