diff --git a/.gitignore b/.gitignore
index 3f2043a..9b0fb86 100644
--- a/.gitignore
+++ b/.gitignore
@@ -134,3 +134,6 @@ saved_models/*.h5
 
 # Input data
 data/
+
+# Embedding encoder
+encoder/
\ No newline at end of file
diff --git a/dvc.lock b/dvc.lock
index 665bc84..6c3c768 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -35,3 +35,17 @@ test:
   outs:
   - path: metrics/test.json
     md5: 517645e6f3a3f10e79f631fd14f88027
+build_encoder:
+  cmd: python -m src.build_encoder
+  deps:
+  - path: ./src/build_encoder.py
+    md5: ec406fee035be7966fea0bfb1a47d8c3
+  - path: ./src/datasets.py
+    md5: f8ee9779fe8a44d60bc5ca75bca0b772
+  - path: data
+    md5: 7dafc45044d12a46008aebfb6bf28435.dir
+  outs:
+  - path: ./encoder/encoder.tokens
+    md5: d4016f638364037b684447f4ea184017
+  - path: ./encoder/info.json
+    md5: 0bcf7a6925d930966a4355b3fac6f83f
diff --git a/dvc.yaml b/dvc.yaml
index 219f5e8..fa8002c 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -1,4 +1,13 @@
 stages:
+  build_encoder:
+    cmd: python -m src.build_encoder
+    deps:
+    - ./src/build_encoder.py
+    - ./src/datasets.py
+    - data
+    outs:
+    - ./encoder/encoder.tokens
+    - ./encoder/info.json
   train:
     cmd: python -m src.train
     deps:
@@ -23,7 +32,7 @@ stages:
     - src/normalize.py
     - src/test.py
     params:
-      - input.batch_size
+    - input.batch_size
     metrics:
     - metrics/test.json:
         cache: false
diff --git a/requirements.txt b/requirements.txt
index 3d64cd3..0315dd7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -91,6 +91,7 @@ plac==1.1.3
 plotly==4.9.0
 ply==3.11
 preshed==3.0.2
+progressbar2==3.53.1
 prometheus-client==0.8.0
 promise==2.3
 prompt-toolkit==3.0.7
@@ -110,6 +111,7 @@ pyparsing==2.4.7
 pyrsistent==0.16.0
 python-apt==1.6.5+ubuntu0.2
 python-dateutil==2.8.1
+python-utils==2.4.0
 pytz==2020.1
 pyxdg==0.25
 PyYAML==5.3.1
diff --git a/src/build_encoder.py b/src/build_encoder.py
new file mode 100644
index 0000000..03b8030
--- /dev/null
+++ b/src/build_encoder.py
@@ -0,0 +1,39 @@
+import tensorflow as tf
+import tensorflow_datasets as tfds
+from .datasets import get_train_dataset
+from .normalize import encoder_filename, encoder_info_filename
+import json
+
+# The encoder has to be built from the training dataset first,
+# so that all text input can be embedded later.
+
+def build_encoder(labeled_data):
+    tokenizer = tfds.features.text.Tokenizer()
+    vocabulary_set = set()
+    for text_tensor, _ in labeled_data:
+        some_tokens = tokenizer.tokenize(text_tensor.numpy())
+        vocabulary_set.update(some_tokens)
+
+    encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)
+
+    vocab_size = len(vocabulary_set)
+    return encoder, vocab_size
+
+def save(encoder, info_data):
+    encoder.save_to_file(encoder_filename)
+    print("Encoder saved to", encoder_filename)
+
+    with open(encoder_info_filename, 'w') as info:
+        json.dump(info_data, info)
+
+    print('Encoder info saved to', encoder_info_filename)
+
+
+print('Start building encoder...')
+
+train_data = get_train_dataset(display_progress=True)
+encoder, vocab_size = build_encoder(train_data)
+
+print('Encoder was built, saving...')
+save(encoder, { 'vocab_size': vocab_size })
diff --git a/src/datasets.py b/src/datasets.py
index 553c053..8dd7004 100644
--- a/src/datasets.py
+++ b/src/datasets.py
@@ -3,6 +3,7 @@ from .libs import params
 
 import pandas as pd
 import os
+from progressbar import ProgressBar
 
 # Loaded from http://help.sentiment140.com/for-students
 # kaggle copy https://www.kaggle.com/kazanova/sentiment140
@@ -16,21 +17,49 @@ TEXT_COLUMN = 'text'
 BATCH_SIZE = params['input']['batch_size']
 
 COLUMNS = ["target", "id", "date", "flag", "user", "text"]
+CHUNK_SIZE = 10 ** 3 # Read a thousand records at a time
 
-def get_dataset(file_path):
-    df = pd.read_csv(file_path, encoding = "ISO-8859-1", names=COLUMNS)
+def get_dataset_generator(file_path, display_progress=False):
+    print('Start reading dataset from', file_path)
+
+    bar = None
+    if display_progress:
+        # 1.6M tweets / CHUNK_SIZE = 1600 chunks
+        bar = ProgressBar(max_value=1600, max_error=False).start()
 
-    df[LABEL_COLUMN] = pd.Categorical(df[LABEL_COLUMN])
-    df[LABEL_COLUMN] = df[LABEL_COLUMN].cat.codes
+    for i, chunk in enumerate(pd.read_csv(file_path, encoding="ISO-8859-1", names=COLUMNS, chunksize=CHUNK_SIZE)):
+        if bar is not None:
+            bar.update(i)
+
+        for item in chunk.index:
+            text = chunk[TEXT_COLUMN][item]
+            label = chunk[LABEL_COLUMN][item]
+
+            # The dataset format defines labels as 0 = negative, 2 = neutral, 4 = positive,
+            # but in practice only 0 and 4 occur; map them to a binary 0/1 label
+            label = 0 if int(label) == 0 else 1
+
+            yield (text, label)
+
+    if bar is not None:
+        bar.finish()
+
+def get_dataset(file_path, display_progress=False):
+    generator = lambda: get_dataset_generator(file_path, display_progress=display_progress)
+    return tf.data.Dataset.from_generator(
+        generator,
+        (tf.string, tf.int64),
+        ((), ())
+    )
 
-    labels = df.pop(LABEL_COLUMN)
-    texts = df.pop(TEXT_COLUMN)
+def get_train_dataset(display_progress=False):
+    return get_dataset(train_dataset_path, display_progress=display_progress)
 
-    return tf.data.Dataset.from_tensor_slices((texts.values, labels.values))
+def get_test_dataset(display_progress=False):
+    return get_dataset(test_dataset_path, display_progress=display_progress)
 
 def download():
-    train_dataset = get_dataset(train_dataset_path)
-    test_dataset = get_dataset(test_dataset_path)
+    train_dataset = get_train_dataset()
+    test_dataset = get_test_dataset()
 
     return train_dataset, test_dataset
diff --git a/src/normalize.py b/src/normalize.py
index 0a7ae3a..f9e88fa 100644
--- a/src/normalize.py
+++ b/src/normalize.py
@@ -1,33 +1,32 @@
-import spacy
 import tensorflow as tf
 import tensorflow_datasets as tfds
 from .datasets import download
+import json
 
-# Unfortunately Tensorflow doesn't allow save dataset or tensor in easy way,
-# but Spacy process mostly work fast on data processing,
-# so we can direcly load datasets and normalaise data each time
+encoder_filename = 'encoder/encoder'
+encoder_info_filename = 'encoder/info.json'
 
-tokenizer = tfds.features.text.Tokenizer()
-
-encoder=None
-
-def build_encoder(labeled_data):
-    vocabulary_set = set()
-    for text_tensor, _ in labeled_data:
-        some_tokens = tokenizer.tokenize(text_tensor.numpy())
-        vocabulary_set.update(some_tokens)
+def load_encoder():
+    encoder = tfds.features.text.TokenTextEncoder.load_from_file(encoder_filename)
+    vocab_size = None
+
+    with open(encoder_info_filename) as info_file:
+        info = json.load(info_file)
+        vocab_size = info['vocab_size']
 
-    encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)
+    return encoder, vocab_size
 
-    vocab_size = len(vocabulary_set)
-    return vocab_size
+# Reference to the encoder object.
+# A global variable is not best practice, but it is simple enough;
+# when this file grows more complex, switch to a closure or similar.
+CURRENT_ENCODER = None
 
 def print_text(labeled_data, index = 0):
     text = next(iter(labeled_data))[index].numpy()
     print(text)
 
 def text_to_vector(text):
-    return encoder.encode(text)
+    return CURRENT_ENCODER.encode(text)
 
 def encode(text_tensor, label):
     encoded_text = text_to_vector(text_tensor.numpy())
@@ -50,8 +49,10 @@ def normalize_dataset(dataset):
 
 def datasets():
     train_data, test_data = download()
+    encoder, vocab_size = load_encoder()
 
-    vocab_size = build_encoder(train_data)
+    global CURRENT_ENCODER
+    CURRENT_ENCODER = encoder
 
     train_data = normalize_dataset(train_data)
     test_data = normalize_dataset(test_data)
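
A minimal usage sketch (not part of the patch) showing how the artifacts produced by the new build_encoder stage are meant to be consumed. It assumes the stage has already been run, e.g. with python -m src.build_encoder or dvc repro build_encoder, so that encoder/encoder.tokens and encoder/info.json exist on disk:

    from src.normalize import load_encoder

    # Load the persisted TokenTextEncoder and its vocabulary size from encoder/
    encoder, vocab_size = load_encoder()
    print('vocabulary size:', vocab_size)

    # Encode raw text into a list of token ids, the same call text_to_vector() makes
    print(encoder.encode('a simple example tweet'))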