From cafc860c5b7a5702e4c660bf64c3bd1a61c04dd9 Mon Sep 17 00:00:00 2001
From: Chimezie Iwuanyanwu
Date: Sat, 2 Dec 2023 16:24:36 -0600
Subject: [PATCH] Created data.py

Moves the data module up a directory, merging ModelDatasetTypes and
Tokenizers into a single stepcovnet/data.py.
---
 .../{data/ModelDatasetTypes.py => data.py}    |   6 +
 stepcovnet/data/Tokenizers.py                 |   7 -
 stepcovnet/data/__init__.py                   |   0
 .../training/TrainingFeatureGenerator.py      | 193 ++++++++++++------
 train.py                                      |   8 +-
 training_data_collection.py                   |   6 +-
 6 files changed, 148 insertions(+), 72 deletions(-)
 rename stepcovnet/{data/ModelDatasetTypes.py => data.py} (70%)
 delete mode 100644 stepcovnet/data/Tokenizers.py
 delete mode 100644 stepcovnet/data/__init__.py

diff --git a/stepcovnet/data/ModelDatasetTypes.py b/stepcovnet/data.py
similarity index 70%
rename from stepcovnet/data/ModelDatasetTypes.py
rename to stepcovnet/data.py
index e8d2270..19e659b 100644
--- a/stepcovnet/data/ModelDatasetTypes.py
+++ b/stepcovnet/data.py
@@ -1,9 +1,15 @@
 from enum import Enum
 
+from transformers import GPT2Tokenizer
+
 from stepcovnet.dataset.DistributedModelDataset import DistributedModelDataset
 from stepcovnet.dataset.ModelDataset import ModelDataset
 
 
+class Tokenizers(Enum):
+    GPT2 = GPT2Tokenizer.from_pretrained("gpt2")
+
+
 class ModelDatasetTypes(Enum):
     SINGULAR_DATASET = ModelDataset
     DISTRIBUTED_DATASET = DistributedModelDataset
diff --git a/stepcovnet/data/Tokenizers.py b/stepcovnet/data/Tokenizers.py
deleted file mode 100644
index c30a38b..0000000
--- a/stepcovnet/data/Tokenizers.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from enum import Enum
-
-from transformers import GPT2Tokenizer
-
-
-class Tokenizers(Enum):
-    GPT2 = GPT2Tokenizer.from_pretrained('gpt2')
diff --git a/stepcovnet/data/__init__.py b/stepcovnet/data/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/stepcovnet/training/TrainingFeatureGenerator.py b/stepcovnet/training/TrainingFeatureGenerator.py
index 68fb053..46a3928 100644
--- a/stepcovnet/training/TrainingFeatureGenerator.py
+++ b/stepcovnet/training/TrainingFeatureGenerator.py
@@ -2,15 +2,29 @@
 
 import numpy as np
 
-from stepcovnet.common.utils import apply_timeseries_scalers
-from stepcovnet.common.utils import get_samples_ngram_with_mask
-from stepcovnet.common.utils import normalize_tokenized_arrows
-from stepcovnet.data.Tokenizers import Tokenizers
+from stepcovnet import data
+from stepcovnet.common.utils import (
+    apply_timeseries_scalers,
+    get_samples_ngram_with_mask,
+    normalize_tokenized_arrows,
+)
 
 
 class TrainingFeatureGenerator(object):
-    def __init__(self, dataset_path, dataset_type, batch_size, indexes, num_samples, lookback=1, scalers=None,
-                 difficulty="challenge", warmup=False, shuffle=True, tokenizer_name=None):
+    def __init__(
+        self,
+        dataset_path,
+        dataset_type,
+        batch_size,
+        indexes,
+        num_samples,
+        lookback=1,
+        scalers=None,
+        difficulty="challenge",
+        warmup=False,
+        shuffle=True,
+        tokenizer_name=None,
+    ):
         self.dataset_path = dataset_path
         self.dataset_type = dataset_type
         self.train_indexes = indexes
@@ -20,7 +34,9 @@ def __init__(self, dataset_path, dataset_type, batch_size, indexes, num_samples,
         self.batch_size = batch_size
         self.difficulty = difficulty
         self.shuffle = shuffle
-        self.tokenizer = None if tokenizer_name is None else Tokenizers[tokenizer_name].value
+        self.tokenizer = (
+            None if tokenizer_name is None else data.Tokenizers[tokenizer_name].value
+        )
         # The Tensorflow calls the generator three times before starting a training job. We will "warmup" the data
         # yielding by returning the same data for the three calls. This way the indexing is aligned correctly.
 
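A minimal sketch of the name-based lookup the constructor above performs
(assuming the post-patch layout and an installed transformers package; "GPT2"
is the only member this patch defines):

    from stepcovnet import data

    # Resolve an Enum member by name, then take its value.
    tokenizer = data.Tokenizers["GPT2"].value
    assert tokenizer is data.Tokenizers.GPT2.value  # same GPT2Tokenizer instance

Because the member value is built in the Enum class body, the tokenizer is
constructed once and shared by every lookup.
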
@@ -43,52 +59,84 @@ def __call__(self):
                 features = defaultdict(lambda: np.array([]))
                 if self.song_index >= len(self.train_indexes):
                     self.song_index = 0
-                song_start_index, song_end_index = dataset.song_index_ranges[self.train_indexes[self.song_index]]
-                if self.song_start_index is None or self.song_start_index >= song_end_index or \
-                        self.song_start_index < song_start_index or self.warmup_countdown > 0:
+                song_start_index, song_end_index = dataset.song_index_ranges[
+                    self.train_indexes[self.song_index]
+                ]
+                if (
+                    self.song_start_index is None
+                    or self.song_start_index >= song_end_index
+                    or self.song_start_index < song_start_index
+                    or self.warmup_countdown > 0
+                ):
                     self.song_start_index = song_start_index
                     self.warmup_countdown = max(self.warmup_countdown - 1, 0)
                 # We only return partial batches when at the end of the training data. Otherwise, use start of next song
                 # to append data to the batch.
-                while len(features["y_batch"]) == 0 or self.song_index < len(self.train_indexes):
+                while len(features["y_batch"]) == 0 or self.song_index < len(
+                    self.train_indexes
+                ):
                     start_index = self.song_start_index
                     y_batch_len = len(features["y_batch"])
-                    end_index = min(start_index + self.batch_size - y_batch_len, song_end_index)
+                    end_index = min(
+                        start_index + self.batch_size - y_batch_len, song_end_index
+                    )
 
                     # Lookback data from ngram returns empty value in index 0. Also, arrow features should only
                     # contain previously seen features. Therefore, removing last element and last lookback from
                     # arrows features and first element from audio features.
                     mask_padding_value = 0 if new_song else 1
-                    lookback_index_padding_start = max(start_index - self.lookback, song_start_index)
+                    lookback_index_padding_start = max(
+                        start_index - self.lookback, song_start_index
+                    )
                     lookback_padding_added = start_index - lookback_index_padding_start
                     if self.tokenizer is not None:
-                        arrows = dataset.string_arrows[lookback_index_padding_start:end_index]
-                        arrow_features, arrow_mask = self.get_tokenized_arrow_features(arrows, mask_padding_value,
-                                                                                       lookback_padding_added)
+                        arrows = dataset.string_arrows[
+                            lookback_index_padding_start:end_index
+                        ]
+                        arrow_features, arrow_mask = self.get_tokenized_arrow_features(
+                            arrows, mask_padding_value, lookback_padding_added
+                        )
                     else:
-                        arrows = dataset.label_encoded_arrows[lookback_index_padding_start:end_index]
-                        arrow_features, arrow_mask = self.get_arrow_features(arrows, mask_padding_value,
-                                                                             lookback_padding_added)
-
-                    audio_data = dataset.features[lookback_index_padding_start:end_index]
-                    audio_features = self.get_audio_features(audio_data, lookback_padding_added)
+                        arrows = dataset.label_encoded_arrows[
+                            lookback_index_padding_start:end_index
+                        ]
+                        arrow_features, arrow_mask = self.get_arrow_features(
+                            arrows, mask_padding_value, lookback_padding_added
+                        )
+
+                    audio_data = dataset.features[
+                        lookback_index_padding_start:end_index
+                    ]
+                    audio_features = self.get_audio_features(
+                        audio_data, lookback_padding_added
+                    )
 
                     arrows = dataset.onehot_encoded_arrows[start_index:end_index]
                     sample_weights = dataset.sample_weights[start_index:end_index]
 
-                    features = self.append_existing_data(features=features, arrow_features=arrow_features,
-                                                         arrow_mask=arrow_mask, audio_features=audio_features,
-                                                         arrows=arrows, sample_weights=sample_weights)
+                    features = self.append_existing_data(
+                        features=features,
+                        arrow_features=arrow_features,
+                        arrow_mask=arrow_mask,
+                        audio_features=audio_features,
+                        arrows=arrows,
+                        sample_weights=sample_weights,
+                    )
                     self.song_start_index = end_index
 
                     # Break if collected enough data for a batch or end of song list.
                     # Otherwise, change to next song to collect more.
-                    if len(features["y_batch"]) >= self.batch_size or self.song_index + 1 >= len(self.train_indexes):
+                    if len(
+                        features["y_batch"]
+                    ) >= self.batch_size or self.song_index + 1 >= len(
+                        self.train_indexes
+                    ):
                         new_song = False
                         break
                     else:
                         self.song_index += 1
-                        song_start_index, song_end_index = \
-                            dataset.song_index_ranges[self.train_indexes[self.song_index]]
+                        song_start_index, song_end_index = dataset.song_index_ranges[
+                            self.train_indexes[self.song_index]
+                        ]
                         self.song_start_index = song_start_index
                         new_song = True
@@ -97,15 +145,20 @@ def __call__(self):
                 self.song_index += 1
 
                 if len(features["y_batch"]) > 0:
-                    scaled_audio_features = apply_timeseries_scalers(features=features["audio_features"],
-                                                                     scalers=self.scalers)
-                    x_batch = {"arrow_input": features["arrow_features"],
-                               "arrow_mask": features["arrow_mask"],
-                               "audio_input": scaled_audio_features}
+                    scaled_audio_features = apply_timeseries_scalers(
+                        features=features["audio_features"], scalers=self.scalers
+                    )
+                    x_batch = {
+                        "arrow_input": features["arrow_features"],
+                        "arrow_mask": features["arrow_mask"],
+                        "audio_input": scaled_audio_features,
+                    }
                     yield x_batch, features["y_batch"], features["sample_weights_batch"]
 
     @staticmethod
-    def append_existing_data(features, arrow_features, arrow_mask, audio_features, arrows, sample_weights):
+    def append_existing_data(
+        features, arrow_features, arrow_mask, audio_features, arrows, sample_weights
+    ):
         # Append or set features/labels/sample weights based on if existing data is present
         if not features or any(len(value) == 0 for value in features.values()):
             features["arrow_features"] = arrow_features
@@ -114,43 +167,67 @@ def append_existing_data(features, arrow_features, arrow_mask, audio_features, a
             features["y_batch"] = arrows
             features["sample_weights_batch"] = sample_weights
         else:
-            if isinstance(features["arrow_features"], list) or isinstance(features["arrow_mask"], list):
+            if isinstance(features["arrow_features"], list) or isinstance(
+                features["arrow_mask"], list
+            ):
                 features["arrow_features"].extend(arrow_features)
                 features["arrow_mask"].extend(arrow_mask)
                 # Normalize again after appending in the case where split batches have different max lengths
-                features["arrow_features"], features["arrow_mask"] = \
-                    normalize_tokenized_arrows(arrow_features=features["arrow_features"],
-                                               arrow_mask=features["arrow_mask"])
+                (
+                    features["arrow_features"],
+                    features["arrow_mask"],
+                ) = normalize_tokenized_arrows(
+                    arrow_features=features["arrow_features"],
+                    arrow_mask=features["arrow_mask"],
+                )
             else:
-                features["arrow_features"] = np.concatenate((features["arrow_features"], arrow_features), axis=0)
-                features["arrow_mask"] = np.concatenate((features["arrow_mask"], arrow_mask), axis=0)
-            features["audio_features"] = np.concatenate((features["audio_features"], audio_features), axis=0)
+                features["arrow_features"] = np.concatenate(
+                    (features["arrow_features"], arrow_features), axis=0
+                )
+                features["arrow_mask"] = np.concatenate(
+                    (features["arrow_mask"], arrow_mask), axis=0
+                )
+            features["audio_features"] = np.concatenate(
+                (features["audio_features"], audio_features), axis=0
+            )
             features["y_batch"] = np.concatenate((features["y_batch"], arrows), axis=0)
-            features["sample_weights_batch"] = np.concatenate((features["sample_weights_batch"], sample_weights),
-                                                              axis=0)
+            features["sample_weights_batch"] = np.concatenate(
+                (features["sample_weights_batch"], sample_weights), axis=0
+            )
         return features
 
-    def get_tokenized_arrow_features(self, arrows, mask_padding_value, lookback_padding_added):
-        arrow_features, arrow_mask = get_samples_ngram_with_mask(arrows, self.lookback,
-                                                                 reshape=True,
-                                                                 sample_padding_value='0000',
-                                                                 mask_padding_value=mask_padding_value)
+    def get_tokenized_arrow_features(
+        self, arrows, mask_padding_value, lookback_padding_added
+    ):
+        arrow_features, arrow_mask = get_samples_ngram_with_mask(
+            arrows,
+            self.lookback,
+            reshape=True,
+            sample_padding_value="0000",
+            mask_padding_value=mask_padding_value,
+        )
        arrow_features = arrow_features[lookback_padding_added:]
         arrow_mask = arrow_mask[lookback_padding_added:].astype(np.int32)
 
         decoded_arrows = [" ".join(line) for line in arrow_features]
-        arrow_features = [self.tokenizer(line, return_tensors='tf',
-                                         add_prefix_space=True)['input_ids']
-                          .numpy()[0][1:].astype(np.int32)
-                          for line in decoded_arrows]
+        arrow_features = [
+            self.tokenizer(line, return_tensors="tf", add_prefix_space=True)[
+                "input_ids"
+            ]
+            .numpy()[0][1:]
+            .astype(np.int32)
+            for line in decoded_arrows
+        ]
         arrow_features = arrow_features[:-1]
         arrow_mask = list(arrow_mask[:-1, 1:])
-        return normalize_tokenized_arrows(arrow_features=arrow_features, arrow_mask=arrow_mask)
+        return normalize_tokenized_arrows(
+            arrow_features=arrow_features, arrow_mask=arrow_mask
+        )
 
     def get_arrow_features(self, arrows, mask_padding_value, lookback_padding_added):
-        arrow_features, arrow_mask = get_samples_ngram_with_mask(arrows, self.lookback,
-                                                                 reshape=True,
-                                                                 mask_padding_value=mask_padding_value)
+        arrow_features, arrow_mask = get_samples_ngram_with_mask(
+            arrows, self.lookback, reshape=True, mask_padding_value=mask_padding_value
+        )
         arrow_features = arrow_features[lookback_padding_added:]
         arrow_mask = arrow_mask[lookback_padding_added:]
         arrow_features = arrow_features[:-1, 1:]
@@ -159,7 +236,9 @@ def get_arrow_features(self, arrows, mask_padding_value, lookback_padding_added)
         return arrow_features.astype(np.int32), arrow_mask.astype(np.int32)
 
     def get_audio_features(self, audio_data, lookback_padding_added):
-        audio_features, _ = get_samples_ngram_with_mask(audio_data, self.lookback, squeeze=False)
+        audio_features, _ = get_samples_ngram_with_mask(
+            audio_data, self.lookback, squeeze=False
+        )
         audio_features = audio_features[lookback_padding_added:]
         audio_features = audio_features[1:]
 
diff --git a/train.py b/train.py
index cf337f0..376e652 100644
--- a/train.py
+++ b/train.py
@@ -4,9 +4,7 @@
 
 import joblib
 
-from stepcovnet import config
-from stepcovnet.data.ModelDatasetTypes import ModelDatasetTypes
-from stepcovnet.data.Tokenizers import Tokenizers
+from stepcovnet import config, data
 from stepcovnet.executor.TrainingExecutor import TrainingExecutor
 from stepcovnet.inputs.TrainingInput import TrainingInput
 from stepcovnet.model.ClassifierModel import ClassifierModel
@@ -19,7 +17,7 @@ def load_training_data(input_path: str):
     metadata = json.load(open(os.path.join(input_path, "metadata.json"), "r"))
     dataset_name = metadata["dataset_name"]
-    dataset_type = ModelDatasetTypes[metadata["dataset_type"]].value
+    dataset_type = data.ModelDatasetTypes[metadata["dataset_type"]].value
     dataset_path = os.path.join(input_path, dataset_name + "_dataset")
     scalers = joblib.load(
         open(os.path.join(input_path, dataset_name + "_scaler.pkl"), "rb")
     )
@@ -49,7 +47,7 @@ def run_training(
         limit=limit,
         lookback=lookback,
         difficulty=difficulty,
-        tokenizer_name=Tokenizers.GPT2.name,
+        tokenizer_name=data.Tokenizers.GPT2.name,
     )
 
     training_input = TrainingInput(training_config)
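The tokenizer_name round trip above keeps the config value a plain string:
train.py stores data.Tokenizers.GPT2.name, and TrainingFeatureGenerator.__init__
resolves it back with data.Tokenizers[tokenizer_name].value. A two-line sketch:

    name = data.Tokenizers.GPT2.name         # -> "GPT2"
    tokenizer = data.Tokenizers[name].value  # -> the shared GPT2Tokenizer
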
diff --git a/training_data_collection.py b/training_data_collection.py
index bf13519..2582f00 100644
--- a/training_data_collection.py
+++ b/training_data_collection.py
@@ -9,13 +9,13 @@
 import joblib
 import psutil
 
+from stepcovnet import data
 from stepcovnet.common.parameters import CONFIG, VGGISH_CONFIG
 from stepcovnet.common.utils import (
     get_channel_scalers,
     get_filename,
     get_filenames_from_folder,
 )
-from stepcovnet.data.ModelDatasetTypes import ModelDatasetTypes
 from stepcovnet.data_collection.sample_collection_helper import (
     feature_onset_phrase_label_sample_weights,
     get_features_and_labels,
@@ -214,9 +214,9 @@ def training_data_collection(
     output_path = os.path.join(output_path, name_prefix + name_postfix)
     os.makedirs(output_path, exist_ok=True)
     dataset_type = (
-        ModelDatasetTypes.DISTRIBUTED_DATASET
+        data.ModelDatasetTypes.DISTRIBUTED_DATASET
         if distributed
-        else ModelDatasetTypes.SINGULAR_DATASET
+        else data.ModelDatasetTypes.SINGULAR_DATASET
     )
     training_dataset = dataset_type.value(
         os.path.join(output_path, name_prefix + name_postfix), overwrite=True
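For reference, a short sketch of how the consolidated stepcovnet.data module is
consumed after this patch; the dataset directory below is a made-up placeholder:

    from stepcovnet import data

    # training_data_collection.py picks the dataset class via an enum member...
    dataset_cls = data.ModelDatasetTypes.DISTRIBUTED_DATASET.value
    # ...and train.py recovers the same class from the name stored in metadata.json.
    assert data.ModelDatasetTypes["DISTRIBUTED_DATASET"].value is dataset_cls

    training_dataset = dataset_cls("example_dataset_dir", overwrite=True)

One side effect of folding Tokenizers into data.py: Enum member values are
evaluated when the class body runs, so importing stepcovnet.data triggers
GPT2Tokenizer.from_pretrained("gpt2") (a download on first use) even for
callers that only need ModelDatasetTypes.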