Created data.py

Moves the module up one directory level (stepcovnet/data/ModelDatasetTypes.py → stepcovnet/data.py)
cpuguy96 · Dec 2, 2023 · cafc860 · cafc860
1 parent ffb7a4d
commit cafc860
Show file tree

Hide file tree

Showing 6 changed files with 148 additions and 72 deletions.
diff --git a/stepcovnet/data/ModelDatasetTypes.py → stepcovnet/data.py b/stepcovnet/data/ModelDatasetTypes.py → stepcovnet/data.py
@@ -1,9 +1,15 @@
 from enum import Enum
 
+from transformers import GPT2Tokenizer
+
 from stepcovnet.dataset.DistributedModelDataset import DistributedModelDataset
 from stepcovnet.dataset.ModelDataset import ModelDataset
 
 
+class Tokenizers(Enum):
+    GPT2 = GPT2Tokenizer.from_pretrained("gpt2")
+
+
 class ModelDatasetTypes(Enum):
     SINGULAR_DATASET = ModelDataset
     DISTRIBUTED_DATASET = DistributedModelDataset
diff --git a/stepcovnet/data/Tokenizers.py b/stepcovnet/data/Tokenizers.py
diff --git a/stepcovnet/data/__init__.py b/stepcovnet/data/__init__.py
diff --git a/stepcovnet/training/TrainingFeatureGenerator.py b/stepcovnet/training/TrainingFeatureGenerator.py
@@ -2,15 +2,29 @@
 
 import numpy as np
 
-from stepcovnet.common.utils import apply_timeseries_scalers
-from stepcovnet.common.utils import get_samples_ngram_with_mask
-from stepcovnet.common.utils import normalize_tokenized_arrows
-from stepcovnet.data.Tokenizers import Tokenizers
+from stepcovnet import data
+from stepcovnet.common.utils import (
+    apply_timeseries_scalers,
+    get_samples_ngram_with_mask,
+    normalize_tokenized_arrows,
+)
 
 
 class TrainingFeatureGenerator(object):
-    def __init__(self, dataset_path, dataset_type, batch_size, indexes, num_samples, lookback=1, scalers=None,
-                 difficulty="challenge", warmup=False, shuffle=True, tokenizer_name=None):
+    def __init__(
+        self,
+        dataset_path,
+        dataset_type,
+        batch_size,
+        indexes,
+        num_samples,
+        lookback=1,
+        scalers=None,
+        difficulty="challenge",
+        warmup=False,
+        shuffle=True,
+        tokenizer_name=None,
+    ):
         self.dataset_path = dataset_path
         self.dataset_type = dataset_type
         self.train_indexes = indexes
@@ -20,7 +34,9 @@ def __init__(self, dataset_path, dataset_type, batch_size, indexes, num_samples,
         self.batch_size = batch_size
         self.difficulty = difficulty
         self.shuffle = shuffle
-        self.tokenizer = None if tokenizer_name is None else Tokenizers[tokenizer_name].value
+        self.tokenizer = (
+            None if tokenizer_name is None else data.Tokenizers[tokenizer_name].value
+        )
 
         # TensorFlow calls the generator three times before starting a training job. We "warm up" the data
         # yielding by returning the same data for those three calls, so that the indexing stays aligned.
@@ -43,52 +59,84 @@ def __call__(self):
                 features = defaultdict(lambda: np.array([]))
                 if self.song_index >= len(self.train_indexes):
                     self.song_index = 0
-                song_start_index, song_end_index = dataset.song_index_ranges[self.train_indexes[self.song_index]]
-                if self.song_start_index is None or self.song_start_index >= song_end_index or \
-                        self.song_start_index < song_start_index or self.warmup_countdown > 0:
+                song_start_index, song_end_index = dataset.song_index_ranges[
+                    self.train_indexes[self.song_index]
+                ]
+                if (
+                    self.song_start_index is None
+                    or self.song_start_index >= song_end_index
+                    or self.song_start_index < song_start_index
+                    or self.warmup_countdown > 0
+                ):
                     self.song_start_index = song_start_index
                     self.warmup_countdown = max(self.warmup_countdown - 1, 0)
                 # We only return partial batches when at the end of the training data. Otherwise, use start of next song
                 # to append data to the batch.
-                while len(features["y_batch"]) == 0 or self.song_index < len(self.train_indexes):
+                while len(features["y_batch"]) == 0 or self.song_index < len(
+                    self.train_indexes
+                ):
                     start_index = self.song_start_index
                     y_batch_len = len(features["y_batch"])
-                    end_index = min(start_index + self.batch_size - y_batch_len, song_end_index)
+                    end_index = min(
+                        start_index + self.batch_size - y_batch_len, song_end_index
+                    )
 
                     # Lookback data from ngram returns empty value in index 0. Also, arrow features should only
                     # contain previously seen features. Therefore, removing last element and last lookback from
                     # arrows features and first element from audio features.
                     mask_padding_value = 0 if new_song else 1
-                    lookback_index_padding_start = max(start_index - self.lookback, song_start_index)
+                    lookback_index_padding_start = max(
+                        start_index - self.lookback, song_start_index
+                    )
                     lookback_padding_added = start_index - lookback_index_padding_start
                     if self.tokenizer is not None:
-                        arrows = dataset.string_arrows[lookback_index_padding_start:end_index]
-                        arrow_features, arrow_mask = self.get_tokenized_arrow_features(arrows, mask_padding_value,
-                                                                                       lookback_padding_added)
+                        arrows = dataset.string_arrows[
+                            lookback_index_padding_start:end_index
+                        ]
+                        arrow_features, arrow_mask = self.get_tokenized_arrow_features(
+                            arrows, mask_padding_value, lookback_padding_added
+                        )
                     else:
-                        arrows = dataset.label_encoded_arrows[lookback_index_padding_start:end_index]
-                        arrow_features, arrow_mask = self.get_arrow_features(arrows, mask_padding_value,
-                                                                             lookback_padding_added)
-
-                    audio_data = dataset.features[lookback_index_padding_start:end_index]
-                    audio_features = self.get_audio_features(audio_data, lookback_padding_added)
+                        arrows = dataset.label_encoded_arrows[
+                            lookback_index_padding_start:end_index
+                        ]
+                        arrow_features, arrow_mask = self.get_arrow_features(
+                            arrows, mask_padding_value, lookback_padding_added
+                        )
+
+                    audio_data = dataset.features[
+                        lookback_index_padding_start:end_index
+                    ]
+                    audio_features = self.get_audio_features(
+                        audio_data, lookback_padding_added
+                    )
 
                     arrows = dataset.onehot_encoded_arrows[start_index:end_index]
                     sample_weights = dataset.sample_weights[start_index:end_index]
 
-                    features = self.append_existing_data(features=features, arrow_features=arrow_features,
-                                                         arrow_mask=arrow_mask, audio_features=audio_features,
-                                                         arrows=arrows, sample_weights=sample_weights)
+                    features = self.append_existing_data(
+                        features=features,
+                        arrow_features=arrow_features,
+                        arrow_mask=arrow_mask,
+                        audio_features=audio_features,
+                        arrows=arrows,
+                        sample_weights=sample_weights,
+                    )
                     self.song_start_index = end_index
                     # Break if collected enough data for a batch or end of song list.
                     # Otherwise, change to next song to collect more.
-                    if len(features["y_batch"]) >= self.batch_size or self.song_index + 1 >= len(self.train_indexes):
+                    if len(
+                        features["y_batch"]
+                    ) >= self.batch_size or self.song_index + 1 >= len(
+                        self.train_indexes
+                    ):
                         new_song = False
                         break
                     else:
                         self.song_index += 1
-                        song_start_index, song_end_index = \
-                            dataset.song_index_ranges[self.train_indexes[self.song_index]]
+                        song_start_index, song_end_index = dataset.song_index_ranges[
+                            self.train_indexes[self.song_index]
+                        ]
                         self.song_start_index = song_start_index
                         new_song = True
 
@@ -97,15 +145,20 @@ def __call__(self):
                     self.song_index += 1
 
                 if len(features["y_batch"]) > 0:
-                    scaled_audio_features = apply_timeseries_scalers(features=features["audio_features"],
-                                                                     scalers=self.scalers)
-                    x_batch = {"arrow_input": features["arrow_features"],
-                               "arrow_mask": features["arrow_mask"],
-                               "audio_input": scaled_audio_features}
+                    scaled_audio_features = apply_timeseries_scalers(
+                        features=features["audio_features"], scalers=self.scalers
+                    )
+                    x_batch = {
+                        "arrow_input": features["arrow_features"],
+                        "arrow_mask": features["arrow_mask"],
+                        "audio_input": scaled_audio_features,
+                    }
                     yield x_batch, features["y_batch"], features["sample_weights_batch"]
 
     @staticmethod
-    def append_existing_data(features, arrow_features, arrow_mask, audio_features, arrows, sample_weights):
+    def append_existing_data(
+        features, arrow_features, arrow_mask, audio_features, arrows, sample_weights
+    ):
         # Append or set features/labels/sample weights based on if existing data is present
         if not features or any(len(value) == 0 for value in features.values()):
             features["arrow_features"] = arrow_features
@@ -114,43 +167,67 @@ def append_existing_data(features, arrow_features, arrow_mask, audio_features, a
             features["y_batch"] = arrows
             features["sample_weights_batch"] = sample_weights
         else:
-            if isinstance(features["arrow_features"], list) or isinstance(features["arrow_mask"], list):
+            if isinstance(features["arrow_features"], list) or isinstance(
+                features["arrow_mask"], list
+            ):
                 features["arrow_features"].extend(arrow_features)
                 features["arrow_mask"].extend(arrow_mask)
                 # Normalize again after appending in the case where split batches have different max lengths
-                features["arrow_features"], features["arrow_mask"] = \
-                    normalize_tokenized_arrows(arrow_features=features["arrow_features"],
-                                               arrow_mask=features["arrow_mask"])
+                (
+                    features["arrow_features"],
+                    features["arrow_mask"],
+                ) = normalize_tokenized_arrows(
+                    arrow_features=features["arrow_features"],
+                    arrow_mask=features["arrow_mask"],
+                )
             else:
-                features["arrow_features"] = np.concatenate((features["arrow_features"], arrow_features), axis=0)
-                features["arrow_mask"] = np.concatenate((features["arrow_mask"], arrow_mask), axis=0)
-            features["audio_features"] = np.concatenate((features["audio_features"], audio_features), axis=0)
+                features["arrow_features"] = np.concatenate(
+                    (features["arrow_features"], arrow_features), axis=0
+                )
+                features["arrow_mask"] = np.concatenate(
+                    (features["arrow_mask"], arrow_mask), axis=0
+                )
+            features["audio_features"] = np.concatenate(
+                (features["audio_features"], audio_features), axis=0
+            )
             features["y_batch"] = np.concatenate((features["y_batch"], arrows), axis=0)
-            features["sample_weights_batch"] = np.concatenate((features["sample_weights_batch"], sample_weights),
-                                                              axis=0)
+            features["sample_weights_batch"] = np.concatenate(
+                (features["sample_weights_batch"], sample_weights), axis=0
+            )
 
         return features
 
-    def get_tokenized_arrow_features(self, arrows, mask_padding_value, lookback_padding_added):
-        arrow_features, arrow_mask = get_samples_ngram_with_mask(arrows, self.lookback,
-                                                                 reshape=True,
-                                                                 sample_padding_value='0000',
-                                                                 mask_padding_value=mask_padding_value)
+    def get_tokenized_arrow_features(
+        self, arrows, mask_padding_value, lookback_padding_added
+    ):
+        arrow_features, arrow_mask = get_samples_ngram_with_mask(
+            arrows,
+            self.lookback,
+            reshape=True,
+            sample_padding_value="0000",
+            mask_padding_value=mask_padding_value,
+        )
         arrow_features = arrow_features[lookback_padding_added:]
         arrow_mask = arrow_mask[lookback_padding_added:].astype(np.int32)
         decoded_arrows = [" ".join(line) for line in arrow_features]
-        arrow_features = [self.tokenizer(line, return_tensors='tf',
-                                         add_prefix_space=True)['input_ids']
-                              .numpy()[0][1:].astype(np.int32)
-                          for line in decoded_arrows]
+        arrow_features = [
+            self.tokenizer(line, return_tensors="tf", add_prefix_space=True)[
+                "input_ids"
+            ]
+            .numpy()[0][1:]
+            .astype(np.int32)
+            for line in decoded_arrows
+        ]
         arrow_features = arrow_features[:-1]
         arrow_mask = list(arrow_mask[:-1, 1:])
-        return normalize_tokenized_arrows(arrow_features=arrow_features, arrow_mask=arrow_mask)
+        return normalize_tokenized_arrows(
+            arrow_features=arrow_features, arrow_mask=arrow_mask
+        )
 
     def get_arrow_features(self, arrows, mask_padding_value, lookback_padding_added):
-        arrow_features, arrow_mask = get_samples_ngram_with_mask(arrows, self.lookback,
-                                                                 reshape=True,
-                                                                 mask_padding_value=mask_padding_value)
+        arrow_features, arrow_mask = get_samples_ngram_with_mask(
+            arrows, self.lookback, reshape=True, mask_padding_value=mask_padding_value
+        )
         arrow_features = arrow_features[lookback_padding_added:]
         arrow_mask = arrow_mask[lookback_padding_added:]
         arrow_features = arrow_features[:-1, 1:]
@@ -159,7 +236,9 @@ def get_arrow_features(self, arrows, mask_padding_value, lookback_padding_added)
         return arrow_features.astype(np.int32), arrow_mask.astype(np.int32)
 
     def get_audio_features(self, audio_data, lookback_padding_added):
-        audio_features, _ = get_samples_ngram_with_mask(audio_data, self.lookback, squeeze=False)
+        audio_features, _ = get_samples_ngram_with_mask(
+            audio_data, self.lookback, squeeze=False
+        )
         audio_features = audio_features[lookback_padding_added:]
         audio_features = audio_features[1:]
 

diff --git a/train.py b/train.py
@@ -4,9 +4,7 @@
 
 import joblib
 
-from stepcovnet import config
-from stepcovnet.data.ModelDatasetTypes import ModelDatasetTypes
-from stepcovnet.data.Tokenizers import Tokenizers
+from stepcovnet import config, data
 from stepcovnet.executor.TrainingExecutor import TrainingExecutor
 from stepcovnet.inputs.TrainingInput import TrainingInput
 from stepcovnet.model.ClassifierModel import ClassifierModel
@@ -19,7 +17,7 @@
 def load_training_data(input_path: str):
     metadata = json.load(open(os.path.join(input_path, "metadata.json"), "r"))
     dataset_name = metadata["dataset_name"]
-    dataset_type = ModelDatasetTypes[metadata["dataset_type"]].value
+    dataset_type = data.ModelDatasetTypes[metadata["dataset_type"]].value
     dataset_path = os.path.join(input_path, dataset_name + "_dataset")
     scalers = joblib.load(
         open(os.path.join(input_path, dataset_name + "_scaler.pkl"), "rb")
@@ -49,7 +47,7 @@ def run_training(
         limit=limit,
         lookback=lookback,
         difficulty=difficulty,
-        tokenizer_name=Tokenizers.GPT2.name,
+        tokenizer_name=data.Tokenizers.GPT2.name,
     )
     training_input = TrainingInput(training_config)
 

diff --git a/training_data_collection.py b/training_data_collection.py
@@ -9,13 +9,13 @@
 import joblib
 import psutil
 
+from stepcovnet import data
 from stepcovnet.common.parameters import CONFIG, VGGISH_CONFIG
 from stepcovnet.common.utils import (
     get_channel_scalers,
     get_filename,
     get_filenames_from_folder,
 )
-from stepcovnet.data.ModelDatasetTypes import ModelDatasetTypes
 from stepcovnet.data_collection.sample_collection_helper import (
     feature_onset_phrase_label_sample_weights,
     get_features_and_labels,
@@ -214,9 +214,9 @@ def training_data_collection(
     output_path = os.path.join(output_path, name_prefix + name_postfix)
     os.makedirs(output_path, exist_ok=True)
     dataset_type = (
-        ModelDatasetTypes.DISTRIBUTED_DATASET
+        data.ModelDatasetTypes.DISTRIBUTED_DATASET
         if distributed
-        else ModelDatasetTypes.SINGULAR_DATASET
+        else data.ModelDatasetTypes.SINGULAR_DATASET
     )
     training_dataset = dataset_type.value(
         os.path.join(output_path, name_prefix + name_postfix), overwrite=True