diff --git a/frame_level_models.py b/frame_level_models.py
index 3fe91b30..98be27be 100644
--- a/frame_level_models.py
+++ b/frame_level_models.py
@@ -14,13 +14,12 @@
 """Contains a collection of models which operate on variable-length sequences."""
 import math
 
+import model_utils as utils
 import models
-import video_level_models
 import tensorflow as tf
-import model_utils as utils
-
-import tensorflow.contrib.slim as slim
 from tensorflow import flags
+import tensorflow.contrib.slim as slim
+import video_level_models
 
 FLAGS = flags.FLAGS
 flags.DEFINE_integer("iterations", 30, "Number of frames per batch for DBoF.")
@@ -38,6 +37,10 @@
     "dbof_pooling_method", "max",
     "The pooling method used in the DBoF cluster layer. "
     "Choices are 'average' and 'max'.")
+flags.DEFINE_string(
+    "dbof_activation", "sigmoid",
+    "The nonlinear activation method for cluster and hidden dense layer, e.g., "
+    "sigmoid, relu6, etc.")
 flags.DEFINE_string(
     "video_level_classifier_model", "MoeModel",
     "Some Frame-Level models can be decomposed into a "
@@ -48,11 +51,10 @@
 
 
 class FrameLevelLogisticModel(models.BaseModel):
+  """Creates a logistic classifier over the aggregated frame-level features."""
 
   def create_model(self, model_input, vocab_size, num_frames, **unused_params):
-    """Creates a model which uses a logistic classifier over the average of the
-
-    frame-level features.
+    """See base class.
 
     This class is intended to be an example for implementors of frame level
     models. If you want to train a model over averaged features it is more
@@ -94,18 +96,6 @@ class DbofModel(models.BaseModel):
 
   The model will randomly sample either frames or sequences of frames during
   training to speed up convergence.
-
-  Args:
-    model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of input
-      features.
-    vocab_size: The number of classes in the dataset.
-    num_frames: A vector of length 'batch' which indicates the number of frames
-      for each video (before padding).
-
-  Returns:
-    A dictionary with a tensor containing the probability predictions of the
-    model in the 'predictions' key. The dimensions of the tensor are
-    'batch_size' x 'num_classes'.
   """
 
   def create_model(self,
@@ -119,11 +109,32 @@ def create_model(self,
                    hidden_size=None,
                    is_training=True,
                    **unused_params):
+    """See base class.
+
+    Args:
+      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
+        input features.
+      vocab_size: The number of classes in the dataset.
+      num_frames: A vector of length 'batch' which indicates the number of
+        frames for each video (before padding).
+      iterations: the number of frames to be sampled.
+      add_batch_norm: whether to add batch norm during training.
+      sample_random_frames: whether to sample random frames or random sequences.
+      cluster_size: the output neuron number of the cluster layer.
+      hidden_size: the output neuron number of the hidden layer.
+      is_training: whether to build the graph in training mode.
+
+    Returns:
+      A dictionary with a tensor containing the probability predictions of the
+      model in the 'predictions' key. The dimensions of the tensor are
+      'batch_size' x 'num_classes'.
+    """
     iterations = iterations or FLAGS.iterations
     add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
     random_frames = sample_random_frames or FLAGS.sample_random_frames
     cluster_size = cluster_size or FLAGS.dbof_cluster_size
     hidden1_size = hidden_size or FLAGS.dbof_hidden_size
+    act_fn = getattr(tf.nn, FLAGS.dbof_activation)
 
     num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
     if random_frames:
@@ -165,7 +176,7 @@ def create_model(self,
                               math.sqrt(feature_size)))
       tf.summary.histogram("cluster_biases", cluster_biases)
       activation += cluster_biases
-    activation = tf.nn.relu6(activation)
+    activation = act_fn(activation)
     tf.summary.histogram("cluster_output", activation)
 
     activation = tf.reshape(activation, [-1, max_frames, cluster_size])
@@ -190,7 +201,7 @@ def create_model(self,
           initializer=tf.random_normal_initializer(stddev=0.01))
       tf.summary.histogram("hidden1_biases", hidden1_biases)
       activation += hidden1_biases
-    activation = tf.nn.relu6(activation)
+    activation = act_fn(activation)
     tf.summary.histogram("hidden1_output", activation)
 
     aggregated_model = getattr(video_level_models,
@@ -200,9 +211,10 @@ def create_model(self,
 
 
 class LstmModel(models.BaseModel):
+  """Creates a model which uses a stack of LSTMs to represent the video."""
 
   def create_model(self, model_input, vocab_size, num_frames, **unused_params):
-    """Creates a model which uses a stack of LSTMs to represent the video.
+    """See base class.
 
     Args:
       model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
@@ -224,9 +236,7 @@ def create_model(self, model_input, vocab_size, num_frames, **unused_params):
             for _ in range(number_of_layers)
         ])
 
-    loss = 0.0
-
-    outputs, state = tf.nn.dynamic_rnn(
+    _, state = tf.nn.dynamic_rnn(
         stacked_lstm, model_input, sequence_length=num_frames, dtype=tf.float32)
 
     aggregated_model = getattr(video_level_models,