Run all pipeline script

imatge-upc · Jul 4, 2016 · 19936d4 · 19936d4
1 parent 634cb8c
commit 19936d4
Show file tree

Hide file tree

Showing 6 changed files with 228 additions and 8 deletions.
diff --git a/data/models/get_temporal_location_weights.sh b/data/models/get_temporal_location_weights.sh
@@ -1,2 +1 @@
-wget --no-check-certificate
-'https://www.dropbox.com/s/1vq55hw17y8k4vi/temporal-location_weights.hdf5?dl=0' -O ./temporal-locations_weights.h5
+wget --no-check-certificate 'https://www.dropbox.com/s/1vq55hw17y8k4vi/temporal-location_weights.hdf5?dl=0' -O ./temporal-location_weights.hdf5
diff --git a/scripts/extract_features.py b/scripts/extract_features.py
@@ -26,8 +26,6 @@ def extract_features(videos_dir, output_dir, batch_size, num_threads, queue_size
     output_file = h5py.File(output_path, mode)
     extracted_videos = output_file.keys()
     output_file.close()
-    #print(extracted_videos)
-    #return
 
     videos_ids = [v[:-4] for v in os.listdir(videos_dir) if v[-4:] == '.mp4']
 

diff --git a/scripts/process_prediction.py b/scripts/process_prediction.py
@@ -13,6 +13,7 @@
 
 
 def process_prediction(experiment_id, predictions_path, output_path, smoothing_k, activity_threshold, subset=None):
+    clip_length = 16.
 
     if subset == None:
         subsets = ['validation', 'testing']
@@ -62,7 +63,7 @@ def process_prediction(experiment_id, predictions_path, output_path, smoothing_k
             # Post Processing to obtain the detection
             prediction_smoothed = smoothing(prediction, k=smoothing_k)
             activities_idx, startings, endings, scores = activity_localization(
-                prediction,
+                prediction_smoothed,
                 activity_threshold
             )
             result_detection = []
@@ -71,8 +72,8 @@ def process_prediction(experiment_id, predictions_path, output_path, smoothing_k
                 result_detection.append({
                     'score': score,
                     'segment': [
-                        s * nb_clips / fps,
-                        e * nb_clips / fps
+                        s * clip_length / fps,
+                        e * clip_length / fps
                     ],
                     'label': label
                 })

diff --git a/scripts/run_all_pipeline.py b/scripts/run_all_pipeline.py
@@ -0,0 +1,189 @@
+import argparse
+
+import numpy as np
+
+from keras.layers import (LSTM, BatchNormalization, Convolution3D, Dense, Dropout, Flatten, Input,
+                          MaxPooling3D, TimeDistributed, ZeroPadding3D)
+from keras.models import Model, Sequential
+from src.data import import_labels
+from src.io import get_duration, get_num_frames, video_to_array
+from src.processing import activity_localization, get_classification, smoothing
+
+
+def run_all_pipeline(input_video, smoothing_k, activity_threshold):
+    input_size = (112, 112)
+    length = 16
+
+    # Load labels
+    with open('dataset/labels.txt', 'r') as f:
+        labels = import_labels(f)
+
+    print('Reading Video...')
+    video_array = video_to_array(input_video, resize=input_size)
+    if video_array is None:
+        raise Exception('The video could not be read')
+    nb_frames = get_num_frames(input_video)
+    duration = get_duration(input_video)
+    fps = nb_frames / duration
+    print('Duration: {:.1f}s'.format(duration))
+    print('FPS: {:.1f}'.format(fps))
+    print('Number of frames: {}'.format(nb_frames))
+
+    nb_clips = nb_frames // length
+    video_array = video_array.transpose(1, 0, 2, 3)
+    video_array = video_array[:nb_clips*length,:,:,:]
+    video_array = video_array.reshape((nb_clips, length, 3, 112, 112))
+    video_array = video_array.transpose(0, 2, 1, 3, 4)
+
+    # Load C3D model and mean
+    print('Loading C3D network...')
+    model  = C3D_conv_features(True)
+    model.compile(optimizer='sgd', loss='mse')
+    mean_total = np.load('data/models/c3d-sports1M_mean.npy')
+    mean = np.mean(mean_total, axis=(0, 2, 3, 4), keepdims=True)
+
+    # Extract features
+    print('Extracting features...')
+    X = video_array - mean
+    Y = model.predict(X, batch_size=1, verbose=1)
+
+    # Load the temporal localization network
+    print('Loading temporal localization network...')
+    model_localization = temporal_localization_network(True)
+    model_localization.compile(optimizer='rmsprop', loss='categorical_crossentropy')
+
+    # Predict with the temporal localization network
+    print('Predicting...')
+    Y = Y.reshape(nb_clips, 1, 4096)
+    prediction = model_localization.predict(Y, batch_size=1, verbose=1)
+    prediction = prediction.reshape(nb_clips, 201)
+
+    # Post processing the predited output
+    print('Post-processing output...')
+    labels_idx, scores = get_classification(prediction, k=5)
+    print('Video: {}\n'.format(input_video))
+    print('Classification:')
+    for idx, score in zip(labels_idx, scores):
+        label = labels[idx]
+        print('{:.4f}\t{}'.format(score, label))
+
+    prediction_smoothed = smoothing(prediction, k=smoothing_k)
+    activities_idx, startings, endings, scores = activity_localization(
+        prediction_smoothed,
+        activity_threshold
+    )
+
+    print('\nDetection:')
+    print('Score\tInterval\t\tActivity')
+    for idx, s, e, score in zip(activities_idx, startings, endings, scores):
+        start = s * float(length) / fps
+        end = e * float(length) / fps
+        label = labels[idx]
+        print('{:.4f}\t{:.1f}s - {:.1f}s\t\t{}'.format(score, start, end, label))
+
+
+def C3D_conv_features(summary=False):
+    """ Return the Keras model of the network until the fc6 layer where the
+    convolutional features can be extracted.
+    """
+    from keras.layers.convolutional import Convolution3D, MaxPooling3D, ZeroPadding3D
+    from keras.layers.core import Dense, Dropout, Flatten
+    from keras.models import Sequential
+
+    model = Sequential()
+    # 1st layer group
+    model.add(Convolution3D(64, 3, 3, 3, activation='relu',
+                            border_mode='same', name='conv1',
+                            subsample=(1, 1, 1),
+                            input_shape=(3, 16, 112, 112),
+                            trainable=False))
+    model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2),
+                           border_mode='valid', name='pool1'))
+    # 2nd layer group
+    model.add(Convolution3D(128, 3, 3, 3, activation='relu',
+                            border_mode='same', name='conv2',
+                            subsample=(1, 1, 1),
+                            trainable=False))
+    model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2),
+                           border_mode='valid', name='pool2'))
+    # 3rd layer group
+    model.add(Convolution3D(256, 3, 3, 3, activation='relu',
+                            border_mode='same', name='conv3a',
+                            subsample=(1, 1, 1),
+                            trainable=False))
+    model.add(Convolution3D(256, 3, 3, 3, activation='relu',
+                            border_mode='same', name='conv3b',
+                            subsample=(1, 1, 1),
+                            trainable=False))
+    model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2),
+                           border_mode='valid', name='pool3'))
+    # 4th layer group
+    model.add(Convolution3D(512, 3, 3, 3, activation='relu',
+                            border_mode='same', name='conv4a',
+                            subsample=(1, 1, 1),
+                            trainable=False))
+    model.add(Convolution3D(512, 3, 3, 3, activation='relu',
+                            border_mode='same', name='conv4b',
+                            subsample=(1, 1, 1),
+                            trainable=False))
+    model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2),
+                           border_mode='valid', name='pool4'))
+    # 5th layer group
+    model.add(Convolution3D(512, 3, 3, 3, activation='relu',
+                            border_mode='same', name='conv5a',
+                            subsample=(1, 1, 1),
+                            trainable=False))
+    model.add(Convolution3D(512, 3, 3, 3, activation='relu',
+                            border_mode='same', name='conv5b',
+                            subsample=(1, 1, 1),
+                            trainable=False))
+    model.add(ZeroPadding3D(padding=(0, 1, 1), name='zeropadding'))
+    model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2),
+                           border_mode='valid', name='pool5'))
+    model.add(Flatten(name='flatten'))
+    # FC layers group
+    model.add(Dense(4096, activation='relu', name='fc6', trainable=False))
+    model.add(Dropout(.5, name='do1'))
+    model.add(Dense(4096, activation='relu', name='fc7'))
+    model.add(Dropout(.5, name='do2'))
+    model.add(Dense(487, activation='softmax', name='fc8'))
+
+    # Load weights
+    model.load_weights('data/models/c3d-sports1M_weights.h5')
+
+    for _ in range(4):
+        model.pop_layer()
+
+    if summary:
+        print(model.summary())
+    return model
+
+def temporal_localization_network(summary=False):
+    input_features = Input(batch_shape=(1, 1, 4096,), name='features')
+    input_normalized = BatchNormalization(name='normalization')(input_features)
+    input_dropout = Dropout(p=.5)(input_normalized)
+    lstm = LSTM(512, return_sequences=True, stateful=True, name='lsmt1')(input_dropout)
+    output_dropout = Dropout(p=.5)(lstm)
+    output = TimeDistributed(Dense(201, activation='softmax'), name='fc')(output_dropout)
+
+    model = Model(input=input_features, output=output)
+    model.load_weights('data/models/temporal-location_weights.hdf5')
+
+    if summary:
+        model.summary()
+    return model
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Run all pipeline. Given a video, classify it and temporal localize the activity on it')
+
+    parser.add_argument('-i', '--input-video', type=str, dest='input_video', help='Path to the input video')
+    parser.add_argument('-k', type=int, dest='smoothing_k', default=5, help='Smoothing factor at post-processing (default: %(default)s)')
+    parser.add_argument('-t', type=float, dest='activity_threshold', default=.2, help='Activity threshold at post-processing (default: %(default)s)')
+
+    args = parser.parse_args()
+
+    run_all_pipeline(
+        args.input_video,
+        args.smoothing_k,
+        args.activity_threshold
+    )
diff --git a/src/io.py b/src/io.py
@@ -68,3 +68,36 @@ def video_to_array(video_path, resize=None, start_frame=0, end_frame=None,
     if dim_ordering == 'th':
         video = video.transpose(3, 0, 1, 2)
     return video
+
+
+def get_num_frames(video_path):
+    ''' Return the number of frames of the video track of the video given '''
+    import cv2
+    if cv2.__version__ >= '3.0.0':
+        CAP_PROP_FRAME_COUNT = cv2.CAP_PROP_FRAME_COUNT
+    else:
+        CAP_PROP_FRAME_COUNT = cv2.cv.CV_CAP_PROP_FRAME_COUNT
+
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        raise Exception('Could not open the video')
+    num_frames = int(cap.get(CAP_PROP_FRAME_COUNT))
+    return num_frames
+
+def get_duration(video_path):
+    ''' Return the duration of the video track of the video given '''
+    import cv2
+    if cv2.__version__ >= '3.0.0':
+        CAP_PROP_FRAME_COUNT = cv2.CAP_PROP_FRAME_COUNT
+        CAP_PROP_FPS = cv2.CAP_PROP_FPS
+    else:
+        CAP_PROP_FRAME_COUNT = cv2.cv.CV_CAP_PROP_FRAME_COUNT
+        CAP_PROP_FPS = cv2.cv.CV_CAP_PROP_FPS
+
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        raise Exception('Could not open the video')
+    num_frames = int(cap.get(CAP_PROP_FRAME_COUNT))
+    fps = float(cap.get(CAP_PROP_FPS))
+    duration = num_frames / fps
+    return duration
diff --git a/src/processing.py b/src/processing.py
@@ -38,7 +38,7 @@ def activity_localization(sequence_class_prob, activity_threshold=.2):
     padded = np.pad(activity_tag, pad_width=1, mode='constant')
     dif = padded[1:] - padded [:-1]
 
-    indexes = np.arange(dif.size)
+    indexes = np.arange(dif.size).astype(np.float32)
     startings = indexes[dif==1]
     endings = indexes[dif==-1]