astorfi committed Jul 13, 2017
1 parent 3ce1143 commit 5ad9481
Showing 32 changed files with 89 additions and 21 deletions.
12 changes: 6 additions & 6 deletions README.rst
@@ -145,7 +145,7 @@ dlib library [dlib]_. Finally, all mouth areas are resized to have the same size
cube. The dataset does not contain any audio files. The audio files are extracted from
videos using the FFmpeg framework [ffmpeg]_. The processing pipeline is shown in the figure below.

-.. image:: data/images/processing.gif
+.. image:: readme_images/processing.gif
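
As a rough, hypothetical illustration of the audio-extraction step (not the repository's actual extraction script), the audio track of a clip could be pulled out with FFmpeg from Python; the file names and audio parameters below are placeholders:

.. code-block:: python

    # Sketch: extract the audio track of a video with FFmpeg via subprocess.
    # 'input_video.mp4' / 'output_audio.wav' and the 16 kHz mono settings are
    # placeholder assumptions, not the dataset's actual parameters.
    import subprocess

    subprocess.check_call([
        'ffmpeg', '-y',
        '-i', 'input_video.mp4',   # source video
        '-vn',                     # drop the video stream
        '-acodec', 'pcm_s16le',    # 16-bit PCM WAV
        '-ar', '16000',            # sample rate (assumed)
        '-ac', '1',                # mono
        'output_audio.wav',
    ])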

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Input Pipeline for this work
@@ -176,7 +176,7 @@ forms 40 MFEC features) can be derived which form a
speech feature cube. Each input feature map for a single audio stream has the dimensionality of 15 × 40 × 3.
This representation is depicted in the following figure:

-.. image:: data/images/Speech_GIF.gif
+.. image:: readme_images/Speech_GIF.gif

The **speech features** have been extracted using the [SpeechPy]_ package.
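
As a rough sketch of how such a 15 x 40 x 3 cube could be assembled with SpeechPy (the helper names ``lmfe`` and ``extract_derivative_feature`` and the windowing parameters are assumptions, not the repository's exact pipeline):

.. code-block:: python

    # Hypothetical sketch: build one 15x40x3 speech feature cube with SpeechPy.
    import speechpy
    from scipy.io import wavfile

    fs, signal = wavfile.read('sample.wav')          # placeholder mono clip
    # 40 log-energy (MFEC-style) features per 20 ms frame with a 10 ms stride
    mfec = speechpy.feature.lmfe(signal, fs, frame_length=0.020,
                                 frame_stride=0.010, num_filters=40)
    # Stack static, first and second derivatives -> (num_frames, 40, 3)
    cube = speechpy.feature.extract_derivative_feature(mfec)
    # Keep 15 consecutive frames to form a single 15 x 40 x 3 input map
    feature_map = cube[:15]
    print(feature_map.shape)                         # (15, 40, 3)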

@@ -188,7 +188,7 @@ The input of the visual stream of the network is a cube of size 9x60x100,
where 9 is the number of frames that represent the temporal information. Each
channel is a 60x100 gray-scale image of the mouth region.

-.. image:: data/images/lip_motion.jpg
+.. image:: readme_images/lip_motion.jpg
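
A minimal, hypothetical sketch of stacking nine pre-cropped mouth frames into such a cube with OpenCV (the file names and the frame-selection policy are placeholders, not the repository's pipeline):

.. code-block:: python

    # Sketch: stack 9 gray-scale 60x100 mouth crops into a 9x60x100 cube.
    import cv2
    import numpy as np

    frame_paths = ['mouth/frame_%d.png' % i for i in range(1, 10)]  # placeholders
    frames = []
    for path in frame_paths:
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)  # 2D gray-scale array
        img = cv2.resize(img, (100, 60))              # cv2.resize expects (width, height)
        frames.append(img)

    visual_cube = np.stack(frames, axis=0)            # shape (9, 60, 100)
    print(visual_cube.shape)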



@@ -205,7 +205,7 @@ considered as a spatial dimension, and the stacked audio frames form the
temporal dimension. In the proposed 3D CNN architecture, the convolutional operations
are performed on successive temporal frames for both audio-visual streams.

-.. image:: data/images/DNN-Coupled.png
+.. image:: readme_images/DNN-Coupled.png
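
For orientation only (the coupled architecture itself is defined in the repository's model code), a single 3D convolution over the visual cube could be written with Keras as below; the filter count and kernel sizes are arbitrary placeholders, not the paper's configuration:

.. code-block:: python

    # Illustrative 3D convolution over the 9x60x100 visual input (channels-last).
    import tensorflow as tf

    visual_input = tf.keras.Input(shape=(9, 60, 100, 1))    # (time, height, width, channel)
    x = tf.keras.layers.Conv3D(filters=16, kernel_size=(3, 5, 5),
                               strides=(1, 2, 2), padding='same',
                               activation='relu')(visual_input)
    x = tf.keras.layers.MaxPool3D(pool_size=(1, 2, 2))(x)   # pool only spatially
    model = tf.keras.Model(visual_input, x)
    model.summary()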


----------------------
@@ -238,10 +238,10 @@ Results
The results below demonstrate the effect of the proposed method on accuracy
and convergence speed.

-.. |accuracy| image:: results/accuracy-bar-pairselection.png
+.. |accuracy| image:: readme_images/accuracy-bar-pairselection.png


-.. |converge| image:: results/convergence-speed.png
+.. |converge| image:: readme_images/convergence-speed.png


|accuracy|
8 changes: 3 additions & 5 deletions code/lip_tracking/VisualizeLip.py
@@ -34,7 +34,7 @@
predictor_path = 'dlib/shape_predictor_68_face_landmarks.dat'
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor(predictor_path)
-mouth_destination_path = 'mouth'
+mouth_destination_path = os.path.dirname(args["output"]) + '/' + 'mouth'
if not os.path.exists(mouth_destination_path):
    os.makedirs(mouth_destination_path)

@@ -53,6 +53,7 @@
total_num_frames = int(video_shape[0])
num_frames = min(total_num_frames,max_counter)
counter = 0
+font = cv2.FONT_HERSHEY_SIMPLEX

# Define the writer
writer = skvideo.io.FFmpegWriter(args["output"])
@@ -218,11 +219,8 @@
with open(the_filename, 'rb') as f:
    my_list = pickle.load(f)
"""
-activation_destination_path = 'activation'
-if not os.path.exists(activation_destination_path):
-    os.makedirs(activation_destination_path)

-the_filename = activation_destination_path + '/' +args["input"].split('.')[0] + '_' + 'activation'
+the_filename = os.path.dirname(args["output"]) + '/' + 'activation'
my_list = activation
with open(the_filename, 'wb') as f:
    pickle.dump(my_list, f)

