112 changes: 55 additions & 57 deletions DAE.py
@@ -5,34 +5,34 @@
class DAE:

    def __init__(self, FLAGS):
        ''' Implementation of a deep autoencoder class.'''

        self.FLAGS = FLAGS
        self.weight_initializer = model_helper._get_weight_initializer()
        self.bias_initializer = model_helper._get_bias_initializer()
        self.init_parameters()


    def init_parameters(self):
        '''Initialize the network's weights and biases.'''

        with tf.name_scope('weights'):
            self.W_1 = tf.get_variable(name='weight_1', shape=(self.FLAGS.num_v, self.FLAGS.num_h),
                                       initializer=self.weight_initializer)
            self.W_2 = tf.get_variable(name='weight_2', shape=(self.FLAGS.num_h, self.FLAGS.num_h),
                                       initializer=self.weight_initializer)
            self.W_3 = tf.get_variable(name='weight_3', shape=(self.FLAGS.num_h, self.FLAGS.num_h),
                                       initializer=self.weight_initializer)
            self.W_4 = tf.get_variable(name='weight_4', shape=(self.FLAGS.num_h, self.FLAGS.num_v),
                                       initializer=self.weight_initializer)

        with tf.name_scope('biases'):
            self.b1 = tf.get_variable(name='bias_1', shape=(self.FLAGS.num_h),
                                      initializer=self.bias_initializer)
            self.b2 = tf.get_variable(name='bias_2', shape=(self.FLAGS.num_h),
                                      initializer=self.bias_initializer)
            self.b3 = tf.get_variable(name='bias_3', shape=(self.FLAGS.num_h),
                                      initializer=self.bias_initializer)

    def _inference(self, x):
        ''' Make one forward pass and predict the network outputs.
@@ -42,27 +42,26 @@ def _inference(self, x):
        '''

        with tf.name_scope('inference'):
            a1 = tf.nn.sigmoid(tf.nn.bias_add(tf.matmul(x, self.W_1), self.b1))
            a2 = tf.nn.sigmoid(tf.nn.bias_add(tf.matmul(a1, self.W_2), self.b2))
            a3 = tf.nn.sigmoid(tf.nn.bias_add(tf.matmul(a2, self.W_3), self.b3))
            a4 = tf.matmul(a3, self.W_4)  # linear output layer, predicts the ratings directly
        return a4

    def _compute_loss(self, predictions, labels, num_labels):
        ''' Compute the mean squared error loss between the input and output of the network.

        @param predictions: predictions of the stacked autoencoder
        @param labels: input values of the stacked autoencoder, which serve as the labels at the same time
        @param num_labels: number of non-zero labels in the data set, used to compute the mean

        @return: mean squared error loss tf-operation
        '''

        with tf.name_scope('loss'):
            # Sum of squared errors divided by the number of observed (non-zero)
            # ratings, i.e. the mean is taken only over the rated entries.
            loss_op = tf.div(tf.reduce_sum(tf.square(tf.subtract(predictions, labels))), num_labels)
        return loss_op


    def _optimizer(self, x):
@@ -74,42 +73,41 @@ def _optimizer(self, x):
        @return: root mean squared error
        '''

        outputs = self._inference(x)
        mask = tf.where(tf.equal(x, 0.0), tf.zeros_like(x), x)  # copy of x that is zero at the unrated entries
        num_train_labels = tf.cast(tf.count_nonzero(mask), dtype=tf.float32)  # number of non-zero values in the training set
        bool_mask = tf.cast(mask, dtype=tf.bool)  # boolean mask, True at the rated entries
        outputs = tf.where(bool_mask, outputs, tf.zeros_like(outputs))  # set the output values to zero where the corresponding input values are zero

        MSE_loss = self._compute_loss(outputs, x, num_train_labels)

        if self.FLAGS.l2_reg:
            l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()])
            MSE_loss = MSE_loss + self.FLAGS.lambda_ * l2_loss

        train_op = tf.train.AdamOptimizer(self.FLAGS.learning_rate).minimize(MSE_loss)
        RMSE_loss = tf.sqrt(MSE_loss)

        return train_op, RMSE_loss

    def _validation_loss(self, x_train, x_test):
        ''' Compute the loss at validation time.

        @param x_train: training data samples
        @param x_test: test data samples

        @return: network predictions
        @return: root mean squared error loss between the predicted and the actual ratings
        '''

        outputs = self._inference(x_train)  # use the training sample to make the prediction
        mask = tf.where(tf.equal(x_test, 0.0), tf.zeros_like(x_test), x_test)  # identify the zero values in the test set
        num_test_labels = tf.cast(tf.count_nonzero(mask), dtype=tf.float32)  # count the number of non-zero values
        bool_mask = tf.cast(mask, dtype=tf.bool)
        outputs = tf.where(bool_mask, outputs, tf.zeros_like(outputs))

        MSE_loss = self._compute_loss(outputs, x_test, num_test_labels)
        RMSE_loss = tf.sqrt(MSE_loss)

        return outputs, RMSE_loss

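For orientation, a minimal sketch of how this class might be driven in a TF 1.x session; the FLAGS fields follow those used above, while the iterator wiring and step count are assumptions, not part of this diff.

# Hypothetical usage sketch (TensorFlow 1.x), assuming FLAGS defines num_v, num_h,
# learning_rate, l2_reg, lambda_ and the tf_records_* paths used elsewhere in the repo.
import tensorflow as tf
from data import dataset
from DAE import DAE

def train(FLAGS, num_steps=1000):  # num_steps is a hypothetical choice
    train_set, _ = dataset._get_training_data(FLAGS)
    x = train_set.make_one_shot_iterator().get_next()  # batch of rating vectors

    model = DAE(FLAGS)
    train_op, rmse = model._optimizer(x)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for _ in range(num_steps):
            _, rmse_ = sess.run((train_op, rmse))
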
22 changes: 11 additions & 11 deletions data/dataset.py
@@ -8,7 +8,7 @@ def _get_training_data(FLAGS):
        @return: data for the inference
        '''

    filenames = [os.path.join(FLAGS.tf_records_train_path, f) for f in os.listdir(FLAGS.tf_records_train_path)]

    dataset = tf.data.TFRecordDataset(filenames)
    dataset = dataset.map(parse)
@@ -23,33 +23,33 @@ def _get_training_data(FLAGS):
    dataset2 = dataset2.repeat()
    dataset2 = dataset2.batch(1)
    dataset2 = dataset2.prefetch(buffer_size=1)

    return dataset, dataset2


def _get_test_data(FLAGS):
    ''' Building the input pipeline for the test data.'''

    filenames = [os.path.join(FLAGS.tf_records_test_path, f) for f in os.listdir(FLAGS.tf_records_test_path)]

    dataset = tf.data.TFRecordDataset(filenames)
    dataset = dataset.map(parse)
    dataset = dataset.shuffle(buffer_size=1)
    dataset = dataset.repeat()
    dataset = dataset.batch(1)
    dataset = dataset.prefetch(buffer_size=1)

    return dataset


def parse(serialized):
    ''' Parser for the TFRecords files.'''

    features = {'movie_ratings': tf.FixedLenFeature([3952], tf.float32),
                }
    parsed_example = tf.parse_single_example(serialized,
                                             features=features,
                                             )
    movie_ratings = tf.cast(parsed_example['movie_ratings'], tf.float32)

    return movie_ratings
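
For context, a minimal sketch of how TFRecord files matching this parse function could be produced; the 'movie_ratings' key and vector length 3952 mirror the feature definition above, while the writer helper itself is a hypothetical addition, not part of this diff.

# Hypothetical helper (TensorFlow 1.x) that writes one Example per user,
# each holding a dense float vector of 3952 movie ratings (0 = unrated).
import tensorflow as tf

def write_user_ratings(path, rating_vectors):
    with tf.python_io.TFRecordWriter(path) as writer:
        for ratings in rating_vectors:
            feature = {'movie_ratings': tf.train.Feature(
                float_list=tf.train.FloatList(value=ratings))}
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())
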
170 changes: 170 additions & 0 deletions data/ml-1m/README
@@ -0,0 +1,170 @@
SUMMARY
================================================================================

These files contain 1,000,209 anonymous ratings of approximately 3,900 movies
made by 6,040 MovieLens users who joined MovieLens in 2000.

USAGE LICENSE
================================================================================

Neither the University of Minnesota nor any of the researchers
involved can guarantee the correctness of the data, its suitability
for any particular purpose, or the validity of results based on the
use of the data set. The data set may be used for any research
purposes under the following conditions:

* The user may not state or imply any endorsement from the
University of Minnesota or the GroupLens Research Group.

* The user must acknowledge the use of the data set in
publications resulting from the use of the data set
(see below for citation information).

* The user may not redistribute the data without separate
permission.

* The user may not use this information for any commercial or
revenue-bearing purposes without first obtaining permission
from a faculty member of the GroupLens Research Project at the
University of Minnesota.

If you have any further questions or comments, please contact GroupLens
<grouplens-info@cs.umn.edu>.

CITATION
================================================================================

To acknowledge use of the dataset in publications, please cite the following
paper:

F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History
and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4,
Article 19 (December 2015), 19 pages. DOI=http://dx.doi.org/10.1145/2827872


ACKNOWLEDGEMENTS
================================================================================

Thanks to Shyong Lam and Jon Herlocker for cleaning up and generating the data
set.

FURTHER INFORMATION ABOUT THE GROUPLENS RESEARCH PROJECT
================================================================================

The GroupLens Research Project is a research group in the Department of
Computer Science and Engineering at the University of Minnesota. Members of
the GroupLens Research Project are involved in many research projects related
to the fields of information filtering, collaborative filtering, and
recommender systems. The project is led by professors John Riedl and Joseph
Konstan. The project began to explore automated collaborative filtering in
1992, but is best known for its worldwide trial of an automated
collaborative filtering system for Usenet news in 1996. Since then the project
has expanded its scope to research overall information filtering solutions,
integrating in content-based methods as well as improving current collaborative
filtering technology.

Further information on the GroupLens Research project, including research
publications, can be found at the following web site:

http://www.grouplens.org/

GroupLens Research currently operates a movie recommender based on
collaborative filtering:

http://www.movielens.org/

RATINGS FILE DESCRIPTION
================================================================================

All ratings are contained in the file "ratings.dat" and are in the
following format:

UserID::MovieID::Rating::Timestamp

- UserIDs range between 1 and 6040
- MovieIDs range between 1 and 3952
- Ratings are made on a 5-star scale (whole-star ratings only)
- Timestamp is represented in seconds since the epoch as returned by time(2)
- Each user has at least 20 ratings
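
For orientation, a hedged sketch of loading "ratings.dat" into the dense per-user
rating vectors that the autoencoder above consumes; the matrix shape (6040 users x
3952 movies) follows the ID ranges listed here, and everything else is an assumption.

# Hypothetical loader: builds a users x movies matrix, 0 marking unrated entries.
import numpy as np

ratings = np.zeros((6040, 3952), dtype=np.float32)
with open('ratings.dat', encoding='latin-1') as f:
    for line in f:
        user_id, movie_id, rating, _ = line.strip().split('::')
        ratings[int(user_id) - 1, int(movie_id) - 1] = float(rating)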

USERS FILE DESCRIPTION
================================================================================

User information is in the file "users.dat" and is in the following
format:

UserID::Gender::Age::Occupation::Zip-code

All demographic information is provided voluntarily by the users and is
not checked for accuracy. Only users who have provided some demographic
information are included in this data set.

- Gender is denoted by an "M" for male and an "F" for female
- Age is chosen from the following ranges:

* 1: "Under 18"
* 18: "18-24"
* 25: "25-34"
* 35: "35-44"
* 45: "45-49"
* 50: "50-55"
* 56: "56+"

- Occupation is chosen from the following choices:

* 0: "other" or not specified
* 1: "academic/educator"
* 2: "artist"
* 3: "clerical/admin"
* 4: "college/grad student"
* 5: "customer service"
* 6: "doctor/health care"
* 7: "executive/managerial"
* 8: "farmer"
* 9: "homemaker"
* 10: "K-12 student"
* 11: "lawyer"
* 12: "programmer"
* 13: "retired"
* 14: "sales/marketing"
* 15: "scientist"
* 16: "self-employed"
* 17: "technician/engineer"
* 18: "tradesman/craftsman"
* 19: "unemployed"
* 20: "writer"

MOVIES FILE DESCRIPTION
================================================================================

Movie information is in the file "movies.dat" and is in the following
format:

MovieID::Title::Genres

- Titles are identical to titles provided by the IMDB (including
year of release)
- Genres are pipe-separated and are selected from the following genres:

* Action
* Adventure
* Animation
* Children's
* Comedy
* Crime
* Documentary
* Drama
* Fantasy
* Film-Noir
* Horror
* Musical
* Mystery
* Romance
* Sci-Fi
* Thriller
* War
* Western

- Some MovieIDs do not correspond to a movie due to accidental duplicate
entries and/or test entries
- Movies are mostly entered by hand, so errors and inconsistencies may exist
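
Similarly, a short sketch of splitting the pipe-separated genre field of
"movies.dat"; the '::' and '|' separators follow the format described above,
and the loop itself is a hypothetical illustration.

# Hypothetical: parse each line of movies.dat into its ID, title, and genre list.
with open('movies.dat', encoding='latin-1') as f:
    for line in f:
        movie_id, title, genre_field = line.strip().split('::')
        genres = genre_field.split('|')  # pipe-separated genre list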