112 changes: 55 additions & 57 deletions DAE.py
@@ -5,34 +5,34 @@
class DAE:

    def __init__(self, FLAGS):
        ''' Implementation of a deep autoencoder class.'''

        self.FLAGS = FLAGS
        self.weight_initializer = model_helper._get_weight_initializer()
        self.bias_initializer = model_helper._get_bias_initializer()
        self.init_parameters()


    def init_parameters(self):
        '''Initialize the network's weights and biases.'''

        with tf.name_scope('weights'):
            self.W_1 = tf.get_variable(name='weight_1', shape=(self.FLAGS.num_v, self.FLAGS.num_h),
                                       initializer=self.weight_initializer)
            self.W_2 = tf.get_variable(name='weight_2', shape=(self.FLAGS.num_h, self.FLAGS.num_h),
                                       initializer=self.weight_initializer)
            self.W_3 = tf.get_variable(name='weight_3', shape=(self.FLAGS.num_h, self.FLAGS.num_h),
                                       initializer=self.weight_initializer)
            self.W_4 = tf.get_variable(name='weight_4', shape=(self.FLAGS.num_h, self.FLAGS.num_v),
                                       initializer=self.weight_initializer)

        with tf.name_scope('biases'):
            self.b1 = tf.get_variable(name='bias_1', shape=(self.FLAGS.num_h),
                                      initializer=self.bias_initializer)
            self.b2 = tf.get_variable(name='bias_2', shape=(self.FLAGS.num_h),
                                      initializer=self.bias_initializer)
            self.b3 = tf.get_variable(name='bias_3', shape=(self.FLAGS.num_h),
                                      initializer=self.bias_initializer)

    def _inference(self, x):
        ''' Make one forward pass and predict the network outputs.
@@ -42,27 +42,26 @@ def _inference(self, x):
        '''

        with tf.name_scope('inference'):
            a1 = tf.nn.sigmoid(tf.nn.bias_add(tf.matmul(x, self.W_1), self.b1))
            a2 = tf.nn.sigmoid(tf.nn.bias_add(tf.matmul(a1, self.W_2), self.b2))
            a3 = tf.nn.sigmoid(tf.nn.bias_add(tf.matmul(a2, self.W_3), self.b3))
            a4 = tf.matmul(a3, self.W_4)  # linear output layer, predicts the ratings directly
        return a4

    def _compute_loss(self, predictions, labels, num_labels):
        ''' Compute the mean squared error loss between the input and output of the network.

        @param predictions: predictions of the stacked autoencoder
        @param labels: input values of the stacked autoencoder, which serve as the labels at the same time
        @param num_labels: number of non-zero labels in the data set, used to compute the mean

        @return: mean squared error loss tf-operation
        '''

        with tf.name_scope('loss'):
            # Sum of squared errors divided by the number of observed (non-zero)
            # ratings, i.e. the mean is taken only over the rated entries.
            loss_op = tf.div(tf.reduce_sum(tf.square(tf.subtract(predictions, labels))), num_labels)
        return loss_op


    def _optimizer(self, x):
@@ -74,42 +73,41 @@ def _optimizer(self, x):
        @return: root mean squared error
        '''

        outputs = self._inference(x)
        mask = tf.where(tf.equal(x, 0.0), tf.zeros_like(x), x)  # copy of x that is zero at the unrated entries
        num_train_labels = tf.cast(tf.count_nonzero(mask), dtype=tf.float32)  # number of non-zero values in the training set
        bool_mask = tf.cast(mask, dtype=tf.bool)  # boolean mask, True at the rated entries
        outputs = tf.where(bool_mask, outputs, tf.zeros_like(outputs))  # set the output values to zero where the corresponding input values are zero

        MSE_loss = self._compute_loss(outputs, x, num_train_labels)

        if self.FLAGS.l2_reg:
            l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()])
            MSE_loss = MSE_loss + self.FLAGS.lambda_ * l2_loss

        train_op = tf.train.AdamOptimizer(self.FLAGS.learning_rate).minimize(MSE_loss)
        RMSE_loss = tf.sqrt(MSE_loss)

        return train_op, RMSE_loss

    def _validation_loss(self, x_train, x_test):
        ''' Compute the loss at validation time.

        @param x_train: training data samples
        @param x_test: test data samples

        @return: network predictions
        @return: root mean squared error loss between the predicted and the actual ratings
        '''

        outputs = self._inference(x_train)  # use the training sample to make the prediction
        mask = tf.where(tf.equal(x_test, 0.0), tf.zeros_like(x_test), x_test)  # identify the zero values in the test set
        num_test_labels = tf.cast(tf.count_nonzero(mask), dtype=tf.float32)  # count the number of non-zero values
        bool_mask = tf.cast(mask, dtype=tf.bool)
        outputs = tf.where(bool_mask, outputs, tf.zeros_like(outputs))

        MSE_loss = self._compute_loss(outputs, x_test, num_test_labels)
        RMSE_loss = tf.sqrt(MSE_loss)

        return outputs, RMSE_loss

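For orientation, a minimal sketch of how this class might be driven in a TF 1.x session; the FLAGS fields follow those used above, while the iterator wiring and step count are assumptions, not part of this diff.

# Hypothetical usage sketch (TensorFlow 1.x), assuming FLAGS defines num_v, num_h,
# learning_rate, l2_reg, lambda_ and the tf_records_* paths used elsewhere in the repo.
import tensorflow as tf
from data import dataset
from DAE import DAE

def train(FLAGS, num_steps=1000):  # num_steps is a hypothetical choice
    train_set, _ = dataset._get_training_data(FLAGS)
    x = train_set.make_one_shot_iterator().get_next()  # batch of rating vectors

    model = DAE(FLAGS)
    train_op, rmse = model._optimizer(x)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for _ in range(num_steps):
            _, rmse_ = sess.run((train_op, rmse))
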
22 changes: 11 additions & 11 deletions data/dataset.py
@@ -8,7 +8,7 @@ def _get_training_data(FLAGS):
        @return: data for the inference
        '''

    filenames = [os.path.join(FLAGS.tf_records_train_path, f) for f in os.listdir(FLAGS.tf_records_train_path)]

    dataset = tf.data.TFRecordDataset(filenames)
    dataset = dataset.map(parse)
@@ -23,33 +23,33 @@ def _get_training_data(FLAGS):
    dataset2 = dataset2.repeat()
    dataset2 = dataset2.batch(1)
    dataset2 = dataset2.prefetch(buffer_size=1)

    return dataset, dataset2


def _get_test_data(FLAGS):
    ''' Building the input pipeline for the test data.'''

    filenames = [os.path.join(FLAGS.tf_records_test_path, f) for f in os.listdir(FLAGS.tf_records_test_path)]

    dataset = tf.data.TFRecordDataset(filenames)
    dataset = dataset.map(parse)
    dataset = dataset.shuffle(buffer_size=1)
    dataset = dataset.repeat()
    dataset = dataset.batch(1)
    dataset = dataset.prefetch(buffer_size=1)

    return dataset


def parse(serialized):
    ''' Parser for the TFRecords files.'''

    features = {'movie_ratings': tf.FixedLenFeature([3952], tf.float32),
                }
    parsed_example = tf.parse_single_example(serialized,
                                             features=features,
                                             )
    movie_ratings = tf.cast(parsed_example['movie_ratings'], tf.float32)

    return movie_ratings
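
For context, a minimal sketch of how TFRecord files matching this parse function could be produced; the 'movie_ratings' key and vector length 3952 mirror the feature definition above, while the writer helper itself is a hypothetical addition, not part of this diff.

# Hypothetical helper (TensorFlow 1.x) that writes one Example per user,
# each holding a dense float vector of 3952 movie ratings (0 = unrated).
import tensorflow as tf

def write_user_ratings(path, rating_vectors):
    with tf.python_io.TFRecordWriter(path) as writer:
        for ratings in rating_vectors:
            feature = {'movie_ratings': tf.train.Feature(
                float_list=tf.train.FloatList(value=ratings))}
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())
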
170 changes: 170 additions & 0 deletions data/ml-1m/README
@@ -0,0 +1,170 @@
SUMMARY
================================================================================

These files contain 1,000,209 anonymous ratings of approximately 3,900 movies
made by 6,040 MovieLens users who joined MovieLens in 2000.

USAGE LICENSE
================================================================================

Neither the University of Minnesota nor any of the researchers
involved can guarantee the correctness of the data, its suitability
for any particular purpose, or the validity of results based on the
use of the data set. The data set may be used for any research
purposes under the following conditions:

* The user may not state or imply any endorsement from the
University of Minnesota or the GroupLens Research Group.

* The user must acknowledge the use of the data set in
publications resulting from the use of the data set
(see below for citation information).

* The user may not redistribute the data without separate
permission.

* The user may not use this information for any commercial or
revenue-bearing purposes without first obtaining permission
from a faculty member of the GroupLens Research Project at the
University of Minnesota.

If you have any further questions or comments, please contact GroupLens
<grouplens-info@cs.umn.edu>.

CITATION
================================================================================

To acknowledge use of the dataset in publications, please cite the following
paper:

F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History
and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4,
Article 19 (December 2015), 19 pages. DOI=http://dx.doi.org/10.1145/2827872


ACKNOWLEDGEMENTS
================================================================================

Thanks to Shyong Lam and Jon Herlocker for cleaning up and generating the data
set.

FURTHER INFORMATION ABOUT THE GROUPLENS RESEARCH PROJECT
================================================================================

The GroupLens Research Project is a research group in the Department of
Computer Science and Engineering at the University of Minnesota. Members of
the GroupLens Research Project are involved in many research projects related
to the fields of information filtering, collaborative filtering, and
recommender systems. The project is led by professors John Riedl and Joseph
Konstan. The project began to explore automated collaborative filtering in
1992, but is best known for its worldwide trial of an automated
collaborative filtering system for Usenet news in 1996. Since then the project
has expanded its scope to research overall information filtering solutions,
integrating in content-based methods as well as improving current collaborative
filtering technology.

Further information on the GroupLens Research project, including research
publications, can be found at the following web site:

http://www.grouplens.org/

GroupLens Research currently operates a movie recommender based on
collaborative filtering:

http://www.movielens.org/

RATINGS FILE DESCRIPTION
================================================================================

All ratings are contained in the file "ratings.dat" and are in the
following format:

UserID::MovieID::Rating::Timestamp

- UserIDs range between 1 and 6040
- MovieIDs range between 1 and 3952
- Ratings are made on a 5-star scale (whole-star ratings only)
- Timestamp is represented in seconds since the epoch as returned by time(2)
- Each user has at least 20 ratings
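
For orientation, a hedged sketch of loading "ratings.dat" into the dense per-user
rating vectors that the autoencoder above consumes; the matrix shape (6040 users x
3952 movies) follows the ID ranges listed here, and everything else is an assumption.

# Hypothetical loader: builds a users x movies matrix, 0 marking unrated entries.
import numpy as np

ratings = np.zeros((6040, 3952), dtype=np.float32)
with open('ratings.dat', encoding='latin-1') as f:
    for line in f:
        user_id, movie_id, rating, _ = line.strip().split('::')
        ratings[int(user_id) - 1, int(movie_id) - 1] = float(rating)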

USERS FILE DESCRIPTION
================================================================================

User information is in the file "users.dat" and is in the following
format:

UserID::Gender::Age::Occupation::Zip-code

All demographic information is provided voluntarily by the users and is
not checked for accuracy. Only users who have provided some demographic
information are included in this data set.

- Gender is denoted by an "M" for male and an "F" for female
- Age is chosen from the following ranges:

* 1: "Under 18"
* 18: "18-24"
* 25: "25-34"
* 35: "35-44"
* 45: "45-49"
* 50: "50-55"
* 56: "56+"

- Occupation is chosen from the following choices:

* 0: "other" or not specified
* 1: "academic/educator"
* 2: "artist"
* 3: "clerical/admin"
* 4: "college/grad student"
* 5: "customer service"
* 6: "doctor/health care"
* 7: "executive/managerial"
* 8: "farmer"
* 9: "homemaker"
* 10: "K-12 student"
* 11: "lawyer"
* 12: "programmer"
* 13: "retired"
* 14: "sales/marketing"
* 15: "scientist"
* 16: "self-employed"
* 17: "technician/engineer"
* 18: "tradesman/craftsman"
* 19: "unemployed"
* 20: "writer"

MOVIES FILE DESCRIPTION
================================================================================

Movie information is in the file "movies.dat" and is in the following
format:

MovieID::Title::Genres

- Titles are identical to titles provided by the IMDB (including
year of release)
- Genres are pipe-separated and are selected from the following genres:

* Action
* Adventure
* Animation
* Children's
* Comedy
* Crime
* Documentary
* Drama
* Fantasy
* Film-Noir
* Horror
* Musical
* Mystery
* Romance
* Sci-Fi
* Thriller
* War
* Western

- Some MovieIDs do not correspond to a movie due to accidental duplicate
entries and/or test entries
- Movies are mostly entered by hand, so errors and inconsistencies may exist
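
Similarly, a short sketch of splitting the pipe-separated genre field of
"movies.dat"; the '::' and '|' separators follow the format described above,
and the loop itself is a hypothetical illustration.

# Hypothetical: parse each line of movies.dat into its ID, title, and genre list.
with open('movies.dat', encoding='latin-1') as f:
    for line in f:
        movie_id, title, genre_field = line.strip().split('::')
        genres = genre_field.split('|')  # pipe-separated genre list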