update

bob7783 · bob7783 · commit f9e4b8ff660b · 2019-08-25T02:51:36.000-04:00
diff --git a/recommenders/tfidf.py b/recommenders/tfidf.py
@@ -0,0 +1,72 @@
+import pandas as pd
+import json
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
+
+
+# get the data from: https://www.kaggle.com/tmdb/tmdb-movie-metadata
+# load in the data
+df = pd.read_csv('../large_files/tmdb_5000_movies.csv')
+
+
+# convert the relevant data for each movie into a single string
+# to be ingested by TfidfVectorizer
+def genres_and_keywords_to_string(row):
+  genres = json.loads(row['genres'])
+  genres = ' '.join(''.join(j['name'].split()) for j in genres)
+
+  keywords = json.loads(row['keywords'])
+  keywords = ' '.join(''.join(j['name'].split()) for j in keywords)
+  return "%s %s" % (genres, keywords)
+
+
+# create a new string representation of each movie
+df['string'] = df.apply(genres_and_keywords_to_string, axis=1)
+
+
+# create a tf-idf vectorizer object
+# remove stopwords automatically
+tfidf = TfidfVectorizer(max_features=2000)
+
+# create a data matrix from the overviews
+X = tfidf.fit_transform(df['string'])
+
+# check the shape of X
+print("X.shape:", X.shape)
+
+# generate a mapping from movie title -> index (in df)
+movie2idx = pd.Series(df.index, index=df['title'])
+
+# create a function that generates recommendations
+def recommend(title):
+  # get the row in the dataframe for this movie
+  idx = movie2idx[title]
+  if type(idx) == pd.Series:
+    idx = idx.iloc[0]
+  # print("idx:", idx)
+
+  # calculate the pairwise similarities for this movie
+  query = X[idx]
+  scores = cosine_similarity(query, X)
+
+  # currently the array is 1 x N, make it just a 1-D array
+  scores = scores.flatten()
+
+  # get the indexes of the highest scoring movies
+  # get the first K recommendations
+  # don't return itself!
+  recommended_idx = (-scores).argsort()[1:6]
+
+  # return the titles of the recommendations
+  return df['title'].iloc[recommended_idx]
+
+
+print("\nRecommendations for 'Scream 3':")
+print(recommend('Scream 3'))
+
+print("\nRecommendations for 'Mortal Kombat':")
+print(recommend('Mortal Kombat'))
+
+print("\nRecommendations for 'Runaway Bride':")
+print(recommend('Runaway Bride'))