Skip to content

Commit f9e4b8f

Browse files
committed
update
1 parent de016bb commit f9e4b8f

File tree

1 file changed

+72
-0
lines changed

1 file changed

+72
-0
lines changed

recommenders/tfidf.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import pandas as pd
2+
import json
3+
4+
from sklearn.feature_extraction.text import TfidfVectorizer
5+
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
6+
7+
8+
# Data source: https://www.kaggle.com/tmdb/tmdb-movie-metadata
# Load the TMDB movies CSV into a DataFrame. The path is relative, so it is
# resolved against the current working directory. Later code in this file
# reads the 'genres', 'keywords' and 'title' columns of this frame.
df = pd.read_csv('../large_files/tmdb_5000_movies.csv')
11+
12+
13+
# convert the relevant data for each movie into a single string
14+
# to be ingested by TfidfVectorizer
15+
def genres_and_keywords_to_string(row):
    """Build the text document for one movie from its genres and keywords.

    ``row['genres']`` and ``row['keywords']`` are each parsed as a JSON list
    of objects carrying a ``name`` entry. All whitespace inside a name is
    removed so that a multi-word name becomes a single token. The tokens are
    space-separated, genres first, then keywords, with one space between the
    two groups (so the result starts with a space when there are no genres).
    """
    def collapse(field):
        # One token per JSON entry: its name with internal whitespace removed.
        tokens = []
        for entry in json.loads(row[field]):
            tokens.append(''.join(entry['name'].split()))
        return ' '.join(tokens)

    genre_text = collapse('genres')
    keyword_text = collapse('keywords')
    return ' '.join((genre_text, keyword_text))
22+
23+
24+
# Build one text document per movie: apply the helper row-wise (axis=1) and
# store the combined genres/keywords string in a new 'string' column.
df['string'] = df.apply(genres_and_keywords_to_string, axis=1)

# Create the TF-IDF vectorizer, capped at 2000 features.
# NOTE(review): no stop_words argument is passed here, so stop-word removal
# is not requested by this call -- confirm whether that is intended.
tfidf = TfidfVectorizer(max_features=2000)

# Fit the vectorizer and build the data matrix from the genres/keywords
# strings (the 'string' column), not from the movie overviews.
X = tfidf.fit_transform(df['string'])

# check the shape of X
print("X.shape:", X.shape)

# Mapping from movie title -> index label in df. Titles are used as the
# Series index as-is, so looking up a title that occurs more than once can
# return several entries instead of a single value.
movie2idx = pd.Series(df.index, index=df['title'])
40+
41+
# create a function that generates recommendations
42+
def recommend(title, top_n=5):
    """Return the titles of the movies most similar to ``title``.

    Similarity is the cosine similarity between the query movie's row of the
    module-level TF-IDF matrix ``X`` and every row of ``X``.

    Args:
        title: Movie title to look up in ``movie2idx``. If several movies
            share the title, the first match is used.
        top_n: Maximum number of recommendations to return. Defaults to 5,
            which matches the previously hard-coded behaviour.

    Returns:
        The ``title`` entries of ``df`` for the highest-scoring movies, best
        first, never including the query movie itself.

    Raises:
        KeyError: If ``title`` is not present in ``movie2idx``.
    """
    # get the row in the dataframe for this movie
    idx = movie2idx[title]
    if isinstance(idx, pd.Series):
        # duplicate titles produce a Series of indexes; keep the first one
        idx = idx.iloc[0]

    # pairwise similarities come back as a 1 x N array; make it 1-D
    scores = cosine_similarity(X[idx], X).flatten()

    # Rank all movies best-first, then remove the query movie by its index.
    # Simply skipping position 0 is not safe: a movie with an identical
    # vector (tied score) or an all-zero query vector can put a different
    # movie first, which would drop a real match and recommend the query.
    ranked = (-scores).argsort()
    ranked = ranked[ranked != idx]

    # return the titles of the recommendations
    return df['title'].iloc[ranked[:top_n]]
63+
64+
65+
# Demo: print the recommendations for a few sample titles. The output is the
# same as printing each header/result pair by hand, without the repetition.
for sample_title in ('Scream 3', 'Mortal Kombat', 'Runaway Bride'):
    print("\nRecommendations for '%s':" % sample_title)
    print(recommend(sample_title))

0 commit comments

Comments
 (0)