-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMICE_imputer.py
92 lines (75 loc) · 2.83 KB
/
MICE_imputer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#%%
'''
Fills the missing ratings using the MICE imputation method with a
RandomForestRegressor as the estimator.
Additionally, movies with fewer than 10 ratings and users with fewer than
50 rated movies are removed from the dataset to reduce noise in the data.
'''
#%%
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
#%%[markdown]
### Loading and reshaping the dataset
# %%
# Start a code cell here: the statements below come right after a
# `#%%[markdown]` header, so cell-aware editors would otherwise treat them as
# part of that markdown cell. Run as a plain script, nothing changes.
ratings = pd.read_csv('data/raw/ratings.csv')
movies = pd.read_csv('data/raw/movies.csv')
# %%
# Quick look at both tables (the tuple is only displayed in interactive use).
ratings.shape, ratings.head(), movies.shape, movies.head()
# %% Build the movie x user rating matrix
# Rows are movieId, columns are userId, cells hold the rating. Movie/user
# pairs without a rating become NaN; duplicate pairs are averaged
# (pivot_table's default aggregation).
df = pd.pivot_table(ratings, values='rating', index='movieId', columns='userId')
df
#%% [markdown]
#### Removing Noise from the data
# * To qualify a movie, a minimum of 10 users should have rated it.
# * To qualify a user, the user should have rated a minimum of 50 movies.
# %%
# Start a code cell so these statements are not part of the markdown cell
# above when the file is run cell-by-cell.
# NOTE: the variable names read "backwards" relative to their contents:
#   users_rating  -> number of ratings each movie received (index: movieId)
#   movies_rating -> number of ratings each user gave      (index: userId)
users_rating = ratings.groupby('movieId')['rating'].count()
movies_rating = ratings.groupby('userId')['rating'].count()
# %% Ratings received per movie, with the 10-vote threshold drawn in red
f, ax = plt.subplots(figsize=(16, 4))
# users_rating is indexed by movieId and holds the number of ratings per movie.
ax.scatter(users_rating.index, users_rating, color='green')
ax.axhline(y=10, color='r')
ax.set_xlabel('MovieId')
ax.set_ylabel('No. of users voted')
plt.show()
#%%
# Keep only movies rated by at least 10 users. The comparison is inclusive
# (`>=`) so it matches the stated minimum of 10 and the threshold line drawn
# at y=10; the former strict `>` also dropped movies with exactly 10 ratings.
df_m = df.loc[users_rating[users_rating >= 10].index, :]
df_m  # with the former strict `> 10` filter: 9724 movies -> 2121
#%%
# Ratings given per user, with the 50-vote threshold drawn in red
f, ax = plt.subplots(figsize=(16, 4))
# movies_rating is indexed by userId and holds the number of ratings per user.
ax.scatter(movies_rating.index, movies_rating, color='green')
ax.axhline(y=50, color='r')
ax.set_xlabel('UserId')
ax.set_ylabel('No. of votes by user')
plt.show()
#%% removing users below threshold
# Keep only users who rated at least 50 movies. The comparison is inclusive
# (`>=`) so it matches the stated minimum of 50 and the y=50 line in the plot;
# the former strict `>` also dropped users with exactly 50 ratings.
# The per-user counts come from the full ratings table, not from the
# movie-filtered matrix `df_m`.
users_threshold = movies_rating[movies_rating >= 50]
# Reuse the selection computed above instead of rebuilding the same mask.
df_u = df_m.loc[:, users_threshold.index]
df_u  # with the former strict `> 50` filter: 610 users -> 378
#%%
# Work on a copy so the filtered matrix `df_u` itself is left untouched.
df_final = df_u.copy()
# %% Fill the matrix with MICE imputer
# Each column (user) with missing ratings is modelled from the other columns
# with a random forest, starting from column means and iterating up to 10
# rounds. The result is a plain NumPy array, so row/column labels are lost.
forest = RandomForestRegressor()
imp = IterativeImputer(
    estimator=forest,
    initial_strategy='mean',
    max_iter=10,
    tol=1e-10,
    random_state=0,
)
df_filled = imp.fit_transform(df_final)
df_filled
# %% # Change the numpy ndarray to dataframe
# Wrap the imputed array in a DataFrame. Columns get ordinal labels
# U_0, U_1, ... derived from the array's width; rows start with the default
# 0..n-1 index and receive their movieId labels in the next cell.
df_clean = pd.DataFrame(
    data=df_filled,
    columns=['U_' + str(i) for i in range(df_filled.shape[1])],
)
df_clean.shape
#%%
# Replace the ordinal row index with the movieId labels of the matrix that was
# fed to the imputer (row order is unchanged by imputation). `df_filled` is a
# NumPy array and has no `.index`, so the labels must come from `df_final`.
# NOTE: run this before the cell that rebinds `df_final` to the transposed
# result.
df_clean.index = df_final.index
#%%
# Flip the matrix: the U_* columns become rows and the former row labels
# become columns. Note that this rebinds `df_final`.
df_final = df_clean.transpose()
df_final
# %% # saving the final df to csv -> Ready for recommender systems
# index=False: the row labels are not written to the file, only the column
# header and the values.
df_final.to_csv('data/preprocessed/df_final.csv', index=False)