-
Notifications
You must be signed in to change notification settings - Fork 2
/
usingPearsonCorrelation.py
154 lines (132 loc) · 6.49 KB
/
usingPearsonCorrelation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# from sklearn.neighbors import NearestNeighbors
# from scipy.sparse import csr_matrix
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
class recommendationClass:
    """Exploratory book recommender built on three CSV datasets.

    The instance holds three pandas DataFrames:

    * ``books``   - columns ISBN, bookTitle, bookAuthor, yearOfPublication,
      publisher
    * ``users``   - columns userID, Name, Age, Interest
    * ``ratings`` - columns userID, ISBN, bookRating

    Methods print their intermediate tables to stdout; ``recommendation``
    additionally returns the table of correlated books.
    """

    # Column labels imposed on each dataset after loading (the header row
    # found in the file is overwritten with these names).
    BOOK_COLUMNS = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher']
    USER_COLUMNS = ['userID', 'Name', 'Age', 'Interest']
    RATING_COLUMNS = ['userID', 'ISBN', 'bookRating']

    def __init__(self, books_path='booksForExcel.csv', users_path='student.csv',
                 ratings_path='ratings.csv'):
        """Load the books, student and ratings datasets.

        Args:
            books_path: CSV file with the book catalogue.
            users_path: CSV file with the students.
            ratings_path: CSV file with one (user, book, rating) row per line.

        Raises:
            FileNotFoundError: If one of the files does not exist.
            ValueError: If a file does not have the expected number of columns.
        """
        self.books = self._read_csv(books_path, self.BOOK_COLUMNS)
        self.users = self._read_csv(users_path, self.USER_COLUMNS)
        self.ratings = self._read_csv(ratings_path, self.RATING_COLUMNS)

    @staticmethod
    def _read_csv(path, columns):
        """Read a latin-1 CSV, skip malformed rows and rename its columns."""
        try:
            # ``on_bad_lines`` (pandas >= 1.3) replaces ``error_bad_lines``,
            # which was removed in pandas 2.0. 'warn' matches the old
            # ``error_bad_lines=False`` behaviour: report the row, then skip it.
            frame = pd.read_csv(path, on_bad_lines='warn', encoding="latin-1")
        except TypeError:
            # Older pandas does not know ``on_bad_lines``.
            frame = pd.read_csv(path, error_bad_lines=False, encoding="latin-1")
        frame.columns = list(columns)
        return frame

    def show(self):
        """Print the three datasets."""
        print(self.books)
        print(self.users)
        print(self.ratings)

    def shapeOfData(self):
        """Print the (rows, columns) shape and the column names of each dataset."""
        print(self.ratings.shape)
        print(list(self.ratings.columns))
        print(self.books.shape)
        print(list(self.books.columns))
        print(self.users.shape)
        print(list(self.users.columns))

    def diagram(self):
        """Plot rating and age distributions and their box plots.

        The two distribution charts are saved as ``system1.png`` and
        ``system2.png`` in the working directory; every figure is also shown
        on screen.
        """
        plt.rc("font", size=15)

        # Rating distribution. Sorting by the rating value keeps the x axis
        # in rating order instead of order of first appearance.
        plt.figure()
        self.ratings.bookRating.value_counts(sort=False).sort_index().plot(kind='bar')
        plt.title('Rating Distribution\n')
        plt.xlabel('Rating')
        plt.ylabel('Count')
        plt.savefig('system1.png', bbox_inches='tight')
        plt.show()

        # Student age distribution.
        plt.figure()
        self.users.Age.hist(bins=[18, 20, 22, 24, 26, 28, 30, 32, 40])
        plt.title('Age Distribution\n')
        plt.xlabel('Age')
        plt.ylabel('Count')
        plt.savefig('system2.png', bbox_inches='tight')
        plt.show()

        # Box plots. Each plot gets its own axes; without an explicit ``ax``
        # they would all be drawn on top of each other on the current axes.
        _, (rating_ax, age_ax) = plt.subplots(1, 2)
        self.ratings.boxplot(column=['bookRating'], grid=False, ax=rating_ax)
        self.users.boxplot(column=['Age'], ax=age_ax)
        plt.show()

        # Same box plots rendered with seaborn. The style must be set before
        # the axes are created to take effect.
        sns.set_style("whitegrid")
        _, (rating_ax, age_ax) = plt.subplots(1, 2)
        sns.boxplot(y='bookRating', data=self.ratings, ax=rating_ax)
        sns.boxplot(y='Age', data=self.users, ax=age_ax)
        plt.show()

    def recommendation(self, isbn='978-0070634244', top_n=5, min_rating_count=2,
                       min_user_ratings=1):
        """Print popularity and Pearson-correlation based recommendations.

        First the ``top_n`` most frequently rated books are listed. Then the
        ratings are pivoted into a user x ISBN matrix and every book's rating
        column is correlated (Pearson) with the column of ``isbn``.

        Args:
            isbn: ISBN of the book to find similar books for.
            top_n: Number of most-rated books to list.
            min_rating_count: Minimum number of ratings a book needs to be
                kept in the correlation result.
            min_user_ratings: Minimum number of ratings a user must have
                given for their ratings to enter the rating matrix.

        Returns:
            DataFrame indexed by ISBN with columns ``pearsonR`` and
            ``ratingCount``: at most ten books, sorted by descending
            correlation with ``isbn`` (the book itself is included).

        Raises:
            KeyError: If ``isbn`` has no ratings in the rating matrix.
            ValueError: If a user rated the same ISBN more than once
                (``DataFrame.pivot`` cannot reshape duplicate entries).
        """
        # --- Recommendation based on rating count ---------------------------
        rating_count = pd.DataFrame(self.ratings.groupby('ISBN')['bookRating'].count())
        print(rating_count)
        most_rated = rating_count.sort_values('bookRating', ascending=False).head(top_n)
        print(most_rated)
        # Book details of the most-rated books, taken from the ranking just
        # computed rather than from a fixed list of ISBNs.
        most_rated_books = most_rated.rename_axis('ISBN').reset_index()[['ISBN']]
        most_rated_books_summary = pd.merge(most_rated_books, self.books, on='ISBN')
        print(most_rated_books_summary)

        # --- Average rating and rating count per book -----------------------
        average_rating = pd.DataFrame(self.ratings.groupby('ISBN')['bookRating'].mean())
        print(average_rating)
        average_rating['ratingCount'] = rating_count['bookRating']
        # The most-rated book is not necessarily the best-rated one, which is
        # why rating counts alone make a poor recommendation signal.
        print(average_rating.sort_values('ratingCount', ascending=False).head(10))

        # --- Restrict to sufficiently active users ---------------------------
        user_counts = self.ratings['userID'].value_counts()
        print(user_counts)
        active_users = user_counts[user_counts >= min_user_ratings].index
        ratings = self.ratings[self.ratings['userID'].isin(active_users)]
        print(ratings)
        value_counts = ratings['bookRating'].value_counts()
        print(value_counts)
        # Keeps rating values that occur at least once, i.e. every row.
        # NOTE(review): retained so the printed output stays the same; raise
        # the threshold if rare rating values should really be dropped.
        ratings = ratings[ratings['bookRating'].isin(value_counts[value_counts >= 1].index)]
        print(ratings)

        # --- Pearson correlation ---------------------------------------------
        # User x ISBN rating matrix; NaN where a user did not rate a book.
        ratings_pivot = ratings.pivot(index='userID', columns='ISBN', values='bookRating')
        print(ratings_pivot.index)
        print(ratings_pivot.columns)
        print(ratings_pivot.shape)
        print(ratings_pivot.head())

        if isbn not in ratings_pivot.columns:
            raise KeyError('ISBN {!r} has no ratings to correlate against'.format(isbn))
        target_ratings = ratings_pivot[isbn]

        # ``to_frame`` labels the column reliably; ``pd.DataFrame(series,
        # columns=[...])`` would yield an all-NaN column for a named series.
        correlations = ratings_pivot.corrwith(target_ratings).rename('pearsonR').to_frame()
        # Books sharing fewer than two raters with the target have no defined
        # correlation and are dropped.
        correlations = correlations.dropna().rename_axis('ISBN')
        corr_summary = correlations.join(average_rating['ratingCount'])
        top_matches = corr_summary[corr_summary['ratingCount'] >= min_rating_count].sort_values(
            'pearsonR', ascending=False).head(10)
        print(top_matches)

        # Book details of the correlated books.
        corr_books = pd.merge(top_matches.reset_index()[['ISBN']], self.books, on='ISBN')
        print(corr_books)
        return top_matches
# Script driver: load the datasets, then run every report in order
# (raw tables, shapes, charts, recommendations).
obj = recommendationClass()
for _step in (obj.show, obj.shapeOfData, obj.diagram, obj.recommendation):
    _step()