-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathR_Recommender.R
135 lines (112 loc) · 5.28 KB
/
R_Recommender.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
## R_Recommender
# Recommender system based on vectorial representation of wordfrequencies of documents
# Loading libraries
library(tm)
library(SnowballC)
# Please specify the Directory in which this file is.
fp = file.path("~","Dropbox","Github","R_Recommender")
#Import texts
setwd(fp)
dir(fp)
books <- Corpus(DirSource(file.path(fp,"texts")))
# The book names
booksnames = c("Alice's Adventures in Wonderland, by Lewis Carroll", "Metamorphosis, by Franz Kafka
Translated by David Wyllie", "Picture of Dorian Gray, by Oscar Wilde", "The Adventures of Tom Sawyer, Complete by
Mark Twain", "Gutenberg EBook of Dracula, by Bram Stoker", "Frankenstein, by Mary Wollstonecraft","Brothers Karamazov by Fyodor
Dostoyevsky", "Siddhartha, by Herman Hesse", "Don Quixote, by Miguel de Cervantes", "The jungle book, by Rudyard Kipling" )
# And emotions
emotions <- c("Anger","Fear", "Hapiness","Sadness","Shame")
## Preprocessing steps
#Remove Punctuation signs, numbers, lowecase and stopwords in english
books <- tm_map(books,removePunctuation)
books <- tm_map(books, removeNumbers)
books <- tm_map(books, tolower)
books <- tm_map(books, removeWords, stopwords("english"))
#Stemize the words and remove unnecesary whitespaces
books <- tm_map(books, stemDocument)
books <- tm_map(books, stripWhitespace)
#Change datatype
books <- tm_map(books, PlainTextDocument)
## Processing steps
#Compute the Term frquecuency matrix
matrix <- DocumentTermMatrix(books)
# Cast is from sparse matrix to a normal matrix
wordMatrix =as.matrix( matrix )
#Function to find the emotion of document vec
emotioner <- function (vec)
{ r = {}
for(j in 1:5)
{ #Compute the dot product of the book with each emotion (emotions are the last five documents)
r[j] = vec %*% wordMatrix[10+j,]
}
# Normalize the result
r = r/sum(r)
# Return the proportion of each emotion in the document
return(r)
}
#Compute the emotion in each document
p = matrix(, nrow = 10, ncol = 5)
for(j in 1:10)
{
p[j,] = emotioner(wordMatrix[j,])
}
#Change the names of rows and columns accordinglu
colnames(p) <- emotions
rownames(p) <- booksnames
#See the result of the proportion of each emotion in every document
print (p)
# Anger Fear Hapiness Sadness Shame
# Alice's Adventures in Wonderland, by Lewis Carroll 0.2543353 0.3352601 0.2196532 0.16763006 0.02312139
# Metamorphosis, by Franz Kafka\nTranslated by David Wyllie 0.1897810 0.2700730 0.4817518 0.05839416 0.00000000
# Picture of Dorian Gray, by Oscar Wilde 0.2703777 0.3558648 0.2345924 0.10139165 0.03777336
# The Adventures of Tom Sawyer, Complete by\n Mark Twain 0.2179837 0.2452316 0.2970027 0.18528610 0.05449591
# Gutenberg EBook of Dracula, by Bram Stoker 0.2485137 0.2639715 0.3543401 0.11058264 0.02259215
# Frankenstein, by Mary Wollstonecraft 0.2349138 0.1767241 0.3836207 0.16594828 0.03879310
# Brothers Karamazov by Fyodor\n Dostoyevsky 0.2900097 0.2332687 0.2929195 0.07953443 0.10426770
# Siddhartha, by Herman Hesse 0.1831683 0.2326733 0.4752475 0.08415842 0.02475248
# Don Quixote, by Miguel de Cervantes 0.2641129 0.1895161 0.3501344 0.11895161 0.07728495
# The jungle book, by Rudyard Kipling 0.3152709 0.3497537 0.1970443 0.11822660 0.01970443
## Testing
# Create two users, one with random profile and one that reads more Happy books
u1 = runif(5, 0, 1)
u2 = c(.1,.1,1,.1,.1)
#Normalize them
u1 = u1/sum(u1)
u2 = u2/sum(u2)
#Produce the recommendation
recommend <- function(u)
{ r = p %*% u
return (r)
}
#Preferences of the user 1
recommend(u1)
#Preferences of the user 2
recommend(u2)
# [,1]
# Alice's Adventures in Wonderland, by Lewis Carroll 0.2196532
# Metamorphosis, by Franz Kafka\nTranslated by David Wyllie 0.4817518
# Picture of Dorian Gray, by Oscar Wilde 0.2345924
# The Adventures of Tom Sawyer, Complete by\n Mark Twain 0.2970027
# Gutenberg EBook of Dracula, by Bram Stoker 0.3543401
# Frankenstein, by Mary Wollstonecraft 0.3836207
# Brothers Karamazov by Fyodor\n Dostoyevsky 0.2929195
# Siddhartha, by Herman Hesse 0.4752475
# Don Quixote, by Miguel de Cervantes 0.3501344
# The jungle book, by Rudyard Kipling 0.1970443
#We will recommend the n top elements in this vector, if n=2
##Recommendation to user 2
#Metamorphosis, by Franz Kafka\nTranslated by David Wyllie 0.4817518
#Siddhartha, by Herman Hesse 0.4752475
## Automatic evaluation of user profile.
# Let's say that user 3 reads books 1,2 & 3, what profiles this gives him. Well it's just
my_books = p[1:3,]
# A function to renurmalize the empirical frequencies of the books
profiler <- function(matrix)
{
r = colSums(matrix)
return(r/sum(r))
}
# He surely likes Fear over Shame
profiler(my_books)
# Anger Fear Hapiness Sadness Shame
# 0.23816467 0.32039931 0.31199915 0.10913862 0.02029825