1
1
# Natural Language Processing
2
2
3
+ # Libraries
4
+
5
+ # corpus
6
+ install.packages("slam", type = "binary")
7
+ install.packages('tm')
8
+ library(tm )
9
+
10
+ # stopwords()
11
+ install.packages('SnowballC')
12
+ library(SnowballC )
13
+
3
14
# Importing the dataset
4
15
5
16
# get a dataset with columns separated by a tab
@@ -19,15 +30,10 @@ dataset = read.delim('Restaurant_Reviews.tsv',
19
30
# distinct words = columns
20
31
# count the number of times each word appears in each review
21
32
22
- install.packages("slam", type = "binary")
23
-
24
- install.packages('tm')
25
- library(tm )
26
33
27
34
corpus = VCorpus(VectorSource(dataset $ Review )) # the reviews are not cleaned directly in the dataset; the corpus is a separate object holding only the reviews
28
35
29
36
# 1. words -> lower cases
30
-
31
37
# so we don't get two versions of the same word in lower and upper case
32
38
# tm_map - update the corpus
33
39
# content_transformer - transforming function for all the words in the corpus
@@ -36,5 +42,18 @@ corpus = tm_map(corpus, content_transformer(tolower))
36
42
37
43
as.character(corpus [[1 ]]) # to view the first element of the corpus
38
44
45
+ # 2. numbers -> remove
46
+ corpus = tm_map(corpus , removeNumbers )
47
+
48
+ as.character(corpus [[841 ]]) # view a review that had a number in it, to check the number was removed
49
+
50
+ # 3. punctuation -> remove
51
+ corpus = tm_map(corpus , removePunctuation )
52
+
53
+ # 4. non-relevant words (stop words) -> remove
54
+ corpus = tm_map(corpus , removeWords , stopwords())
55
+
56
+
57
+
39
58
40
59
0 commit comments