Skip to content

Commit 95e748f

Browse files
Text cleaning - removing numbers, punctuation, etc. from the restaurant reviews
1 parent e81de6a commit 95e748f

File tree

1 file changed

+24
-5
lines changed
  • Part 7 - Natural Language Processing

1 file changed

+24
-5
lines changed

Part 7 - Natural Language Processing/NLP.R

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,16 @@
11
# Natural Language Processing
22

3+
# Libraries
4+
5+
# corpus
6+
install.packages("slam", type = "binary")
7+
install.packages('tm')
8+
library(tm)
9+
10+
# stopwords()
11+
install.packages('SnowballC')
12+
library(SnowballC)
13+
314
# Importing the dataset
415

516
# get a dataset with columns separated by a tab
@@ -19,15 +30,10 @@ dataset = read.delim('Restaurant_Reviews.tsv',
1930
# distinct words = columns
2031
# each cell counts how many times the word appears in the given review
2132

22-
install.packages("slam", type = "binary")
23-
24-
install.packages('tm')
25-
library(tm)
2633

2734
# Build a separate corpus from the Review column; the cleaning steps are
# applied to this corpus, not to the reviews stored in the dataset itself.
corpus <- VCorpus(VectorSource(dataset$Review))
2835

2936
# 1. words -> lowercase
30-
3137
# so we don't get two versions of the same word (one lowercase, one uppercase)
3238
# tm_map - applies a transformation to every document and returns the updated corpus
3339
# content_transformer - wraps a plain function (here tolower) so tm_map can apply it to each document's content
@@ -36,5 +42,18 @@ corpus = tm_map(corpus, content_transformer(tolower))
3642

3743
# View the first element of the corpus as plain text.
as.character(corpus[[1]])
3844

45+
# 2. numbers -> remove
46+
# Apply tm's removeNumbers transformation to every document in the corpus.
corpus <- tm_map(corpus, removeNumbers)
47+
48+
# View review 841, which had a number in it, to check the result of the
# number-removal step.
as.character(corpus[[841]])
49+
50+
# 3. punctuation -> remove
51+
# Apply tm's removePunctuation transformation to every document in the corpus.
corpus <- tm_map(corpus, removePunctuation)
52+
53+
# 4. irrelevant words (stop words) -> remove
54+
# Drop every word that appears in the default stopwords() list.
# NOTE(review): punctuation is stripped before this step, so stop words that
# contain an apostrophe (e.g. "don't" is now "dont") presumably no longer match
# the stopwords() list - confirm, and consider removing stop words first.
corpus <- tm_map(corpus, removeWords, stopwords())
55+
56+
57+
3958

4059

0 commit comments

Comments
 (0)