imdb.Rmd

---
title: "Text Classification Case Study: IMDB Dataset"
author: "Dr. Stephen W. Thomas, Queen's University"
date: "2017"
output:
  pdf_document:
    highlight: pygments
    number_sections: yes
    toc: no
    toc_depth: '2'
---


```{r}
# Install pacman on demand; p_load() then installs (if needed) and attaches
# every package listed below.
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidytext, tidyverse, cleanNLP, lubridate, RWeka, class, e1071, caret,
               stringr, scales, rpart, rpart.plot, MLmetrics, stringi, devtools)

# This analysis is pinned to tm 0.7.1. Check that tm is installed at all
# before asking for its version: packageVersion() raises an error for a
# missing package, which would abort a first-time run before the install.
# system.file() is used for the check because it does not load the package.
tm_missing <- !nzchar(system.file(package = "tm"))
if (tm_missing || packageVersion("tm") != "0.7.1") {
  devtools::install_version("tm", version = "0.7.1", repos = "http://cran.us.r-project.org")
}
library(tm)
```

# Read in the data

```{r}
# Read the pipe-delimited review file. quote = "" is passed so that quote
# characters inside the review text are not treated as field quoting.
imdb <- read_delim("data/all.imdb.pipe.csv", delim = "|", quote = "")

# The dataset has no per-row ID, so give each row a unique one here.
# tm's DataframeSource (used later) requires the first column to be named
# "doc_id" and the second to be named "text". "sentiment" is renamed to
# "sentiment.score" so the label column cannot be confused with the word
# "sentiment" in the document-term matrix later.
imdb <- imdb %>%
  mutate(doc_id = seq_len(n())) %>%  # seq_len() is safe even for zero rows
  rename(text = review, sentiment.score = sentiment) %>%
  dplyr::select(doc_id, text, everything())

dim(imdb)
head(imdb)

# For now, keep only a 5% random sample to speed up the analysis. Raise the
# fraction once you are ready to let this script run for a while.
set.seed(123)
imdb <- imdb %>%
  sample_frac(0.05)
```


Do some initial cleaning of strange characters in the text data.

```{r}

# The raw file contained a number of odd, non-ASCII characters. Some were
# already replaced in the input file itself, in vim, using the command:
# :.,$s/[\x97]/ - /g
#
# That was done for (if memory serves correctly): \u0096, \u0097, \u0084,
# \u008d, \u0095, and \u0091.
#
# The pipeline below cleans up a few more from within R: transliterate to
# ASCII where possible, delete line-break sequences, blank out assorted
# symbols, and map the acute accent / diaeresis to plain quote characters.

imdb$text <- imdb$text %>%
  stringi::stri_trans_general("latin-ascii") %>%
  gsub("\\R", "", ., perl = TRUE) %>%
  gsub("£|¢|§|¦|·|¡|°|¿", " ", ., perl = TRUE) %>%
  gsub("´", "'", ., perl = TRUE) %>%
  gsub("¨", '"', ., perl = TRUE)


# For reference, the non-ASCII characters were located with:
# grep("I_WAS_NOT_ASCII", iconv(imdb$text, "latin1", "ASCII", sub="I_WAS_NOT_ASCII"))

# Inspect the result: first rows, then the full text of the fourth review.
head(imdb)
imdb$text[4]
```

Some simple descriptive stats.

```{r}
# Exploratory only (not needed for the analysis): trying to work out what
# the uid column means.

# Number of rows per uid, largest first.
arrange(
  summarize(group_by(imdb, uid), count = n()),
  desc(count)
)

# Every row belonging to one particular uid.
filter(imdb, uid == 24)

# Number of rows in each sentiment class.
summarize(group_by(imdb, sentiment.score), count = n())
```


# Convert to tidy text

```{r}
# Tokenize the `text` column into a `word` column, one token per row.
tidy <- unnest_tokens(imdb, word, text)
head(tidy)
```


Some more descriptive stats, the tidy way.

Most frequent words.

```{r}
# Occurrence count of every word across all reviews, most frequent first.
word_freqs <- arrange(
  summarize(group_by(tidy, word), n = n()),
  desc(n)
)

# The 50 most frequent words.
top_n(word_freqs, 50)
```


Least frequent words.

```{r}
# A negative n makes top_n() select from the bottom of the ranking instead.
top_n(word_freqs, -50)
```


Sentiment distribution.

```{r}
# Token count per sentiment class, plus each class's share of all tokens.
mutate(
  summarize(group_by(tidy, sentiment.score), n = n()),
  freq = n / sum(n)
)
```


Most positive and negative words.

```{r}
# Count how often each word occurs within each sentiment class. Ungroup
# afterwards so the grouping on sentiment.score left behind by summarize()
# is not carried into the reshape below.
sentiment_words_count <- tidy %>%
  group_by(sentiment.score, word) %>%
  summarize(count = n()) %>%
  ungroup() %>%
  arrange(desc(count))

# One row per word, with its `negative` and `positive` counts side by side
# (0 where a word never occurs in a class).
# The log ratio uses add-one smoothing: without it, a word seen in only one
# class gets a ratio of +/-Inf, which then dominates the top-N selection
# below and cannot be drawn as a bar.
log_ratios <- sentiment_words_count %>%
  spread(sentiment.score, count, fill = 0) %>%
  mutate(total = negative + positive,
         log_ratio = log2((positive + 1) / (negative + 1)))

# Select the most polarizing words: among words occurring more than
# `min_total` times, keep the `n_per_side` largest |log_ratio| values on the
# positive side and on the negative side, and order the `word` factor by
# log_ratio (so plots list words from most negative to most positive).
most_polarizing <- function(ratios, min_total, n_per_side) {
  ratios %>%
    filter(total > min_total) %>%
    group_by(log_ratio < 0) %>%
    top_n(n_per_side, abs(log_ratio)) %>%
    ungroup() %>%
    mutate(word = reorder(word, log_ratio))
}

# Save this for later, to be used as features in classification:
# the most polarizing words
top_log_ratios <- most_polarizing(log_ratios, min_total = 50, n_per_side = 40)

# Bar chart of the 15 most polarizing words per side.
most_polarizing(log_ratios, min_total = 50, n_per_side = 15) %>%
  ggplot(aes(word, log_ratio, fill = log_ratio < 0)) +
  geom_col() +
  coord_flip() +
  ylab("log odds ratio") +
  scale_fill_discrete(name = "", labels = c("positive", "negative"))
```


Log odds ratio chart for n-grams (here, bigrams).

```{r}
# Tokenize into bigrams (n = 2). The variable names carry a "_tri" suffix
# even though n = 2; they are kept unchanged so that any existing references
# to them keep working.
tidy_tri <- imdb %>%
  unnest_tokens(word, text, token = "ngrams", n = 2)

# Count how often each bigram occurs within each sentiment class. Ungroup
# afterwards so the grouping on sentiment.score left behind by summarize()
# is not carried into the reshape below.
sentiment_words_count_tri <- tidy_tri %>%
  group_by(sentiment.score, word) %>%
  summarize(count = n()) %>%
  ungroup() %>%
  arrange(desc(count))

# One row per bigram, with its `negative` and `positive` counts side by side
# (0 where a bigram never occurs in a class).
# The log ratio uses add-one smoothing: without it, a bigram seen in only one
# class gets a ratio of +/-Inf, which then dominates the top-N selection
# below and cannot be drawn as a bar.
log_ratios_tri <- sentiment_words_count_tri %>%
  spread(sentiment.score, count, fill = 0) %>%
  mutate(total = negative + positive,
         log_ratio = log2((positive + 1) / (negative + 1)))

# Bar chart of the 10 most polarizing bigrams per side, restricted to
# bigrams that occur more than 100 times overall.
log_ratios_tri %>%
  filter(total > 100) %>%
  group_by(log_ratio < 0) %>%
  top_n(10, abs(log_ratio)) %>%
  ungroup() %>%
  mutate(word = reorder(word, log_ratio)) %>%
  ggplot(aes(word, log_ratio, fill = log_ratio < 0)) +
  geom_col() +
  coord_flip() +
  ylab("log odds ratio") +
  scale_fill_discrete(name = "", labels = c("positive", "negative"))
```


# Build a classifier.

## Create the feature matrix.

There are many features we could compute for each document:

- Individual words (weighted by term frequency, binary frequency, TF-IDF, ...)
- n-grams (again, with various weights)
- Topic memberships (after running e.g., LDA)
- Cluster memberships (after running e.g., kmeans)
- Text length

We'll explore a few different features.

The overall strategy will be to create a DocumentTermMatrix (from the `tm` package) and input that into some ML models to do the classification.


First, create the DTM.

```{r}
# Build a tm corpus, preprocess it, and turn it into a TF-IDF weighted
# document-term matrix (DTM) over 1- to 3-grams.
#
# We could transform our tidy dataframe directly into a tm DTM, using the
# cast_dtm function. Or, we can create the corpus from the original data.
# Here, we show how to do it both ways. Set the `use_tidy` flag below to
# experiment with each way. Either way, the branch only produces `corpus`;
# the final `dtm` is (re)built from that corpus further down.

use_tidy = FALSE
if (use_tidy == TRUE) {
  # First, need to get the counts of each polarizing word in each doc
  tidy_counts = tidy %>%
    filter(word %in% top_log_ratios$word) %>% # only keep most polarizing words
    group_by(doc_id, uid, sentiment.score, word) %>%
    summarize(count = n())
  
  # Make the DTM (one row per doc_id, one column per word, cells = count)
  # NOTE(review): dim(dtm) sits inside the braces of the `if`, so its value
  # is probably not auto-printed here -- confirm, and wrap in print() if the
  # output is wanted.
  dtm <- tidy_counts %>%
    cast_dtm(doc_id, word, count)
  dim(dtm)
  inspect(dtm)
  
  # Now, if we want to use some of tm's preprocessing functions, we need to first convert the DTM
  # to a corpus object, then run the preprocessing functions, and then convert the corpus object
  # back to a DTM. (Strange, I know.)
  
  # Convert DTM to a list of text: for each document (row), repeat every
  # term name as many times as its count and join them with spaces.
  dtm_list <- apply(dtm, 1, function(x) {
      paste(rep(names(x), x), collapse=" ")
  })
  
  corpus <- VCorpus(VectorSource(dtm_list))
} else {
  # To retain the metadata, use a custom reader that maps data frame columns
  # to document content, id, and metadata fields.
  # NOTE(review): the mapping refers to a "score" column; confirm that the
  # input data actually has one. Also confirm that readTabular() exists in
  # the installed tm version.
  m <- list(content = "text", id = "doc_id", 
            score="score", sentiment.score = "sentiment.score", uid = "uid")
  myReader <- readTabular(mapping = m)
  corpus <- Corpus(DataframeSource(imdb), readerControl = list(reader = myReader))
}


# Output some example content, before preprocessing.
corpus[[1]]$content
corpus[[2]]$content
corpus[[3]]$content
corpus[[4]]$content

# Preprocessing, applied in this order: lowercase, drop numbers, drop
# punctuation, drop English stopwords, drop the token "br" (presumably what
# is left of HTML <br /> tags once punctuation is gone -- confirm), stem,
# and collapse repeated whitespace.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords('english'))
corpus <- tm_map(corpus, removeWords, c("br"))
corpus <- tm_map(corpus, stemDocument, language = "english") 
corpus <- tm_map(corpus, stripWhitespace)

# Output the same example content, after preprocessing.
corpus[[1]]$content
corpus[[2]]$content
corpus[[3]]$content
corpus[[4]]$content

# When we build the DTM, we want n-grams! This tokenizer emits every
# 1-gram, 2-gram and 3-gram (min = 1, max = 3) of a document.
NgramTokenizer  <- function(x) {
  RWeka::NGramTokenizer(x, RWeka::Weka_control(min = 1, max = 3))
}

# Build the DTM over those n-grams, weighted by TF-IDF with normalize = FALSE.
dtm <- DocumentTermMatrix(corpus, 
                          control = list(tokenize = NgramTokenizer, 
                                         weighting = function(x) weightTfIdf(x, normalize = FALSE)))

dim(dtm)

# Only keep the top features, to keep the feature matrix reasonable in size.
# The second argument is the maximal sparsity a term may have to be kept.
dtm = removeSparseTerms(dtm, .994)
dim(dtm)
```

Convert the dtm to a dataframe, so we can pass it into some ML methods.
```{r}
# Dense data frame of the DTM: one row per document, one column per term.
df <- as.data.frame(as.matrix(dtm))

# The truth label (i.e., sentiment.score) for every document id.
labels <- dplyr::select(imdb, doc_id, sentiment.score)

# Attach the label to each document by matching the data frame's row names
# against doc_id, keeping every row of `df`.
df_l <- merge(x = df, y = labels, by.x = "row.names", by.y = "doc_id", all.x = TRUE)

# merge() puts the matched row names in the first column; drop it again.
df_l <- df_l[, -1, drop = FALSE]
head(df_l)
```

Split into training and testing.

```{r}
# Training and testing: a random 75% / 25% split of the labelled rows.
N <- nrow(df_l)
smp_size <- floor(0.75 * N)
set.seed(123)
train_ind <- sample(seq_len(N), size = smp_size)

train <- df_l[train_ind, ]
test <- df_l[-train_ind, ]

# Derive the class levels once from the full data and apply them to both
# splits. Converting each split on its own would give train and test
# different factor levels whenever a class happens to be absent from one of
# them, which breaks prediction and the confusion-matrix based metrics.
score_levels <- levels(as.factor(df_l$sentiment.score))
train$sentiment.score <- factor(train$sentiment.score, levels = score_levels)
test$sentiment.score <- factor(test$sentiment.score, levels = score_levels)

# Class balance of each split.
table(train$sentiment.score)
table(test$sentiment.score)
```


Output a sample of the data, for the slides.

```{r}
# The TF-IDF scores for the top features of the first doc.
# as.matrix() on a one-document slice gives a 1 x n_terms matrix; taking its
# first row yields a named numeric vector, so sort() is applied to a plain
# vector rather than to a data frame. head() also copes with fewer than ten
# terms.
first_doc_scores <- as.matrix(dtm[1, ])[1, ]
head(sort(first_doc_scores, decreasing = TRUE), 10)

# First, just look at a few docs, and see which words are highest rated.
idxs <- c(1, 2, 3, 4)
s <- train[idxs, ]

# Only the numeric feature columns can meaningfully be compared with 0; the
# label column is a factor and is left out.
feature_cols <- setdiff(colnames(s), "sentiment.score")

# `s` has one row per entry of `idxs`, so walk it by position. Indexing `s`
# with the values of `idxs` only works while idxs happens to be 1, 2, 3, ...
for (pos in seq_along(idxs)) {
  print(idxs[pos])
  doc_scores <- unlist(s[pos, feature_cols])
  print(sort(doc_scores[doc_scores > 0], decreasing = TRUE))
}

# Then, choose some words/features manually, and see what the values are in the sample docs.
# A feature that is not a column of `s` prints as NULL.
example_features <- c("success", "awkward", "sex", "new", "shock", "better", "cant believ", "young woman", "work well")
for (example_feature in example_features) {
  print("")
  print(example_feature)
  print(s[[example_feature]])
}
```


This function will print out some performance stats for a given model's predictions.

```{r}
# Print a confusion matrix and summary metrics for a set of predictions.
#
# Args:
#   predicted: predicted class labels.
#   actual:    true class labels, in the same order as `predicted`.
#
# Returns:
#   The confusion table (rows = predicted, columns = actual), invisibly.
#
# The MLmetrics functions are called without `positive`, so they use their
# own default choice of the positive class.
printPerf <- function(predicted, actual) {
  # A bare expression inside a function is not auto-printed; print explicitly.
  confusion <- table(predicted, actual)
  print(confusion)

  # Name every argument: the MLmetrics functions do not all take y_true and
  # y_pred in the same position, so positional calls are easy to get wrong.
  print(sprintf("Accuracy:    %.3f", Accuracy(y_true = actual, y_pred = predicted)))
  print(sprintf("Precision:   %.3f", Precision(y_true = actual, y_pred = predicted)))
  print(sprintf("Recall:      %.3f", Recall(y_true = actual, y_pred = predicted)))
  print(sprintf("F1 Score:    %.3f", F1_Score(y_true = actual, y_pred = predicted)))
  print(sprintf("Sensitivity: %.3f", Sensitivity(y_true = actual, y_pred = predicted)))
  print(sprintf("Specificity: %.3f", Specificity(y_true = actual, y_pred = predicted)))

  invisible(confusion)
}
```


Decision Trees.

```{r}
# Fit a classification tree that predicts sentiment.score from every other
# column of the training set.
set.seed(123)
tree <- rpart(formula = sentiment.score ~ ., data = train)

# Show the fitted tree as text and as a plot.
print(tree)
rpart.plot(tree, extra = 2)

# Predict class labels for the held-out documents and report performance.
predicted <- predict(tree, newdata = test, type = "class")
printPerf(predicted = predicted, actual = test$sentiment.score)
```


Naive Bayes.

```{r}
# Fit a naive Bayes classifier that predicts sentiment.score from every
# other column of the training set.
set.seed(123)
nb <- naiveBayes(sentiment.score ~ ., data = train)

# Evaluate the naive Bayes predictions themselves (predicted.nb), not the
# `predicted` vector left over from the decision tree chunk.
predicted.nb <- predict(nb, test, type = "class")
printPerf(predicted.nb, test$sentiment.score)

# Look at some of the individual tables like so. [[ ]] matches the feature
# name exactly (no partial matching); it yields NULL if "steve" is not among
# the features.
nb$tables[["steve"]]
```

Support Vector Machines.

```{r}
# Fit a support vector machine that predicts sentiment.score from every
# other column of the training set.
set.seed(123)
sv <- svm(sentiment.score ~ ., data = train)

# Evaluate the SVM predictions themselves (predicted.svm), not the
# `predicted` vector left over from the decision tree chunk.
# NOTE(review): predict() for e1071 svm objects does not appear to take a
# `type` argument, so the previous type="class" has been dropped; with a
# factor response the default output should already be class labels.
predicted.svm <- predict(sv, test)
printPerf(predicted.svm, test$sentiment.score)

```


KNN

Warning: the following takes a LONG time, so I've turned off this cell by default.

```{r eval=FALSE, include=FALSE}
# knn in caret requires you split the data into a feature matrix (x) and the label (y)
idx <- which(colnames(train) == "sentiment.score")
x <- train[, -idx]
dim(x)
y <- train[, idx]
length(y)

library(caret)
set.seed(100)
# 2-fold cross-validation, run once, to tune the model.
ctrl <- trainControl(method = "repeatedcv", number = 2, repeats = 1) #,classProbs=TRUE)

knn <- train(x = x, y = y, method = "knn", trControl = ctrl)

# predict() on a caret `train` object accepts type = "raw" (the default,
# class labels) or "prob"; "class" is not a valid type there. Pass only the
# feature columns, mirroring the x used for training.
predicted.knn <- predict(knn, newdata = test[, -idx])
printPerf(predicted.knn, test$sentiment.score)
```