-
Notifications
You must be signed in to change notification settings - Fork 0
/
01_1_EDA_titles_BTM.R
82 lines (68 loc) · 3.02 KB
/
01_1_EDA_titles_BTM.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
load("data/PolPsyArticles.RData")
load("data/PolPsyArticles_q_corp.RData")
library(quanteda)
df$Title %>% tokens(remove_punct = TRUE,
remove_symbols = TRUE,
remove_numbers = TRUE) %>%
tokens_select(pattern = stopwords("en"), selection = 'remove') %>%
dfm() %>% topfeatures(100)
exp_dfm <- PolPsyArt_q_corp %>% tokens(remove_punct = TRUE,
remove_symbols = TRUE,
remove_numbers = TRUE) %>%
tokens_select(pattern = stopwords("en"), selection = 'remove') %>%
dfm()
exp_dfm_tstat_dist <- as.dist(quanteda.textstats::textstat_dist(exp_dfm))
mds_fit <- cmdscale(exp_dfm_tstat_dist, eig = TRUE, k = 2)
mds_points <- data.frame(x = mds_fit$points[, 1], y = mds_fit$points[, 2])
library(tidyverse)
ggplot(mds_points, aes(x = x, y = y)) +
geom_point(data = mds_points, aes(x = x, y = y, color = df$Source)) +
geom_text(data = mds_points, aes(x = x, y = y - 0.2,
label = df$doc_id)) +
labs(x = "Dimension 1", y = "Dimension 2",
title = "MDS Plot",
subtitle = "Possible # of topics judging by the titles of the articles",
caption = "Heuristic expectation: 10 - 14 topics") +
theme_bw() + theme(legend.position = "none")
gdata::keep(df, PolPsyArt_q_corp, sure = TRUE)
btm_corp <- df %>% select(doc_id, text = Title)
library(udpipe)
udp_anno <- udpipe(btm_corp, "english", trace = 10)
library(data.table)
bi_terms <- as.data.frame(udp_anno)
txt_stats <- txt_freq(udp_anno$upos)
txt_stats$key <- factor(txt_stats$key, levels = rev(txt_stats$key))
txt_stats %>% ggplot(aes(x = key, y = freq)) +
geom_col() +
labs(x = "", y = "Frequency",
title = "Text Statistics",
subtitle = "Universal Parts of Speech",
caption = "In many languages nouns, adjectives & verbs are most useful
\nfor understanding semantic relationships") +
coord_flip() + theme_bw()
library(BTM)
traindata <- subset(udp_anno, upos %in% c("NOUN", "PROPN", "ADJ", "VERB") &
!lemma %in% stopwords("en") & nchar(lemma) > 2)
bi_terms_dt <- as.data.table(bi_terms)
bi_terms_btm <- bi_terms_dt[, cooccurrence(x = lemma,
relevant = upos %in% c("NOUN", "PROPN",
"ADJ", "VERB") &
nchar(lemma) > 2 & !lemma %in% stopwords("en"),
skipgram = 3),
by = list(doc_id)]
traindata <- traindata[, c("doc_id", "lemma")]
library(textplot)
library(ggraph)
fit_BTM_get_plot <- function(num_top, sub_title) {
set.seed(666)
btm_model <- BTM(traindata, biterms = bi_terms_btm, k = num_top,
iter = 2000, background = TRUE, trace = 100)
set.seed(666)
plot(btm_model, top_n = num_top,
title = "BTM model", subtitle = sub_title)
}
fit_BTM_get_plot(10, "10 Topics")
fit_BTM_get_plot(11, "11 Topics")
fit_BTM_get_plot(12, "12 Topics")
fit_BTM_get_plot(13, "13 Topics")
fit_BTM_get_plot(14, "14 Topics")