Skip to content

Commit ab3ca44

Browse files
committed
Final ABT
1 parent fb1a60f commit ab3ca44

6 files changed

+153
-11
lines changed

ABT_add_comment_NLP.R

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
library(jsonlite)
2+
library(tidyverse)
3+
library(lubridate)
4+
5+
6+
#path <- "G:/Saját meghajtó/HiFly/Common/Homokozó/nlp-hackathon/data/"
7+
8+
#sent_in <- readRDS(paste0(path, "transformed/comment_month_user_agg.rds"))
9+
#ABT_in <- read.csv(paste0(path, "transformed/ABT_base.csv"))
10+
11+
#abt_date <- "2020-08-01"
12+
13+
#' Add comment-NLP features to an analytical base table (ABT).
#'
#' Joins three sets of columns derived from `sent_in` onto `ABT_in`:
#'   * the raw rows of `sent_in` dated within the 35 days before `abt_date`,
#'   * per-user aggregates over the 70 days before `abt_date` (suffix `_2`),
#'   * per-user aggregates over the 183 days before `abt_date` (suffix `_6`).
#'
#' @param ABT_in Data frame with (at least) a `by` column identifying the user.
#' @param sent_in Data frame with columns `time`, `by`, `sum_qout`,
#'   `sum_exclam`, `sum_link`, `cnt_comment` and `avg_token_len`.
#'   `time` must be comparable with a `Date`.
#' @param abt_date Reference date (anything `as.Date()` accepts); only rows
#'   strictly before this date are used.
#' @return `ABT_in` with the additional feature columns; users without
#'   matching rows get `NA` in the new columns.
add_comment_NLP <- function(ABT_in, sent_in, abt_date) {
  cutoff <- as.Date(abt_date)
  # `units` is spelled out in full; the previous `unit=` only worked through
  # partial argument matching.
  days_before <- function(n_days) cutoff - as.difftime(n_days, units = "days")

  since_183 <- days_before(183)
  since_70 <- days_before(70)
  since_35 <- days_before(35)

  # Everything below works on rows strictly before the cutoff and at most
  # 183 days old.
  sent <- sent_in %>%
    filter(time < cutoff,
           time > since_183)

  last_month <- sent %>%
    filter(time > since_35)

  last_two_months <- sent %>%
    filter(time > since_70) %>%
    group_by(by) %>%
    summarise(sum_qout_2 = sum(sum_qout),
              sum_exclam_2 = sum(sum_exclam),
              sum_link_2 = sum(sum_link),
              cnt_comment_2 = sum(cnt_comment),
              avg_token_len_2 = mean(avg_token_len))

  # The original applied an extra "last 200 days" filter here; it could never
  # remove a row because `sent` is already restricted to 183 days, so the
  # `_6` features are computed over the full `sent` window.
  last_six_months <- sent %>%
    group_by(by) %>%
    summarise(sum_qout_6 = sum(sum_qout),
              sum_exclam_6 = sum(sum_exclam),
              sum_link_6 = sum(sum_link),
              cnt_comment_6 = sum(cnt_comment),
              avg_token_len_6 = mean(avg_token_len))

  # NOTE(review): the first join is still a natural join on every column that
  # `ABT_in` and `sent_in` share, because the full schema of both inputs is
  # not visible here -- confirm the intended key(s) and make them explicit.
  ABT_in %>%
    left_join(last_month) %>%
    left_join(last_two_months, by = "by") %>%
    left_join(last_six_months, by = "by")
}

ABT_add_comment_popularity.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ add_comment_popularity <- function(ABT_in, data_in, abt_date){
1313
ABT <- ABT_in %>% filter(year_month == abt_date)
1414

1515
data <- data_in %>%
16-
mutate(time = as.Date(as.POSIXct(time, origin="1970-01-01"))) %>%
16+
mutate(time = as.Date(time)) %>%
1717
filter(time < as.Date(abt_date),
1818
time > as.Date(abt_date) - as.difftime(183, unit="days"),
1919
by %in% ABT$by, type == "comment") %>%

ABT_add_hist_activity.R

+93
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
library(jsonlite)
2+
library(tidyverse)
3+
library(lubridate)
4+
5+
# path <- "G:/Saját meghajtó/HiFly/Common/Homokozó/nlp-hackathon/data/"
6+
7+
# data_in <- readRDS(paste0(path, "collected/items_225_245_v2.RDS"))
8+
# ABT_in <- read.csv(paste0(path, "transformed/ABT_base.csv"))
9+
10+
# abt_date <- "2020-07-01"
11+
12+
# Summarise per-user activity for rows whose age in days relative to `cutoff`
# lies in [min_age, max_age); every metric column gets `suffix` appended.
.summarise_activity_window <- function(data, cutoff, min_age, max_age, suffix) {
  out <- data %>%
    filter(cutoff - time >= min_age,
           cutoff - time < max_age) %>%
    group_by(by) %>%
    summarise(max_activity_score = max(score),
              max_activity_desc = max(descendants),
              mean_activity_score = mean(score),
              mean_activity_desc = mean(descendants),
              activity_count = n()) %>%
    ungroup() %>%
    replace(is.na(.), 0)

  is_metric <- names(out) != "by"
  names(out)[is_metric] <- paste0(names(out)[is_metric], suffix)
  out
}

#' Add historical activity features and their trends to the ABT.
#'
#' For the users present in the ABT slice of `abt_date`, computes max/mean of
#' `score` and `descendants` plus an item count over four windows before
#' `abt_date`: last 30 days (`_30`), days 30-59 (`_30_2`), last 60 days
#' (`_60`) and days 60-119 (`_60_2`). Trend columns (`*_t_30`, `*_t_60`,
#' `activity_trend_30`, `activity_trend_60`) are "recent minus previous".
#'
#' @param ABT_in Data frame with `year_month` and `by` columns.
#' @param data_in Data frame with `time`, `by`, `score` and `descendants`
#'   columns; `time` must be convertible with `as.Date()`.
#' @param abt_date Reference date; must match values of `ABT_in$year_month`
#'   and be accepted by `as.Date()`.
#' @return The rows of `ABT_in` for `abt_date` with the activity columns
#'   added and all `NA` values replaced by 0.
add_hist_activity <- function(ABT_in, data_in, abt_date) {
  cutoff <- as.Date(abt_date)
  ABT <- ABT_in %>% filter(year_month == abt_date)

  # Items of the selected users from the 183 days before the cutoff.
  # `units` is spelled out in full; `unit=` only worked via partial matching.
  data <- data_in %>%
    mutate(time = as.Date(time)) %>%
    filter(time < cutoff,
           time > cutoff - as.difftime(183, units = "days"),
           by %in% ABT$by) %>%
    select(by, descendants, score, time) %>%
    replace_na(list("score" = 0, "descendants" = 0))

  activity_30_1 <- .summarise_activity_window(data, cutoff, 0, 30, "_30")
  activity_30_2 <- .summarise_activity_window(data, cutoff, 30, 60, "_30_2")
  activity_60 <- .summarise_activity_window(data, cutoff, 0, 60, "_60")
  activity_60_2 <- .summarise_activity_window(data, cutoff, 60, 120, "_60_2")

  # full_join (not left_join): a user who was active only in the previous
  # window must keep those statistics and get a negative trend. With a left
  # join from the recent window such users were dropped here and ended up
  # with all-zero features after the final NA replacement.
  activity_trend_30 <- activity_30_1 %>%
    full_join(activity_30_2, by = "by") %>%
    replace(is.na(.), 0) %>%
    mutate(max_activity_score_t_30 = max_activity_score_30 - max_activity_score_30_2,
           max_activity_desc_t_30 = max_activity_desc_30 - max_activity_desc_30_2,
           mean_activity_score_t_30 = mean_activity_score_30 - mean_activity_score_30_2,
           mean_activity_desc_t_30 = mean_activity_desc_30 - mean_activity_desc_30_2,
           activity_trend_30 = activity_count_30 - activity_count_30_2)

  activity_trend_60 <- activity_60 %>%
    full_join(activity_60_2, by = "by") %>%
    replace(is.na(.), 0) %>%
    mutate(max_activity_score_t_60 = max_activity_score_60 - max_activity_score_60_2,
           max_activity_desc_t_60 = max_activity_desc_60 - max_activity_desc_60_2,
           mean_activity_score_t_60 = mean_activity_score_60 - mean_activity_score_60_2,
           mean_activity_desc_t_60 = mean_activity_desc_60 - mean_activity_desc_60_2,
           activity_trend_60 = activity_count_60 - activity_count_60_2)

  # NOTE(review): the final replace() zero-fills NA in every ABT column, not
  # only the ones added here; kept as in the original -- confirm it is
  # intended for columns produced by earlier steps.
  ABT %>%
    left_join(activity_trend_30, by = "by") %>%
    left_join(activity_trend_60, by = "by") %>%
    replace(is.na(.), 0)
}

ABT_add_last_activity.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ add_last_activity <- function(ABT_in, data_in, abt_date) {
1313
ABT <- ABT_in %>% filter(year_month == abt_date)
1414

1515
data <- data_in %>%
16-
mutate(time = as.Date(as.POSIXct(time, origin="1970-01-01"))) %>%
16+
mutate(time = as.Date(time)) %>%
1717
filter(time < abt_date, by %in% ABT$by) %>%
1818
group_by(by) %>%
1919
summarise(last_activity = max(time)) %>%

ABT_add_post_popularity.R

+3-1
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,15 @@ add_post_popularity <- function(ABT_in, data_in, abt_date){
1313
ABT <- ABT_in %>% filter(year_month == abt_date)
1414

1515
data <- data_in %>%
16-
mutate(time = as.Date(as.POSIXct(time, origin="1970-01-01"))) %>%
16+
mutate(time = as.Date(time)) %>%
1717
filter(time < as.Date(abt_date),
1818
time > as.Date(abt_date) - as.difftime(183, unit="days"),
1919
by %in% ABT$by, type == "story") %>%
2020
select(by, descendants, score, time) %>%
2121
replace_na(list("score" = 0, "descendants" = 0))
2222

23+
print(dim(data))
24+
2325
last_post <- data %>%
2426
group_by(by) %>%
2527
mutate(max_post = max(time)) %>%

ABT_finalize.R

+17-8
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,28 @@ library(lubridate)
55
source("ABT_add_last_activity.R")
66
source("ABT_add_post_popularity.R")
77
source("ABT_add_comment_popularity.R")
8+
source("ABT_add_hist_activity.R")
9+
source("ABT_add_comment_NLP.R")
810

911
path <- "G:/Saját meghajtó/HiFly/Common/Homokozó/nlp-hackathon/data/"
1012

11-
data_in <- readRDS(paste0(path, "collected/items_225_245_v2.RDS"))
12-
ABT <- read.csv(paste0(path, "transformed/ABT_base.csv"))
13+
data_in <- read.csv(paste0(path, "transformed/data_full.csv")) # , encoding="iso-8859-2")
14+
ABT <- read.csv(paste0(path, "transformed/ABT_base_full.csv"))
15+
sent_in <- readRDS(paste0(path, "transformed/comment_month_user_agg.rds"))
1316

14-
ABT <- ABT %>% select(-c(comment_number, story_number, activity_number))
17+
data <- data_in %>% filter(time >= as.Date("2019-08-01"))
18+
19+
ABT <- ABT %>%
20+
select(-c(comment_number, story_number, activity_number)) %>%
21+
filter(is_active == 1)
1522

1623
ABTs <- list()
17-
for(ym in c("2020-07-01", "2020-08-01")){
18-
ABTs[[ym]] <- add_last_activity(ABT, data_in, ym)
19-
ABTs[[ym]] <- add_post_popularity(ABTs[[ym]], data_in, ym)
20-
ABTs[[ym]] <- add_comment_popularity(ABTs[[ym]], data_in, ym)
24+
for(ym in c("2020-06-01", "2020-07-01")){
25+
ABTs[[ym]] <- add_last_activity(ABT, data, ym)
26+
ABTs[[ym]] <- add_post_popularity(ABTs[[ym]], data, ym)
27+
ABTs[[ym]] <- add_comment_popularity(ABTs[[ym]], data, ym)
28+
ABTs[[ym]] <- add_hist_activity(ABTs[[ym]], data, ym)
29+
ABTs[[ym]] <- add_comment_NLP(ABTs[[ym]], sent_in, ym)
2130
}
2231

23-
write_rds(ABTs, paste0(path, "transformed/final_ABTs.RDS"))
32+
saveRDS(ABTs, paste0(path, "transformed/final_ABTs_2.RDS"))

0 commit comments

Comments
 (0)