-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcondense.py
66 lines (57 loc) · 1.88 KB
/
condense.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import pandas as pd
def subreddit_dataframe_condense(df):
    """
    Condense a Pushshift submissions archive into an annotation-ready frame.

    Keeps eight submission fields, drops rows whose ``selftext`` is exactly
    ``'[deleted]'`` or ``'[removed]'``, renames the kept fields to short
    codes, and appends blank (single-space) annotation columns for strains,
    explicit targeting, implicit vulnerabilities and an "insubstantial" flag.

    Args:
        df: DataFrame containing at least the columns ``author``,
            ``created_utc``, ``date``, ``id``, ``num_comments``,
            ``selftext``, ``subreddit`` and ``title``. It is not modified.

    Returns:
        A new DataFrame holding the renamed columns followed by the blank
        annotation columns. Surviving rows keep their original index labels.

    Raises:
        KeyError: If any of the required source columns is missing.
    """
    # Source column -> condensed name; dict order fixes the output column order.
    renames = {
        'author': 'p_au',
        'created_utc': 'p_utc',
        'date': 'p_date',
        'id': 'p_id',
        'num_comments': 'n_cmnt',
        'selftext': 'text',
        'subreddit': 'sbrt',
        'title': 'p_titl',
    }
    # Exact selftext values that mark a submission as deleted/removed.
    placeholders = ['[deleted]', '[removed]']
    blank = ' '
    annotation_cols = {
        'asp': blank,  ### s_1...3 strains
        'asp_rtnl': blank,
        'dep': blank,
        'dep_rtnl': blank,
        'val': blank,
        'val_rtnl': blank,
        'prg': blank,  ### E_1,2 explicit targeting
        'tgd': blank,
        'age': blank,  ### I_1...3 implicit vulnerabilities
        'race': blank,
        'dbty': blank,
        'insb': blank,  ### insubstantial
    }

    kept = df[list(renames)]
    # Filter before adding the annotation columns so no blanks are allocated
    # for rows that are about to be discarded.
    kept = kept[~kept['selftext'].isin(placeholders)]
    # rename() and assign() each return a new frame, so the returned object is
    # a standalone copy rather than the output of a boolean selection; writing
    # annotations into it does not raise SettingWithCopyWarning.
    return kept.rename(columns=renames).assign(**annotation_cols)
import pandas as pd
def subreddit_parse(df, col):
    """
    Split a DataFrame into one independent sub-frame per distinct value of ``col``.

    Args:
        df: DataFrame to partition. It is not modified.
        col: Name of the column whose distinct values define the groups
            (e.g. the subreddit column).

    Returns:
        dict 'sub_d' mapping ``'d_<value>'`` to a copy of the rows of ``df``
        whose ``col`` equals that value, in order of first appearance. Rows
        whose ``col`` is missing (NaN/None) are collected under the key built
        from that missing value (e.g. ``'d_nan'``). An empty ``df`` yields an
        empty dict.

    Raises:
        KeyError: If ``col`` is not a column of ``df``.
    """
    column = df[col]  # looked up once instead of on every iteration
    sub_d = {}
    for val in column.unique():
        # A missing value never compares equal to itself, so `column == val`
        # would select nothing for it; match missing rows with isna() instead.
        mask = column.isna() if pd.isna(val) else column == val
        sub_d[f'd_{val}'] = df[mask].copy()
    return sub_d