-
Notifications
You must be signed in to change notification settings - Fork 5
/
generate-train-test.py
111 lines (87 loc) · 4.84 KB
/
generate-train-test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from database import db_session, init_db
from models import ArticleMetadata, CodeFirstPass, CodeSecondPass
from sqlalchemy import func
from datetime import datetime
import random
import pandas as pd
import numpy as np
## date stamp used in the names of the generated CSV files
today_str = datetime.today().strftime('%Y-%m-%d')

## get the database of variables:
## one row per first-pass coding record joined to its article, skipping the
## variables listed below
_excluded_variables = ['load', 'protest', 'multi', 'nous', 'ignore']
_variable_query = (
    db_session.query(CodeFirstPass, ArticleMetadata)
    .join(ArticleMetadata)
    .filter(~CodeFirstPass.variable.in_(_excluded_variables))
)
df_db = pd.DataFrame(
    [(article.db_id, code.variable, code.value, code.text)
     for code, article in _variable_query.all()],
    columns = ['id', 'variable', 'value', 'text'])
def findVariableLocation(id, variable):
    """ Gets the first instance of variable in the article text.

    Looks up every value coded for `variable` on article `id` in the
    module-level `df_db` frame and returns the smallest paragraph index.

    Args:
        id: article identifier, matched against `df_db.id`. (The name shadows
            the builtin but is kept so existing callers keep working.)
        variable: variable name, matched against `df_db.variable`.

    Returns:
        The smallest paragraph index among the matching values, where any
        value containing 'meta' counts as -1. None when there is no matching
        value or none of them can be parsed.
    """
    raw_values = df_db[(df_db.variable == variable) & (df_db.id == id)].value.values

    ## paragraph numbers are noted by pp-start-end, indexed from 0; the integer
    ## before the first '-' is what gets used.
    ## meta data is included and will be indexed as -1
    paragraphs = []
    for raw in raw_values:
        if 'meta' in raw:
            paragraphs.append(-1)
            continue
        try:
            paragraphs.append(int(raw.split('-')[0]))
        except ValueError as detail:
            ## some weirdness happening but whatever we'll just catch:
            ## report the malformed value and keep the ones that do parse,
            ## so one bad record can't poison the result for the article
            print(detail)

    ## TK: Make any affordances for things that just say, non-descriptly, 'protest', like in form?
    ## I think this is okay for now since it still provides information about the locus of the action.

    if not paragraphs:
        return None

    return min(paragraphs)
# def addParagraphNums():
# """ Take the existing files and add paragraph numbers for form, issue, location, time, and size """
# for pub in ['NYT', 'WaPo', 'AGW-AFP']:
# for filename in ['training', 'test']:
# df_pub = pd.read_csv("generate-train-test/haystack-%s-mpeds-%s_2015-08-30.csv" % (filename, pub))
# for variable in ['form', 'issue', 'loc', 'time', 'size']:
# print(filename, pub, variable)
# ## if at least one record is yes or maybe
# var_index = df_pub[(df_pub.percent_yes > 0) | (df_pub.percent_maybe > 0)].id
# df_pub[variable + '_paragraph'] = var_index.apply(findVariableLocation, args = [variable])
# df_pub.to_csv("generate-train-test/haystack-%s-mpeds-%s_2015-08-30.csv" % (filename, pub), index = False)
# def addNumCoders(df):
# """ Take existing files and add number of coders. """
# df_numcoders = pd.DataFrame({'num_coders': df.num_coders, 'id': df.index})
# for pub in ['NYT', 'WaPo', 'AGW-AFP']:
# for filename in ['training', 'test']:
# df_pub = pd.read_csv("generate-train-test/haystack-%s-mpeds-%s_2015-08-30.csv" % (filename, pub))
# df_pub = df_pub.merge(df_numcoders)
# df_pub.to_csv("generate-train-test/haystack-%s-mpeds-%s_2015-08-30.csv" % (filename, pub), index = False)
def _publicationFromId(db_id):
    """ Derives the publication label from an article's db_id. """
    if 'AGW' in db_id:
        ## AGW ids keep their first two underscore-separated fields, joined by '-'
        return "-".join(db_id.split("_")[0:2])
    if 'NYT' in db_id:
        return 'NYT'
    return db_id.split("_")[0]


def generateTrainTestSets():
    """ Generates training and test sets by publication from the MPEDS database.

    Collects every first-pass 'protest' coding per article, computes the share
    of yes / maybe / no decisions, attaches the first paragraph index of the
    form, issue, loc, time and size variables (via findVariableLocation) for
    articles with at least one yes or maybe, and writes a random 90/10
    training/test split to CSV for each of NYT, WaPo and AGW-AFP.

    Returns:
        None. Output goes to generate-train-test/haystack-{training,test}-mpeds-<pub>_<date>.csv.
    """
    cfp_dict = {}
    protest_query = db_session.query(CodeFirstPass, ArticleMetadata).join(ArticleMetadata).\
        filter(CodeFirstPass.variable == 'protest')
    for cfp, am in protest_query.all():
        if am.db_id not in cfp_dict:
            cfp_dict[am.db_id] = {"coder_value": [], "num_coders": 0, "pub": _publicationFromId(am.db_id),
                                  "percent_yes": 0, "percent_maybe": 0, "percent_no": 0}

        cfp_dict[am.db_id]["coder_value"].append( (cfp.coder_id, cfp.value) )
        cfp_dict[am.db_id]["num_coders"] += 1

    for article_id, v in cfp_dict.items():
        for coder, decision in v["coder_value"]:
            ## not sure why this would be null, but it is
            if not decision:
                continue

            ## add to percent
            v["percent_" + decision] += 1.

        ## divide all (null decisions still count towards the denominator)
        for decision in ['yes', 'no', 'maybe']:
            v["percent_" + decision] /= len(v["coder_value"])

    df = pd.DataFrame(cfp_dict).T
    del df['coder_value']
    df['id'] = df.index

    ## add in variable paragraph numbers
    ## if at least one record is yes or maybe; the selection does not depend
    ## on the variable, so it is computed once
    var_index = df[(df.percent_yes > 0) | (df.percent_maybe > 0)].id
    for variable in ['form', 'issue', 'loc', 'time', 'size']:
        print(variable)
        ## rows outside var_index are left empty by index alignment
        df[variable + '_paragraph'] = var_index.apply(findVariableLocation, args = [variable])

    for pub in ['NYT', 'WaPo', 'AGW-AFP']:
        df_pub = df[df.pub == pub]

        ## 90/10 split
        ## random.sample needs a real sequence, so materialise the index first
        rows = random.sample(list(df_pub.index), int(df_pub.index.shape[0]*0.9))
        ## .loc replaces the removed .ix indexer; rows are index labels
        df_pub.loc[rows].to_csv("generate-train-test/haystack-training-mpeds-%s_%s.csv" % (pub, today_str), index = False)
        df_pub.drop(rows).to_csv("generate-train-test/haystack-test-mpeds-%s_%s.csv" % (pub, today_str), index = False)