-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcfg.py
122 lines (81 loc) · 2.26 KB
/
cfg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
from parse import *
from pipetools import pipe
from pipetools.utils import foreach
import nltk
import pickle
# Corpus files handed to the comment-extraction pipeline below.
files = ["2008-01.txt"]
# Build the "files -> comments" function by chaining stages with pipetools'
# `pipe | f | g` composition; the stages are applied in the order listed.
# NOTE(review): getData, convertToUTF, toComment and sanitize are not defined
# in this file; presumably they come from `from parse import *` -- confirm
# their input/output shapes there.
getComments = (pipe
| getData
| convertToUTF
| toComment
# Apply sanitize to each item produced by the previous stage.
| foreach (sanitize)
# Materialize the results into a list.
| list
)
# Run the pipeline a single time and reuse the resulting list afterwards,
# instead of recomputing it.
comments = getComments(files)
# Split every comment into sentences and collect them in one flat list,
# `sents`, preserving comment order.
sents = []
print("Generating sentence list...")
# Iterate `comments` directly: the index from enumerate() was never used, and
# wrapping the list in enumerate() hid its length from tqdm, so the progress
# bar could not show a total.
for comm in tqdm(comments):
    # Drop the first and last six characters of the comment before sentence
    # splitting.
    # NOTE(review): presumably this strips a fixed-width wrapper added by an
    # upstream stage (e.g. sanitize) -- confirm against the parse module.
    sents.extend(nltk.sent_tokenize(comm[6:-6]))
print(len(sents))
# Debug sample of one sentence; guarded so that a corpus yielding fewer than
# 11 sentences does not abort the run with an IndexError.
if len(sents) > 10:
    print(sents[10])
# Tokenize each sentence into words, then tag every token with its part of
# speech.  After this block POSs[k] is the nltk.pos_tag() result for sents[k]
# (same order and length as `sents`).
print("Word tokenizing sentences...")
tokenizedSents = [nltk.word_tokenize(sent) for sent in tqdm(sents)]
# Debug sample; guarded so small inputs (fewer than 11 sentences) do not
# raise an IndexError.
if len(tokenizedSents) > 10:
    print(tokenizedSents[10])

print("Computing POS tags...")
POSs = [nltk.pos_tag(tokens) for tokens in tqdm(tokenizedSents)]
# The plain token lists are no longer needed once tagging is done; release
# them so only the tagged form stays in memory.
del tokenizedSents
if len(POSs) > 10:
    print(POSs[10])
# Reduce every tagged sentence in POSs to just its sequence of tags:
# element [1] of each entry in a tagged sentence is kept, element [0] is
# dropped.  tagLists[k] therefore lines up with POSs[k].
tagLists = [[entry[1] for entry in taggedSent] for taggedSent in POSs]
# Debug sample; guarded so inputs with fewer than 11 sentences do not raise
# an IndexError.
if len(tagLists) > 10:
    print(tagLists[10])
# Keep only the sentences whose tag sequence passes every check below, and
# store the surviving sentence texts and tag sequences in two parallel lists
# (filteredSents[k] goes with filteredTags[k]).
filteredSents = []
filteredTags = []
print("Filtering sentences...")
for (idx, tags) in tqdm(enumerate(tagLists)):
    # Reject sequences containing an opening/closing quote tag or a "$" tag.
    hasExcludedTag = '``' in tags or '\'\'' in tags or "$" in tags
    # Only sequences of 5 to 9 tags are accepted.
    wrongLength = len(tags) <= 4 or len(tags) >= 10
    # The final-tag test comes last so it is only evaluated for sequences
    # that already passed the length check (and are thus non-empty).
    if hasExcludedTag or wrongLength or tags[-1] != ".":
        continue
    filteredSents.append(sents[idx])
    filteredTags.append(tags)
# Both counts are printed on one line, separated by a single space.
print("%d %d" % (len(filteredTags), len(filteredSents)))
# Persist the results as two pickles written in binary mode:
#   tags2.pkl     -> filteredTags  (tag sequences)
#   tagSents2.pkl -> filteredSents (the matching sentence texts)
# `with` guarantees each file is flushed and closed even if pickle.dump()
# raises part-way through, which the manual open()/close() pairs did not.
with open("tags2.pkl", "wb") as outfile:
    pickle.dump(filteredTags, outfile)
with open("tagSents2.pkl", "wb") as outfile:
    pickle.dump(filteredSents, outfile)