#!/usr/bin/env python
# create_vocabulary.py
import pickle
# Training sentences: one per line, words separated by single spaces.
train_data_file = "hw4data/traindata.txt"
# Stop words to exclude from the vocabulary: one per line.
stop_words_file = "hw4data/stoplist.txt"
# Integer class labels: one per line, paired in order with the training sentences.
train_label_file = "hw4data/trainlabels.txt"
def find_prior():
    """Compute class counts and prior probabilities from the training labels.

    Reads ``train_label_file`` (one integer label per line) and counts how
    many labels equal 0 and how many equal 1. Blank lines are skipped.

    Returns:
        A 4-tuple ``(count_0, count_1, prior_0, prior_1)``. Each prior is the
        corresponding count divided by the total number of labels read. Both
        priors are ``0.0`` when the file contains no labels.

    Raises:
        ValueError: If a non-blank line cannot be parsed as an integer.
    """
    with open(train_label_file, 'r') as f:
        # Skip blank lines (e.g. a trailing empty line) rather than failing
        # on int('').
        train_labels = [int(line.strip()) for line in f if line.strip()]

    # Count each class once and reuse the result for both the count and prior.
    count_0 = train_labels.count(0)
    count_1 = train_labels.count(1)

    if not train_labels:
        # No labels means no meaningful priors; avoid ZeroDivisionError.
        return count_0, count_1, 0.0, 0.0

    # float() keeps this a true division on Python 2 as well.
    total = float(len(train_labels))
    return count_0, count_1, count_0 / total, count_1 / total
def find_stop_words():
    """Load the stop-word list from ``stop_words_file``.

    Returns:
        A list with one entry per line of the file, in file order, each with
        leading and trailing whitespace removed.
    """
    with open(stop_words_file, 'r') as handle:
        # Materialise inside the ``with`` so the file is still open while read.
        trimmed = list(map(str.strip, handle))
    return trimmed
def main():
    """Build the per-word, per-label vocabulary and pickle it to disk.

    Each line of ``train_data_file`` is paired in order with a label from
    ``train_label_file``. For every distinct word in a sentence that is not a
    stop word, the counter for that sentence's label is incremented, so
    ``vocabulary[word]`` is a two-element list ``[count_0, count_1]`` holding
    the number of label-0 and label-1 sentences that contain the word.

    The resulting dict is written with ``pickle`` to a file named
    ``vocabulary`` in the current working directory.
    """
    # A set gives O(1) membership tests; the list returned by
    # find_stop_words() would be scanned for every word of every sentence.
    stop_words = set(find_stop_words())

    with open(train_label_file, 'r') as f:
        train_labels = [int(x.strip()) for x in f.readlines()]
    with open(train_data_file, 'r') as f:
        sentences = f.readlines()

    vocabulary = {}
    # zip() stops at the shorter input, so any unpaired trailing sentences or
    # labels are ignored.
    for sentence, label in zip(sentences, train_labels):
        # set() collapses repeats: a word counts at most once per sentence.
        # NOTE: split(' ') yields '' for consecutive spaces or an empty line;
        # that tokenisation is kept unchanged so the output stays the same.
        for word in set(sentence.strip().split(' ')):
            if word in stop_words:
                continue
            # First sighting creates the [label-0, label-1] counter pair.
            vocabulary.setdefault(word, [0, 0])[label] += 1

    # pickle writes bytes, so the file must be opened in binary mode: text
    # mode 'w' raises TypeError on Python 3 and is not binary-safe on
    # platforms that translate newlines.
    with open('vocabulary', 'wb') as f:
        pickle.dump(vocabulary, f)
# Build the vocabulary only when executed as a script, not when imported.
if __name__ == "__main__":
    main()