-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathgeneratefeatures.py
executable file
·107 lines (93 loc) · 4.2 KB
/
generatefeatures.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
## Author: Jessy Li (ljunyi@seas.upenn.edu)
## given raw text files, generate features for
## shallow and neuralbrn classifiers
from collections import namedtuple
import os.path
import features
import utils
## One input sentence: its unique id, its label, and the wrapped raw sentence.
Instance = namedtuple("Instance", ["uid", "label", "rawsent"])
class ModelNewText(object):
    """Hold the sentences of one new text and the features computed for them.

    Typical use: load sentences with loadFromFile() or loadSentences(),
    compute features with fShallow() / fNeuralVec() / fBrownCluster(), then
    turn them into (labels, feature dicts) with transformShallow() or
    transformWordRep().

    Attributes:
        featurestest: dict mapping feature name -> list of per-sentence values.
        test: list of Instance records, one per loaded sentence.
        brnclst: Brown cluster resource; when None it is loaded on first use
            by fBrownCluster() via utils.readMetaOptimizeBrownCluster().
        brnspace: object whose toFeatDict(item, False) is used by
            transformWordRep().
        embeddings: passed through to features.neuralvec().
        fileid: identifier used as the prefix of every Instance uid.
    """

    ## number of "neuralvec-<i>" features produced and consumed
    NEURALVEC_DIM = 100

    ## order matters: position k (0-based) becomes feature id k+1
    SHALLOW_FEATURE_NAMES = (
        "sentlen", "numnumbers", "numcapltrs", "numsymbols", "avgwordlen",
        "numconns", "fracstopwords", "mpqageninq-subj", "mpqageninq-polarity",
        "mrc-fami", "mrc-img", "idf-min", "idf-max", "idf-avg",
    )

    def __init__(self, brnspace, brnclst, embeddings):
        """Store the shared resources; no sentences are loaded yet."""
        self.featurestest = {}  ## <name, flist>
        self.test = []
        self.brnclst = brnclst
        self.brnspace = brnspace
        self.embeddings = embeddings
        self.fileid = None

    def _reset(self, fileid):
        """Start a new input: forget old sentences AND their cached features.

        The f*() methods skip any feature whose key already exists, so a
        cache left over from a previous input would be silently reused for
        the new sentences.
        """
        self.test = []
        self.featurestest = {}
        self.fileid = fileid

    def _append_sentence(self, sent):
        """Wrap one sentence string and append it as the next Instance."""
        ## uid index is the position in self.test (0-based); label is always 0
        uid = self.fileid + "." + str(len(self.test))
        self.test.append(Instance(uid, 0, features.RawSent(sent)))

    def loadFromFile(self, filename):
        """Load one sentence per non-blank line of `filename`.

        Lines are stripped; blank lines are skipped and do not consume an
        index. The file's basename becomes the identifier.
        """
        self._reset(os.path.basename(filename))
        ## the with-statement closes the file; no explicit close() needed
        with open(filename) as f:
            for line in f:
                sent = line.strip()
                if not sent:
                    continue
                self._append_sentence(sent)

    def loadSentences(self, identifier, sentlist):
        """Load sentences from an in-memory list.

        sentlist should be a list of sentence strings, tokenized;
        identifier is a string serving as the header of this sentlist.
        """
        self._reset(identifier)
        for sent in sentlist:
            self._append_sentence(sent)

    def _add_feature(self, key, values):
        """Store `values` under `key` unless that feature is already present."""
        if key in self.featurestest:
            return
        self.featurestest[key] = values

    def fShallow(self):
        """Compute the shallow features for all loaded sentences."""
        normalize = True
        recs = [r.rawsent for r in self.test]
        self._add_feature("sentlen", features.sentLen(recs))
        self._add_feature("numnumbers", features.numNumbers(recs, normalize))
        self._add_feature("numcapltrs", features.numCapLetters(recs, normalize))
        self._add_feature("numsymbols", features.numSymbols(recs, normalize))
        self._add_feature("avgwordlen", features.avgWordLen(recs))
        self._add_feature("numconns", features.numConnectives(recs))
        self._add_feature("fracstopwords", features.fracStopwords(recs))
        ## these helpers each return several value lists, one per key, in order
        multi_valued = (
            (("mpqageninq-subj", "mpqageninq-polarity"), features.mpqaGenInqInfo),
            (("mrc-fami", "mrc-img"), features.mrcInfo),
            (("idf-min", "idf-max", "idf-avg"), features.idf),
        )
        for keys, compute in multi_valued:
            for key, vals in zip(keys, compute(recs)):
                self._add_feature(key, vals)

    def fNeuralVec(self):
        """Compute the "neuralvec-<i>" features unless already present."""
        keys = ["neuralvec-" + str(i) for i in range(self.NEURALVEC_DIM)]
        ## presence of the first key is taken to mean all of them are cached
        if keys[0] in self.featurestest:
            return
        recs = [r.rawsent for r in self.test]
        feats = features.neuralvec(self.embeddings, recs)
        ## feats[i] is stored as the per-sentence value list of dimension i
        for i, key in enumerate(keys):
            self.featurestest[key] = feats[i]

    def fBrownCluster(self):
        """Compute the "brnclst1gram" feature unless already present.

        Loads the Brown clusters first if none were supplied.
        """
        if self.brnclst is None:
            self.brnclst = utils.readMetaOptimizeBrownCluster()
        key = "brnclst1gram"
        if key in self.featurestest:
            return
        per_sentence = []
        for instance in self.test:
            ngrams = features.getBrownClusNgram(instance.rawsent, 1, self.brnclst)
            per_sentence.append(["_".join(x) for x in ngrams])
        self.featurestest[key] = per_sentence

    def transformShallow(self):
        """Return (ys, xs) for the shallow features.

        ys is the list of labels; xs is a list of dicts, one per sentence,
        mapping a 1-based feature id (position in SHALLOW_FEATURE_NAMES)
        to the feature value. Raises KeyError if a shallow feature has not
        been computed (see fShallow).
        """
        ys = [x.label for x in self.test]
        ## range, not xrange, so this runs on both Python 2 and Python 3
        xs = [{} for _ in range(len(self.test))]
        for fid, fname in enumerate(self.SHALLOW_FEATURE_NAMES):
            for i, item in enumerate(self.featurestest[fname]):
                xs[i][fid + 1] = item
        return ys, xs

    def transformWordRep(self):
        """Return (ys, xs) for the word-representation features.

        Each dict in xs maps ids 1..NEURALVEC_DIM to the neuralvec values and
        is then updated with self.brnspace.toFeatDict(item, False) for the
        sentence's "brnclst1gram" feature. Raises KeyError if fNeuralVec() or
        fBrownCluster() has not been run.
        """
        ys = [x.label for x in self.test]
        xs = [{} for _ in range(len(self.test))]
        for j in range(self.NEURALVEC_DIM):
            fname = "neuralvec-" + str(j)
            for i, item in enumerate(self.featurestest[fname]):
                xs[i][j + 1] = item
        ## NOTE(review): presumably toFeatDict yields ids above NEURALVEC_DIM
        ## so they do not overwrite the neuralvec entries - confirm in brnspace
        for i, item in enumerate(self.featurestest["brnclst1gram"]):
            xs[i].update(self.brnspace.toFeatDict(item, False))
        return ys, xs