#!/usr/bin/python
"""
Word Sketch Data ---> Word-(Relation,Word) Matrix
=================================================
Script for transforming word sketch data into a sparse target-context
co-occurrence matrix with the same parameters as the original wm2thes.
To save memory, the script makes two passes over the word sketch data:
a first pass to size the vocabularies and output arrays, and a second
pass to fill them.
"""
import sys
import pickle
import numpy as np
# Sketch Engine imports
import wmap
import manatee
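# (wmap and manatee are assumed to be the Python bindings shipped with a
# Sketch Engine / manatee installation and importable, e.g. via PYTHONPATH)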
# ------------------------------------------------------------------------------
# Only targets occurring more than TARGET_COUNT times are processed, and only
# (word, rel, coll) triples exceeding TRIPLE_COUNT and TRIPLE_SCORE are kept:
TARGET_COUNT = 20
TRIPLE_COUNT = 1
TRIPLE_SCORE = 0
# ------------------------------------------------------------------------------
def iter_word_sketches(corpus_id):
    """
    Yields (word, rel_id, coll_id, score) quadruples.
    """
    corpus = manatee.Corpus(corpus_id)
    attr = corpus.get_attr(corpus.get_conf('WSATTR'))
    wsbase = corpus.get_conf('WSBASE')
    wmap1 = wmap.WMap(wsbase, 0, 0, 0, corpus_id)
    while True:  # over targets
        if wmap1.getcnt() > TARGET_COUNT:
            word = attr.id2str(wmap1.getid())
            wmap2 = wmap1.nextlevel()
            while True:  # over target's relations
                rel_id = wmap2.getid()
                wmap3 = wmap2.nextlevel()
                while True:  # over target's relation's collocates
                    coll_id = wmap3.getid()
                    count = wmap3.getcnt()
                    rank = wmap3.getrnk()
                    if count > TRIPLE_COUNT and rank > TRIPLE_SCORE:
                        yield (word, rel_id, coll_id, rank)
                    if not wmap3.next():
                        break
                if not wmap2.next():
                    break
        if not wmap1.next():
            break
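# A minimal usage sketch for the generator above (the corpus name is
# hypothetical; it must refer to a corpus with compiled word sketches):
#
#   for word, rel_id, coll_id, score in iter_word_sketches("susanne"):
#       print(word, rel_id, coll_id, score)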
def main():
    if len(sys.argv) != 3:
        sys.stderr.write("Usage: python wm2thes.py CORPUS_NAME OUTPUT_NAME\n")
        sys.exit(1)
    corpus_name = sys.argv[1]
    output_name = sys.argv[2]
    # First pass: count surviving triples and collect the vocabularies
    nb_cells = 0
    targets = set()
    contexts = set()
    for word, rel_id, coll_id, _score in iter_word_sketches(corpus_name):
        nb_cells += 1
        targets.add(word)
        contexts.add((rel_id, coll_id))
    # TODO: targets sorted by freqs
    target2i = {target: i for i, target in enumerate(targets)}
    context2i = {(rel, coll): i for i, (rel, coll) in enumerate(contexts)}
    # Second pass: fill the sparse target-context co-occurrence matrix (COO triplets)
    rows = np.zeros((nb_cells,), dtype=np.int64)
    cols = np.zeros((nb_cells,), dtype=np.int64)
    vals = np.zeros((nb_cells,), dtype=np.float32)
    for i, (word, rel_id, coll_id, score) in enumerate(iter_word_sketches(corpus_name)):
        rows[i] = target2i[word]
        cols[i] = context2i[(rel_id, coll_id)]
        vals[i] = score
    np.save(output_name + "-rows.npy", rows)
    np.save(output_name + "-cols.npy", cols)
    np.save(output_name + "-vals.npy", vals)
    with open(output_name + "-target2i.pickle", "wb") as f:
        pickle.dump(target2i, f)
if __name__ == "__main__":
    main()
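# ------------------------------------------------------------------------------
# Loading the output (a sketch, assuming scipy is available): the three .npy
# files describe a sparse matrix in COO form, so something like the following
# reconstructs it, with OUTPUT_NAME standing in for the name passed on the
# command line:
#
#   import numpy as np
#   from scipy.sparse import coo_matrix
#   rows = np.load(OUTPUT_NAME + "-rows.npy")
#   cols = np.load(OUTPUT_NAME + "-cols.npy")
#   vals = np.load(OUTPUT_NAME + "-vals.npy")
#   matrix = coo_matrix((vals, (rows, cols)))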