 
 from datetime import datetime
 from sklearn.utils import shuffle
-from word2vec import get_wikipedia_data, find_analogies
+from word2vec import get_wikipedia_data, find_analogies, get_sentences_with_word2idx_limit_vocab
 
 # Experiments
 # previous results did not make sense b/c X was built incorrectly
@@ -260,8 +260,11 @@ def save(self, fn):
         np.savez(fn, *arrays)
 
 
-def main(we_file, w2i_file, n_files=50):
-    cc_matrix = "cc_matrix_%s.npy" % n_files
+def main(we_file, w2i_file, use_brown=True, n_files=50):
+    if use_brown:
+        cc_matrix = "cc_matrix_brown.npy"
+    else:
+        cc_matrix = "cc_matrix_%s.npy" % n_files
 
     # hacky way of checking if we need to re-load the raw data or not
     # remember, only the co-occurrence matrix is needed for training
@@ -270,7 +273,19 @@ def main(we_file, w2i_file, n_files=50):
             word2idx = json.load(f)
         sentences = [] # dummy - we won't actually use it
     else:
-        sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)
+        if use_brown:
+            keep_words = set([
+                'king', 'man', 'woman',
+                'france', 'paris', 'london', 'rome', 'italy', 'britain', 'england',
+                'french', 'english', 'japan', 'japanese', 'chinese', 'italian',
+                'australia', 'australian', 'december', 'november', 'june',
+                'january', 'february', 'march', 'april', 'may', 'july', 'august',
+                'september', 'october',
+            ])
+            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(keep_words=keep_words)
+        else:
+            sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)
+
     with open(w2i_file, 'w') as f:
         json.dump(word2idx, f)
 
@@ -282,17 +297,19 @@ def main(we_file, w2i_file, n_files=50):
         cc_matrix=cc_matrix,
         learning_rate=3*10e-5,
         reg=0.01,
-        epochs=2000,
-        gd=True,
-        use_theano=True
+        epochs=10,
+        gd=False,
+        use_theano=False
     ) # gradient descent
     model.save(we_file)
 
 
 if __name__ == '__main__':
-    we = 'glove_model_50.npz'
-    w2i = 'glove_word2idx_50.json'
-    main(we, w2i)
+    # we = 'glove_model_50.npz'
+    # w2i = 'glove_word2idx_50.json'
+    we = 'glove_model_brown.npz'
+    w2i = 'glove_word2idx_brown.json'
+    main(we, w2i, use_brown=True)
     for concat in (True, False):
         print "** concat:", concat
         find_analogies('king', 'man', 'woman', concat, we, w2i)
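
For reference, here is a minimal sketch (not this repo's code) of what the newly imported get_sentences_with_word2idx_limit_vocab(keep_words=...) helper could plausibly do: return the Brown-corpus sentences as lists of word indices plus the word2idx mapping, keeping the most frequent words plus everything in keep_words and collapsing the rest to a single UNKNOWN index. The NLTK-based loading, the n_vocab default, and the UNKNOWN token are all assumptions made only for illustration:

# Hedged sketch of get_sentences_with_word2idx_limit_vocab(keep_words=...);
# assumes NLTK and its 'brown' corpus are installed. Not the repo's implementation.
from collections import Counter
from nltk.corpus import brown

def get_sentences_with_word2idx_limit_vocab(n_vocab=2000, keep_words=set()):
    # lowercase tokenized Brown sentences
    sents = [[w.lower() for w in s] for s in brown.sents()]
    counts = Counter(w for s in sents for w in s)
    for w in keep_words:
        counts[w] = float('inf')  # force the analogy words into the vocabulary
    # keep the n_vocab most frequent words (keep_words sort first)
    word2idx = {w: i for i, (w, _) in enumerate(counts.most_common(n_vocab))}
    unk = word2idx.setdefault('UNKNOWN', len(word2idx))
    # map every out-of-vocabulary word to the single UNKNOWN index
    indexed = [[word2idx.get(w, unk) for w in s] for s in sents]
    return indexed, word2idx

Forcing the keep_words into the vocabulary is what keeps the later find_analogies('king', 'man', 'woman', ...) calls meaningful even with a small Brown-corpus vocab.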