-
Notifications
You must be signed in to change notification settings - Fork 11
/
encode_keywords_word2vec.py
59 lines (46 loc) · 1.46 KB
/
encode_keywords_word2vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import time
import torch
import json
import os
import numpy as np
import scipy.io as sio
import argparse
import gensim.downloader as api
import pickle
import argparse
######## Task switches: enable the flag matching the keyword set to encode
encode_50keywords, encode_ROC, encode_articles = True, False, False

# Pretrained vectors are fetched by name through gensim's downloader API.
# NOTE(review): the name suggests 300-dimensional Google News embeddings and
# the first call presumably downloads a large archive -- confirm in gensim docs.
print('word2vec loading...')
word2vec_encoder = api.load("word2vec-google-news-300")
print('word2vec loaded')

# keyword -> embedding vector; populated by the encoding step further down.
word2vec_dict = dict()
######## Parse arguments
# Usage: python encode_keywords_word2vec.py -file <path>
# The value is later concatenated directly onto this script's directory, so it
# is given relative to that directory and starts with a path separator.
parser = argparse.ArgumentParser()
# required=True: without it a missing -file leaves args.file as None, and the
# os.path.dirname() call below dies with an opaque TypeError instead of a
# usage message.
parser.add_argument(
    '-file',
    type=str,
    required=True,
    help="keyword file path, appended to this script's directory",
)
args = parser.parse_args()
file_name = args.file
# Directory part of the argument; the encoded outputs are written there.
folder = os.path.dirname(file_name)
print('file_name: ', file_name)
if encode_50keywords:
    # Encode every keyword set (one comma-separated set per input line):
    #   <script_dir><folder>/set_<i>.npy       vectors of line i, in keyword order
    #   <script_dir><folder>/dict_word2vec.pkl keyword -> vector for all lines
    #
    # Paths are built by plain string concatenation with this script's
    # directory (not os.path.join), so file_name and folder are expected to
    # start with a path separator; this keeps existing invocations working.
    script_dir = str(os.path.dirname(os.path.abspath(__file__)))
    output_dir = script_dir + folder

    # The file is only read, so open it read-only (the original used "r+",
    # which needlessly demands write permission) and let the context manager
    # close the handle, which was previously leaked.
    with open(script_dir + file_name, "r") as keyword_file:
        lines = keyword_file.readlines()

    for i, line in enumerate(lines):
        # Keywords on a line are separated by a comma followed by one space.
        keywords = line.strip().split(", ")
        print(keywords)

        word2vec_words = []
        for word in keywords:
            # NOTE(review): a keyword missing from the model vocabulary
            # presumably raises KeyError here and aborts the run -- confirm
            # whether such words should be skipped instead.
            word2vec = word2vec_encoder[word]
            word2vec_words.append(word2vec)
            word2vec_dict[word] = word2vec

        # One array file per input line, numbered from 0 in file order.
        save_path = output_dir + '/set_' + str(i) + '.npy'
        np.save(save_path, word2vec_words)

    # Persist the combined lookup table once all lines have been encoded.
    save_path_dict = output_dir + '/dict_word2vec.pkl'
    with open(save_path_dict, 'wb') as f:
        pickle.dump(word2vec_dict, f, pickle.HIGHEST_PROTOCOL)