-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathw2v_modules.py
120 lines (92 loc) · 3.23 KB
/
w2v_modules.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__synopsis__ : Tools for Word2Vec operations
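__description__ : Helpers for loading pre-trained word2vec models, summing word vectors per tweet, expanding tweets with similar words and training a new model.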
__project__ : my_modules
__author__ : 'Samujjwal Ghosh'
__version__ :
__date__ : June 2018
__copyright__ : "Copyright (c) 2018"
__license__ : "Python"; (Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html)
__classes__ :
__variables__ :
__methods__ :
TODO : 1.
"""
import os
import numpy as np
from collections import OrderedDict
import my_modules as mm
def init_w2v(nlp=True):
    """
    Load one of the two pre-trained word2vec models used by this project.

    :param nlp: truthy -> load the crisisNLP model, falsy -> load the
        GoogleNews model.
    :return: whatever `open_word2vec` returns for the selected file.
    """
    # Pick location and status message first, then load in a single place.
    if nlp:
        model_dir = '/home/cs16resch01001/data/crisisNLP_word2vec_model/'
        model_name = 'crisisNLP_word_vector.bin'
        message = "NLP Word2Vec selected"
    else:
        model_dir = '/home/cs16resch01001/data/'
        model_name = 'GoogleNews-vectors-negative300.bin'
        message = "Google Word2Vec selected"

    w2v = open_word2vec(os.path.join(model_dir, model_name))
    # The message is printed only after the model has loaded successfully.
    print(message)
    return w2v
def open_word2vec(word2vec):
    """
    Load a word2vec file stored in the binary word2vec format.

    :param word2vec: path of the binary model file.
    :return: the object produced by gensim's
        `KeyedVectors.load_word2vec_format(..., binary=True)`.
    """
    # Imported lazily so gensim is only required when a model is loaded.
    from gensim.models.keyedvectors import KeyedVectors

    return KeyedVectors.load_word2vec_format(word2vec, binary=True)
def use_word2vec(train,w2v):
    """
    Build one summed word vector per entry of `train`.

    :param train: mapping of id -> record; every record must carry a
        'parsed_tweet' string, which is split on single spaces.
    :param w2v: loaded model exposing `wv.vocab` and item lookup by word.
        If it has a `vector_size` attribute, that sets the output length.
    :return: OrderedDict of id -> numpy array holding the sum of the
        vectors of all in-vocabulary words (all zeros if none is known).
    """
    # Size the accumulator from the model rather than assuming 300
    # dimensions; 300 stays as the fallback for models without the attribute.
    dim = getattr(w2v, 'vector_size', 300)
    vocab = w2v.wv.vocab  # looked up once instead of once per word

    train_vec = OrderedDict()
    for tweet_id, val in train.items():
        s_vec = np.zeros(dim)
        for word in val['parsed_tweet'].split(" "):
            # Out-of-vocabulary words (and empty tokens) contribute nothing.
            if word in vocab:
                s_vec += w2v[word]
        train_vec[tweet_id] = s_vec
    return train_vec
def find_sim(w2v,word,c=None):
    """
    Look up the words most similar to `word` in a word2vec model.

    :param w2v: loaded model exposing `wv.vocab` and `most_similar`.
    :param word: query word.
    :param c: forwarded unchanged as `topn` to `most_similar`.
        NOTE(review): with the default c=None the result depends on how
        `most_similar` treats topn=None - confirm callers always pass an int.
    :return: the result of `most_similar` for a known word, or an empty
        list when `word` is not in the vocabulary.
    """
    if word not in w2v.wv.vocab:
        return []
    # Query the model exactly once; the similarity search is the expensive
    # part and was previously repeated just to print the result.
    return w2v.most_similar(positive=[word], negative=[], topn=c)
def find_sim_list(w2v,words,c=None):
    """
    Extend a list of words with the `find_sim` results of each word.

    :param w2v: loaded word2vec model, passed through to `find_sim`.
    :param words: list of seed words; the list object itself is not
        modified.
    :param c: passed to `find_sim` and also used as the length limit of
        the returned list (None means no limit on the slice).
    :return: the first `c` items of the de-duplicated, extended list.
    """
    # NOTE(review): find_sim results are appended as returned; expand_tweet
    # unpacks them as (term, val) pairs, so the combined list may mix plain
    # words with pairs - confirm this is intended.
    combined = words
    for seed in words:
        combined = combined + find_sim(w2v, seed, c)

    deduplicated = mm.remove_dup_list(combined, case=True)
    return deduplicated[0:c]
def expand_tweet(w2v,tweet,c=3):
    """
    Expand a tweet by following every word with its similar words.

    :param w2v: loaded word2vec model, passed through to `find_sim`.
    :param tweet: text whose words are separated by single spaces.
    :param c: number of similar words requested per word.
    :return: list containing each original word followed by the terms
        `find_sim` returned for it.
    """
    expanded = []
    for token in tweet.split(" "):
        expanded.append(token)
        # find_sim yields (term, score) pairs; only the term is kept.
        expanded.extend(term for term, _score in find_sim(w2v, token, c))
    return expanded
def expand_tweets(w2v,dict):
    """
    Add an 'expanded_tweet' field to every record of a tweet dictionary.

    :param w2v: loaded word2vec model, passed through to `expand_tweet`.
    :param dict: mapping of id -> record; every record must carry a
        'parsed_tweet' string. Records are updated in place. (The name
        shadows the builtin but is kept so keyword callers keep working.)
    :return: the same mapping, with 'expanded_tweet' set on each record.
    """
    for val in dict.values():
        # Join with a space so the expanded words stay separate tokens;
        # an empty separator would fuse them into one unreadable string.
        val['expanded_tweet'] = " ".join(expand_tweet(w2v, val['parsed_tweet']))
    return dict
def create_w2v(corpus,size=1000,window=5,min_count=3,workers=10):
    """
    Train a new gensim Word2Vec model on `corpus`.

    :param corpus: iterable of tokenised sentences, passed as the first
        argument to `Word2Vec`.
    :param size: dimensionality of the word vectors.
    :param window: context window size.
    :param min_count: minimum frequency for a word to be kept.
    :param workers: number of worker threads.
    :return: the trained `Word2Vec` model.
    """
    # Imported lazily so gensim is only required when a model is trained.
    from gensim.models import Word2Vec

    # Hyper-parameters must go by keyword: the constructor's positional order
    # has other options (e.g. alpha) in between, so positional passing would
    # assign window/min_count/workers to the wrong parameters.
    # NOTE(review): `size` is the gensim 3.x keyword (the API the rest of
    # this file targets); gensim >= 4 calls it `vector_size`.
    w2v = Word2Vec(
        corpus,
        size=size,
        window=window,
        min_count=min_count,
        workers=workers,
    )
    print(w2v)
    print(type(w2v))
    return w2v
def main():
    """Placeholder entry point; the module currently does nothing when run."""
    return None


if __name__ == "__main__":
    main()