forked from karpathy/llama2.c
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathword2vec.py
29 lines (22 loc) · 792 Bytes
/
word2vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import os
from gensim.models import Word2Vec
def load_sentences(root='text'):
    """Read every file under *root* and return its text as tokenized sentences.

    Each file is decoded as UTF-8, cut into sentences on '.', and every
    sentence is split on whitespace into lower-cased tokens.

    Args:
        root: Directory that is walked recursively for input files.

    Returns:
        A list of sentences, each one a non-empty list of lower-cased tokens.
    """
    corpus = []
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            # Join with the directory currently being walked (not the root),
            # so files inside sub-directories are opened where they live.
            path = os.path.join(dirpath, filename)
            with open(path, encoding='utf8') as f:
                text = f.read()
            # Line breaks are kept: str.split() below treats them as ordinary
            # whitespace, so words on adjacent lines are not glued together.
            for raw_sentence in text.split('.'):
                tokens = [w.lower() for w in raw_sentence.split()]
                if tokens:  # skip empty pieces around trailing/repeated '.'
                    corpus.append(tokens)
    return corpus


sentences = load_sentences('text')
# Word2Vec only reads the corpus, so the list is passed as-is instead of
# being copied first; the name is kept as an alias of `sentences`.
train_sentences = sentences

# Train 100-dimensional word vectors with a 100-token context window,
# using 4 worker threads.
model = Word2Vec(
    sentences=train_sentences,
    vector_size=100,
    window=100,
    workers=4,
)

# Report the vocabulary size and the first 100 vocabulary entries.
print(len(model.wv))
print(model.wv.index_to_key[:100])

# Show the 10 nearest neighbours of the query word.
query = 'марко'
if query in model.wv:
    embedding = model.wv[query]
    # Query by key rather than by raw vector: given a key, most_similar
    # leaves the query word itself out of the results; given only a vector
    # it cannot, and the word comes back as its own top neighbour.
    print(model.wv.most_similar(positive=[query], topn=10))
else:
    # The word never made it into the trained vocabulary (absent from the
    # corpus or filtered out during vocabulary building), so report that
    # instead of failing with a bare KeyError.
    print(f'{query!r} is not in the vocabulary')