
Commit

improve repository and add topic modeling
huseinzol05 committed Aug 30, 2019
1 parent f4fb57a commit 9895ee0
Showing 32 changed files with 1,750 additions and 243 deletions.
493 changes: 250 additions & 243 deletions README.md

Large diffs are not rendered by default.

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
544 changes: 544 additions & 0 deletions vectorizer/5.lda2vec.ipynb

Large diffs are not rendered by default.

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
59 changes: 59 additions & 0 deletions vectorizer/utils.py
@@ -0,0 +1,59 @@
import sklearn.datasets
import numpy as np
import re
import collections
import random
from sklearn import metrics
from nltk.corpus import stopwords

english_stopwords = stopwords.words('english')


def clearstring(string):
    # Strip non-alphanumeric characters, drop empty tokens and English
    # stopwords, then return the cleaned sentence in lowercase.
    string = re.sub('[^A-Za-z0-9 ]+', '', string)
    string = string.split(' ')
    string = filter(None, string)
    string = [y.strip() for y in string if y.strip() not in english_stopwords]
    string = ' '.join(string)
    return string.lower()


def separate_dataset(trainset, ratio = 0.5):
    # Split every document in a sklearn-style bunch (``.data`` / ``.target``)
    # into lines, keep a random ``ratio`` of them, clean each line, and repeat
    # the document label once per kept line.
    datastring = []
    datatarget = []
    for i in range(len(trainset.data)):
        data_ = trainset.data[i].split('\n')
        data_ = list(filter(None, data_))
        data_ = random.sample(data_, int(len(data_) * ratio))
        for n in range(len(data_)):
            data_[n] = clearstring(data_[n])
        datastring += data_
        datatarget.extend([trainset.target[i]] * len(data_))
    return datastring, datatarget


def build_dataset(words, n_words):
    # Build a vocabulary of the ``n_words`` most frequent tokens on top of the
    # special tokens GO / PAD / EOS / UNK, and encode ``words`` as indices.
    count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        # Out-of-vocabulary words map to the UNK token (index 3).
        index = dictionary.get(word, 3)
        if index == 3:
            unk_count += 1
        data.append(index)
    count[3][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


def str_idx(corpus, dic, maxlen, UNK = 3):
    # Convert a list of cleaned strings into a zero-padded matrix of word
    # indices, right-aligned and truncated to ``maxlen`` tokens per string.
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):
            X[i, -1 - no] = dic.get(k, UNK)
    return X
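
Taken together, these helpers form a small preprocessing pipeline: clean and split documents into labelled sentences, build a vocabulary, then encode each sentence as a fixed-length matrix of word indices. A minimal usage sketch follows; the 20 newsgroups corpus, the vocabulary size of 10,000, and maxlen of 50 are illustrative assumptions rather than values taken from this commit, and the import assumes the script runs alongside vectorizer/utils.py.

# Illustrative sketch only: the corpus, vocabulary size and maxlen below are
# assumptions, not values taken from this commit.
import sklearn.datasets

from utils import separate_dataset, build_dataset, str_idx

# Any object exposing .data (list of documents) and .target (labels) works;
# 20 newsgroups is used here purely as a stand-in corpus.
trainset = sklearn.datasets.fetch_20newsgroups(subset = 'train')
texts, labels = separate_dataset(trainset, ratio = 0.5)

# Build the vocabulary over every token, then encode each sentence as a
# fixed-length row of word indices.
_, _, dictionary, reversed_dictionary = build_dataset(' '.join(texts).split(), 10000)
X = str_idx(texts, dictionary, maxlen = 50)
print(X.shape)  # (number of sentences, 50)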
File renamed without changes.
File renamed without changes.
File renamed without changes.
