diff --git a/CHANGELOG.md b/CHANGELOG.md
index e581f97..fbc9984 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,9 @@
+# 3.0 - simpler customization and configuration, with some added runtime overhead
+* Dropped the precomputed nearby-words data; nearest neighbors in the vector space are now retrieved with a KDTree
+* Added a dependency on scikit-learn, but reduced the preprocessing needed for the word vectors
+* Improved the dictionary used for word segmentation; the main dictionary can also be declared via an environment variable
+* Support a custom word2vec model, declared via an environment variable
+
 # 2.5
 * Use spatially close words to refine the edit-distance computation
diff --git a/README.md b/README.md
index c21ea4e..39d42a4 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ Chinese Synonyms for Natural Language Processing and Understanding.
 ```
 pip install -U synonyms
 ```
-Compatible with both py2 and py3; the current stable release is [v2.x](https://github.com/huyingxi/Synonyms/releases).
+Compatible with both py2 and py3; the current stable release is [v3.x](https://github.com/huyingxi/Synonyms/releases).
 
 **Node.js users can use [node-synonyms](https://www.npmjs.com/package/node-synonyms).**
 
@@ -53,16 +53,16 @@ print("识别: %s" % (synonyms.nearby("识别")))
 print("NOT_EXIST: %s" % (synonyms.nearby("NOT_EXIST")))
 ```
 
-```synonyms.nearby(WORD)``` returns a list with two items: ```[[nearby_words], [nearby_words_score]]```. ```nearby_words``` are WORD's synonyms, also stored as a list and ordered from nearest to farthest; ```nearby_words_score``` holds the distance score of the word at the **corresponding position** in ```nearby_words```. Scores lie in the (0-1) range, and the closer to 1, the more similar. For example:
+```synonyms.nearby(WORD)``` returns a tuple with two items: ```([nearby_words], [nearby_words_score])```. ```nearby_words``` are WORD's synonyms, also stored as a list and ordered from nearest to farthest; ```nearby_words_score``` holds the distance score of the word at the **corresponding position** in ```nearby_words```. Scores lie in the (0-1) range, and the closer to 1, the more similar. For example:
 ```
-synonyms.nearby(人脸) = [
+synonyms.nearby(人脸) = (
 ["图片", "图像", "通过观察", "数字图像", "几何图形", "脸部", "图象", "放大镜", "面孔", "Mii"],
-[0.597284, 0.580373, 0.568486, 0.535674, 0.531835, 0.530095, 0.525344, 0.524009, 0.523101, 0.516046]]
+[0.597284, 0.580373, 0.568486, 0.535674, 0.531835, 0.530095, 0.525344, 0.524009, 0.523101, 0.516046])
 ```
-For OOV words it returns ```[[], []]```. Current vocabulary size: 125,792.
+For OOV words it returns ```([], [])```. Current vocabulary size: 125,792.
 
 ### synonyms#compare
 Compare the similarity of two sentences
diff --git a/Requirements.txt b/Requirements.txt
index 47bdce6..4d10392 100644
--- a/Requirements.txt
+++ b/Requirements.txt
@@ -1 +1 @@
-synonyms>=2.5
\ No newline at end of file
+synonyms>=2.7
\ No newline at end of file
diff --git a/demo.py b/demo.py
index 907747f..20241b9 100755
--- a/demo.py
+++ b/demo.py
@@ -36,7 +36,7 @@
 import numpy
 import unittest
 
-compare_ = lambda x,y,z: "*" * 30 + "\n%s vs %s: %f" % (x, y, synonyms.compare(x, y, seg=z))
+compare_ = lambda x,y,z: "%s vs %s: %f" % (x, y, synonyms.compare(x, y, seg=z)) + "\n" + "*" * 30 + "\n"
 
 # run testcase: python /Users/hain/ai/Synonyms/demo.py Test.testExample
 class Test(unittest.TestCase):
@@ -97,7 +97,7 @@ def test_similarity(self):
         print("%s vs %s" % (sen1, sen2), r)
 
     def test_nearby(self):
-        synonyms.display("人脸") # synonyms.display calls synonyms.nearby
+        synonyms.display("奥运") # synonyms.display calls synonyms.nearby
 
 def test():
diff --git a/setup.py b/setup.py
index 807a62d..ba4646d 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@
 setup(
     name='synonyms',
-    version='2.6',
+    version='3.0',
     description='Chinese Synonyms for Natural Language Processing and Understanding',
     long_description=LONGDOC,
     author='Hai Liang Wang, Hu Ying Xi',
@@ -41,7 +41,8 @@
     install_requires=[
         'jieba>=0.39',
         'six>=1.11.0',
-        'numpy>=1.13.1'
+        'numpy>=1.13.1',
+        'scikit-learn==0.19.1'
     ],
     package_data={
         'synonyms': [
diff --git a/synonyms/__init__.py b/synonyms/__init__.py
index 17dd721..27b1de5 100755
--- a/synonyms/__init__.py
+++ b/synonyms/__init__.py
@@ -51,6 +51,8 @@
 from synonyms.utils import any2utf8
 from synonyms.utils import any2unicode
 from synonyms.utils import sigmoid
+from synonyms.utils import cosine
+from sklearn.neighbors import KDTree
 import jieba.posseg as _tokenizer
 import jieba
 
@@ -67,54 +69,23 @@
 '''
 # combine similarity scores
 _similarity_smooth = lambda x, y, z: (x * y) + z
-_sim_molecule = lambda x: np.sum(x, axis=0) # numerator
-
+_flat_sum_array = lambda x: np.sum(x, axis=0) # sum up the word vectors of a sentence
 '''
 tokenizer settings
 '''
+tokenizer_dict = os.path.join(curdir, 'data', 'vocab.txt')
 if "SYNONYMS_WORDSEG_DICT" in ENVIRON:
-    tokenizer_dict = ENVIRON["SYNONYMS_WORDSEG_DICT"]
-    if os.exist(tokenizer_dict):
-        jieba.set_dictionary(tokenizer_dict)
+    if os.path.exists(ENVIRON["SYNONYMS_WORDSEG_DICT"]):
+        tokenizer_dict = ENVIRON["SYNONYMS_WORDSEG_DICT"]
         print("info: set wordseg dict with %s" % tokenizer_dict)
     else:
-        print("warning: can not find dict at [%s]" % tokenizer_dict)
+        print("warning: can not find dict at [%s]" % ENVIRON["SYNONYMS_WORDSEG_DICT"])
 
-'''
-nearby
-'''
-def _load_vocab(file_path):
-    '''
-    load vocab dict
-    '''
-    global _vocab
-    if PLT == 2:
-        import io
-        fin = io.TextIOWrapper(
-            io.BufferedReader(
-                gzip.open(file_path)),
-            encoding='utf8',
-            errors='ignore')
-    else:
-        fin = gzip.open(file_path, 'rt', encoding='utf-8', errors="ignore")
-
-    _vocab = json.loads(fin.read())
-
-# build on load
-print(">> Synonyms on loading vocab ...")
-_load_vocab(os.path.join(curdir, "data", "words.nearby.json.gz"))
-
-def nearby(word):
-    '''
-    Nearby word
-    '''
-    try:
-        return _vocab[any2unicode(word)]
-    except KeyError as e:
-        return [[], []]
+print(">> Synonyms load wordseg dict [%s] ... " % tokenizer_dict)
+jieba.set_dictionary(tokenizer_dict)
 
 '''
-similarity
+word embedding
 '''
 # stopwords
 _fin_stopwords_path = os.path.join(curdir, 'data', 'stopwords.txt')
@@ -131,7 +102,7 @@ def _load_stopwords(file_path):
     for w in stopwords:
         _stopwords.add(any2unicode(w).strip())
 
-print(">> Synonyms on loading stopwords ...")
+print(">> Synonyms on loading stopwords [%s] ..." % _fin_stopwords_path)
 _load_stopwords(_fin_stopwords_path)
 
 def _segment_words(sen):
@@ -158,7 +129,7 @@ def _load_w2v(model_file=_f_model, binary=True):
         raise Exception("Model file [%s] does not exist." % model_file)
     return KeyedVectors.load_word2vec_format(
         model_file, binary=binary, unicode_errors='ignore')
-print(">> Synonyms on loading vectors ...")
+print(">> Synonyms on loading vectors [%s] ..." % _f_model)
 _vectors = _load_w2v(model_file=_f_model)
 
 def _get_wv(sentence):
@@ -195,18 +166,6 @@ def _get_wv(sentence):
             vectors.append(r)
     return vectors
 
-def _unigram_overlap(sentence1, sentence2):
-    '''
-    compute unigram overlap
-    '''
-    x = set(sentence1.split())
-    y = set(sentence2.split())
-
-    intersection = x & y
-    union = x | y
-
-    return ((float)(len(intersection)) / (float)(len(union)))
-
 def _levenshtein_distance(sentence1, sentence2):
     '''
     Return the Levenshtein distance between two strings.
@@ -262,25 +221,32 @@
 def _similarity_distance(s1, s2):
     '''
     compute similarity with distance measurement
     '''
-    a = _sim_molecule(_get_wv(s1))
-    b = _sim_molecule(_get_wv(s2))
-    # https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.norm.html
-    g = 1 / (np.linalg.norm(a - b) + 1)
-
+    g = cosine(_flat_sum_array(_get_wv(s1)), _flat_sum_array(_get_wv(s2)))
     u = _nearby_levenshtein_distance(s1, s2)
     # print("g: %s, u: %s" % (g, u))
     if u > 0.8:
-        r = _similarity_smooth(g, 1, u)
-    elif u > 0.7:
-        r = _similarity_smooth(g, 1.5, u)
+        r = _similarity_smooth(g, 0.1, u)
     elif u > 0.6:
-        r = _similarity_smooth(g, 2, u)
+        r = _similarity_smooth(g, 0.25, u)
+    elif u > 0.4:
+        r = _similarity_smooth(g, 0.5, u)
     else:
-        r = _similarity_smooth(g, 4, u)
+        r = _similarity_smooth(g, 1, u)
+
     if r < 0: r = abs(r)
     r = min(r, 1.0)
     return float("%.3f" % r)
 
+def nearby(word):
+    '''
+    Nearby word, returns a tuple: ([words], [scores]); ([], []) for OOV words
+    '''
+    words, scores = [], []
+    try:
+        for x in _vectors.neighbours(any2unicode(word)):
+            words.append(x[0])
+            scores.append(x[1])
+    except KeyError:
+        pass # OOV word, keep the empty lists
+    return words, scores
+
 def compare(s1, s2, seg=True):
     '''
     compare similarity
diff --git a/synonyms/data/words.nearby.json.gz b/synonyms/data/vocab.txt
similarity index 55%
rename from synonyms/data/words.nearby.json.gz
rename to synonyms/data/vocab.txt
index aef2793..7862670 100644
Binary files a/synonyms/data/words.nearby.json.gz and b/synonyms/data/vocab.txt differ
diff --git a/synonyms/utils.py b/synonyms/utils.py
index d46b1df..fa5f051 100644
--- a/synonyms/utils.py
+++ b/synonyms/utils.py
@@ -239,6 +239,12 @@ def any2unicode(text, encoding='utf8', errors='strict'):
 
 to_unicode = any2unicode
 
+# cosine similarity
+# https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.norm.html
+from numpy import dot
+from numpy.linalg import norm
+cosine = lambda a, b: dot(a, b) / (norm(a) * norm(b))
+
 def sigmoid(x):
     return 1.0 / (1.0 + np.exp(-x))
 
diff --git a/synonyms/word2vec.py b/synonyms/word2vec.py
index fc9f070..5302357 100644
--- a/synonyms/word2vec.py
+++ b/synonyms/word2vec.py
@@ -33,6 +33,7 @@
     double, array, vstack, fromstring, sqrt, newaxis,\
     ndarray, sum as np_sum, prod, ascontiguousarray,\
     argmax
+from sklearn.neighbors import KDTree
 
 class Vocab(object):
     """
@@ -68,6 +69,7 @@ def __init__(self):
         self.vocab = {}
         self.index2word = []
         self.vector_size = None
+        self.kdt = None
 
     @property
     def wv(self):
@@ -198,7 +200,12 @@ def add_word(word, weights):
                 (result.syn0.shape[0], len(result.vocab)))
         result.syn0 = ascontiguousarray(result.syn0[: len(result.vocab)])
     assert (len(result.vocab), vector_size) == result.syn0.shape
-
+    '''
+    KDTree
+    Build KDTree with vectors.
+    http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KDTree.html#sklearn.neighbors.KDTree
+    '''
+    result.kdt = KDTree(result.syn0, leaf_size=10, metric="euclidean")
     print("loaded %s matrix from %s" % (result.syn0.shape, fname))
     return result
 
@@ -222,6 +229,22 @@ def word_vec(self, word, use_norm=False):
         else:
             raise KeyError("word '%s' not in vocabulary" % word)
 
+    def neighbours(self, word, size=10):
+        """
+        Get nearest words with KDTree, ranking by cosine distance
+        """
+        v = self.word_vec(word)
+        [distances], [points] = self.kdt.query(array([v]), k=size, return_distance=True)
+        assert len(distances) == len(points), "distances and points should be in same shape."
+        words, scores = [], {}
+        for (x, y) in zip(points, distances):
+            w = self.index2word[x]
+            s = utils.cosine(v, self.syn0[x])
+            if s < 0: s = abs(s)
+            words.append(w)
+            scores[w] = min(s, 1.0)
+        for x in sorted(words, key=scores.get, reverse=True):
+            yield x, scores[x]
 
 import unittest
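For reviewers, a minimal, self-contained sketch of the retrieval scheme this diff switches to: index the embedding matrix with a euclidean KDTree, query the k nearest points, then re-rank the candidates by cosine similarity. The vocabulary and vectors below are toy stand-ins; `neighbours` here only mirrors the method added to word2vec.py and is not the library code itself.

```
import numpy as np
from sklearn.neighbors import KDTree

# toy stand-ins for the index2word list and syn0 matrix loaded by _load_w2v
index2word = ["人脸", "图像", "图片", "识别", "奥运"]
syn0 = np.random.rand(len(index2word), 100)

cosine = lambda a, b: np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
kdt = KDTree(syn0, leaf_size=10, metric="euclidean")

def neighbours(word, size=3):
    # the euclidean KDTree narrows the candidates, cosine similarity ranks them
    v = syn0[index2word.index(word)]
    _, ind = kdt.query(np.array([v]), k=size, return_distance=True)
    scores = {index2word[p]: min(abs(cosine(v, syn0[p])), 1.0) for p in ind[0]}
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

print(neighbours("人脸"))
```

Note that the tree query itself is euclidean; cosine scores are recomputed afterwards, which is why those are the values that end up in the scores returned by `nearby`.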
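And a short usage sketch of the public API after this change (it assumes the released 3.0 package is installed; the neighbour lists and scores depend on the bundled model):

```
import synonyms

# nearby() now returns a (words, scores) tuple instead of a list
words, scores = synonyms.nearby("人脸")
for w, s in zip(words, scores):
    print("%s: %.3f" % (w, s))

# OOV lookups are documented to return ([], [])
print(synonyms.nearby("NOT_EXIST"))

# display() prints the neighbours of a word; demo.py exercises it the same way
synonyms.display("奥运")
```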