From 67de918bfb4f61d9d49f07b07df760224a8e8d78 Mon Sep 17 00:00:00 2001
From: skblaz
Date: Mon, 18 Nov 2019 13:32:09 +0100
Subject: [PATCH] Version 0.36: bug fixes and additional functionality

---
 README.md          |   6 ++-
 mrakun/__init__.py | 104 ++++++++++++++++++++++++++++++++++-----------
 requirements.txt   |   1 +
 setup.py           |   4 +-
 4 files changed, 88 insertions(+), 27 deletions(-)

diff --git a/README.md b/README.md
index d27edf278..7b08cdc53 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,8 @@ To test whether the core functionality is ok, you can run
 python3 -m pytest tests/test_core_functionality.py
 ```
 
+Note that you need the Py3plex library for both tests to pass!
+
 ## Usage with editdistance
 Using RaKUn is simple! Simply call the main detector method with optional arguments (as described in the paper)
@@ -55,7 +57,9 @@
 hyperparameters = {"distance_threshold":4,
                    "pair_diff_length":3,
                    "stopwords" : stopwords.words('english'),
                    "bigram_count_threshold":2,
-                   "num_tokens":[1,2]}
+                   "num_tokens":[1,2],
+                   "max_similar" : 3,    ## keep at most this many keywords that share an over-represented token
+                   "max_occurrence" : 2} ## how many of the most frequent tokens count as over-represented
 keyword_detector = RakunDetector(hyperparameters)
 keywords = keyword_detector.find_keywords(blob_of_text, input_type = "text")
diff --git a/mrakun/__init__.py b/mrakun/__init__.py
index fedc7360b..dddfb89ba 100644
--- a/mrakun/__init__.py
+++ b/mrakun/__init__.py
@@ -12,7 +12,7 @@ from nltk import word_tokenize
 from nltk.stem.porter import *
 import operator
-from collections import defaultdict
+from collections import defaultdict, Counter
 import networkx as nx
 import numpy as np
 import glob
@@ -29,7 +29,9 @@ logging.getLogger().setLevel(logging.INFO)
 
 try:
     from py3plex.visualization.multilayer import *
-except:
+
+except Exception:
+    print("Please install the py3plex library (pip install py3plex) for visualization capabilities!")
     pass
 
 class RakunDetector:
@@ -38,6 +40,13 @@ def __init__(self, hyperparameters, verbose=True):
 
         self.distance_method = hyperparameters["distance_method"]
         self.hyperparameters = hyperparameters
+
+        if "max_occurrence" not in self.hyperparameters:
+            self.hyperparameters['max_occurrence'] = 3
+
+        if "max_similar" not in self.hyperparameters:
+            self.hyperparameters['max_similar'] = 3
+
         self.verbose = verbose
         self.keyword_graph = None
         self.inverse_lemmatizer_mapping = {}
@@ -88,7 +97,8 @@ def corpus_graph(self, language_file,limit_range=3000000,verbose=False,lemmatize
         ctx = 0
         reps = False
         dictionary_with_counts_of_pairs = {}
-
+        self.whole_document = []
+
         def process_line(line):
 
             nonlocal G
@@ -99,7 +109,7 @@ def process_line(line):
             stop = list(string.punctuation)
             line = line.strip()
             line = [i for i in word_tokenize(line.lower()) if i not in stop]
-
+            self.whole_document += line
             if not stopwords is None:
                 line = [w for w in line if not w in stopwords]
@@ -143,7 +153,7 @@ def process_line(line):
                 else:
                     dictionary_with_counts_of_pairs[edge_directed] = 1
             return False
-
+
         if input_type == "file":
             with open(language_file) as lf:
                 for line in lf:
@@ -206,9 +216,7 @@ def generate_hypervertices(self, G):
 
     def hypervertex_prunning(self, graph, distance_threshold, pair_diff_max = 2, distance_method = "editdistance"):
 
-        self.to_merge = defaultdict(list)
-#        stemmer = nltk.stem.snowball.SnowballStemmer(language="english")
-
+        self.to_merge = defaultdict(list)
         for pair in itertools.combinations(graph.nodes(),2):
             abs_diff = np.abs(len(pair[0]) - len(pair[1]))
             if abs_diff < pair_diff_max:
@@ -246,7 +254,6 @@ def find_keywords(self, document, input_type = "file", validate = False):
         weighted_graph,reps = self.corpus_graph(document, lemmatizer=lemmatizer,stopwords=stopwords, input_type=input_type)
         nn = len(list(weighted_graph.nodes()))
-
         if distance_threshold > 0:
 
             self.centrality = nx.load_centrality(weighted_graph)
@@ -274,6 +281,7 @@ def find_keywords(self, document, input_type = "file", validate = False):
         higher_order_1 = []
         higher_order_2 = []
         frequent_pairs = []
+
         ## Check potential edges
         for edge in weighted_graph.edges(data=True):
             if edge[0] != edge[1]:
@@ -316,15 +324,65 @@ def find_keywords(self, document, input_type = "file", validate = False):
             higher_order_2 = []
 
         total_keywords = []
+
         if 1 in num_tokens:
             total_keywords += keywords_with_scores
+
         if 2 in num_tokens:
             total_keywords += higher_order_1
+
         if 3 in num_tokens:
             total_keywords += higher_order_2
-        total_kws = sorted(set(total_keywords), key=operator.itemgetter(1),reverse=True)[0:limit_num_keywords]
-
+        total_kws = sorted(set(total_keywords), key=operator.itemgetter(1), reverse=True)
+
+        ## remove noisy keywords that over-use the most frequent tokens
+        tokensets = []
+        for keyword in total_kws:
+            ltx = keyword[0].split(" ")
+            if len(ltx) > 1:
+                tokensets += ltx
+
+        penalty = set([x[0] for x in Counter(tokensets).most_common(self.hyperparameters['max_occurrence'])])
+
+        tmp = []
+        pnx = 0
+        for keyword in total_kws:
+            parts = set(keyword[0].split(" "))
+            if len(penalty.intersection(parts)) > 0:
+                pnx += 1
+                if pnx < self.hyperparameters['max_similar']:
+                    tmp.append(keyword)
+            else:
+                tmp.append(keyword)
+        total_kws = tmp
+
+        # ## missing connectives
+        # if self.hyperparameters['connectives']:
+        #     refurbished = []
+        #     for keyword in total_kws:
+        #         kn = [None, None]
+        #         key = keyword[0]
+        #         parts = key.split(" ")
+        #         if len(parts) > 1:
+        #             i1 = self.whole_document.index(parts[0])
+        #             i2 = self.whole_document.index(parts[1])
+        #             print(i1,i2)
+        #             if np.abs(i2-i1) == 1:
+        #                 pass
+        #             else:
+        #                 missing = self.whole_document[i2-1]
+        #                 if parts[0] != missing:
+        #                     key = parts[0]+" "+missing+" "+parts[1]
+        #                 else:
+        #                     continue
+        #         kn[0] = key
+        #         kn[1] = keyword[1]
+        #         refurbished.append(kn)
+        #     total_kws = refurbished
+
+        total_kws = total_kws[0:limit_num_keywords]
+
         return total_kws
 
     def calculate_edit_distance(self, key1, key2):
         return editdistance.eval(key1, key2)
@@ -335,26 +393,24 @@ def calculate_embedding_distance(self, key1, key2):
         return self.model.wv.similarity(key1, key2)
 
 if __name__ == "__main__":
 
-    from nltk.stem import WordNetLemmatizer
     from nltk.corpus import stopwords
 
     hyperparameters = {"distance_threshold":4,
-                       "distance_method": "editdistance",
-                       "pretrained_embedding_path": '../pretrained_models/fasttext/wiki.en.bin',
-                       "num_keywords" : 20,
-                       "pair_diff_length":3,
-                       "stopwords" : stopwords.words('english'),
-                       "bigram_count_threshold":2,
-#                      "lemmatizer" : WordNetLemmatizer(),
-                       "num_tokens":[1,2]}
+                       "distance_method": "editdistance",
+                       "num_keywords" : 20,
+                       "pair_diff_length":3,
+                       "stopwords" : stopwords.words('english'),
+                       "bigram_count_threshold":2,
+                       "max_occurrence" : 5,
+                       "max_similar" : 3,
+                       "num_tokens":[1,2]}
     keyword_detector = RakunDetector(hyperparameters)
     example_data = "../datasets/wiki20/docsutf8/7183.txt"
     keywords = keyword_detector.find_keywords(example_data)
     print(keywords)
-    keyword_detector.visualize_network()
-    keyword_detector.verbose = False
-    keyword_detector.validate_on_corpus("../datasets/Schutz2008")
-
+
+#    keyword_detector.visualize_network()
+#    keyword_detector.verbose = False
+#    keyword_detector.validate_on_corpus("../datasets/Schutz2008")
 #    keyword_detector.visualize_network()
-#    keyword_detector.validate_on_corpus("../datasets/www")
diff --git a/requirements.txt b/requirements.txt
index d4112be6b..5eff021c7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ networkx >= 2.3
 numpy >= 1.16.4
 editdistance >= 0.5.3
 gensim >= 3.8.0
+py3plex >= 0.68
diff --git a/setup.py b/setup.py
index 1698ad72a..45d6390dc 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ from setuptools.extension import Extension
 
 setup(name='mrakun',
-      version='0.31',
+      version='0.36',
       description="Rank-based unsupervised keyword detection via metavertex aggregation",
       url='http://github.com/skblaz/mrakun',
       author='Blaž Škrlj',
@@ -13,7 +13,7 @@ license='GPL3',
       packages=find_packages(),
       zip_safe=False,
-      install_requires=['nltk','networkx','editdistance','pandas','numpy'],
+      install_requires=['nltk','networkx','editdistance','pandas','numpy','py3plex'],
       include_package_data=True)
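
As a quick illustration of the noise-removal step that the new `max_occurrence` and `max_similar` hyperparameters control, here is a minimal standalone sketch. The `prune_keywords` helper and the demo keyword list are illustrative only, not part of the mrakun API; the logic mirrors the filtering added to `find_keywords` above.

```
from collections import Counter

def prune_keywords(total_kws, max_occurrence=3, max_similar=3):
    """Limit how often the same frequent token may reappear in keywords.

    total_kws is a score-sorted list of (keyword, score) pairs.
    """
    # Gather tokens that occur inside multi-word keywords.
    tokens = []
    for keyword, _ in total_kws:
        parts = keyword.split(" ")
        if len(parts) > 1:
            tokens += parts

    # The max_occurrence most frequent of these tokens form the penalty set.
    penalty = {tok for tok, _ in Counter(tokens).most_common(max_occurrence)}

    pruned, hits = [], 0
    for keyword, score in total_kws:
        if penalty.intersection(keyword.split(" ")):
            hits += 1
            # keep only the first few keywords that contain a penalised token
            if hits < max_similar:
                pruned.append((keyword, score))
        else:
            pruned.append((keyword, score))
    return pruned

if __name__ == "__main__":
    demo = [("graph mining", 1.0), ("graph theory", 0.9),
            ("graph kernels", 0.8), ("keyword", 0.7)]
    # "graph" dominates the multi-word keywords, so only one of them survives.
    print(prune_keywords(demo, max_occurrence=1, max_similar=2))
    # -> [('graph mining', 1.0), ('keyword', 0.7)]
```

With `max_occurrence=1` only the single most frequent token ("graph") is penalised, and `max_similar=2` then lets just one "graph ..." keyword through, which is the intended deduplication effect of the new hyperparameters.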