Commit

initial commit
tgem committed Apr 2, 2018
0 parents commit 6aff4f5
Showing 26 changed files with 4,292 additions and 0 deletions.
2,967 changes: 2,967 additions & 0 deletions data/ground_truth.csv

Large diffs are not rendered by default.

43 changes: 43 additions & 0 deletions readme.md
@@ -0,0 +1,43 @@
# Glossary Term Extraction from the CrowdRE Dataset

This is the code used to extract and analyze glossary term candidates from the CrowdRE dataset for
the conference paper proposal by Gemkow, Conzelmann, Hartig and Vogelsang.

The paper and this code build on the CrowdRE dataset:
P. K. Murukannaiah, N. Ajmeri, and M. P. Singh, “The smarthome crowd requirements dataset,” https://crowdre.github.io/murukannaiah-smarthome-requirements-dataset/ , Apr. 2017.

## Replicating the analysis

0. Install the prerequisite packages nltk (for language processing) and openpyxl (for Excel export) in the Python environment that you will be using.
Please note that nltk also needs the WordNet corpus; it may be necessary to execute `nltk.download('wordnet')` in a Python shell
after installing the package (see the setup sketch at the end of this section).

1. Download or clone this repository to your local machine.

2. Download the CSV version of the dataset from the URL mentioned above and place the requirements.csv file in the data folder.

3. Execute the src/analysis.py file with a Python 3 interpreter.

The results of the analysis will be output to the console.
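
For reference, here is a minimal setup sketch for step 0 (assuming pip is available in your environment; adjust the install command as needed):

```python
# In a shell: pip install nltk openpyxl
import nltk

# Fetch the WordNet corpus required by nltk, as noted in step 0.
nltk.download('wordnet')
```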

## How the analysis works

The main workhorse of the analysis script is the function glossary_extraction in the file pipeline.py. Through its parameters, this function makes it
possible to vary each step in the pipeline along the alternatives described in the paper. In general, for each comparison, analysis.py runs this function
twice with different pipeline configurations and then compares the outputs.
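
For example, the lemmatization comparison in src/analysis.py follows this pattern (condensed sketch):

```python
import compare_indices
from pipeline import glossary_extraction

# Run the pipeline twice with different configurations and compare the resulting term indices.
index_a, _, _, _ = glossary_extraction(lemmatize_mode='lemmatize')
index_b, _, _, _ = glossary_extraction(lemmatize_mode='porter')
compare_indices.compare_indices(index_a, index_b,
                                description="Replaced lemmatization with Porter stemming")
```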

NB: We do not focus on the immediate effects of changing a pipeline step (i.e. the words directly removed by a filter), but on the change of the ultimate
pipeline output caused by changing a pipeline step. This is important since changes in the middle of the pipeline may have important repercussions on
later steps (e.g. when omitting stemming, many important concepts are later removed by the statistical filter because each individual form occurs too
rarely in the dataset to fulfill the frequency criterion).

## Re-using intermediate results

Training a PoS tagger and PoS-tagging the requirements are computationally expensive steps that should not be re-run every time an analysis in the
later stages of the pipeline is changed. Therefore, both the tagger itself and the tagged requirements are saved as pickle files in the temp folder.
By default, the glossary_extraction function starts from the pre-computed PoS-tagged requirements. By setting the tag_mode parameter, you can enforce
new PoS-tagging using the existing tagger, or train and apply a new tagger from scratch. If you want to analyse changes in the very early pipeline
steps (e.g. tokenization), this is necessary since the pre-computed tagged requirements will not reflect the consequences of such changes.
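
As a condensed sketch (parameter values as they appear in src/analysis.py):

```python
from pipeline import glossary_extraction

# Default: start from the pre-computed PoS-tagged requirements in the temp folder.
index, reqs, ids, tag_index = glossary_extraction()

# Re-tag with the existing tagger, e.g. after changing the tokenization step.
index, reqs, ids, tag_index = glossary_extraction(tokenize_mode='expand contractions',
                                                  tag_mode="load tagger")
```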

Note that the temp folder also contains an intermediate result from analyzing the WordNet corpus for comparison (filter_index.pickle). It should
not be necessary to re-run this benchmark creation, although the code to do so is included in indexfilter.py for completeness.
132 changes: 132 additions & 0 deletions src/analysis.py
@@ -0,0 +1,132 @@
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 4 16:58:47 2017
@author: Tim G
"""

"""
Glossary term extraction
"""

import reader
import buildindex
import indexfilter
import analyze_coverage
import analyze_tags
import compare_indices
from pipeline import glossary_extraction
from matplotlib import pyplot as plt

"""
Analysis
"""

# analyze frequency distribution of the number of requirements covered by a glossary term

def analyze_number_of_requirements_per_term(old_index, index):
    frequencies = {}
    for term in old_index:
        frequency = len(old_index[term])
        if frequency in frequencies:
            frequencies[frequency] += 1
        else:
            frequencies[frequency] = 1
    maxx = 50
    x = list(range(maxx + 1))
    y = [0] * (maxx + 1)
    for frequency in frequencies.keys():
        if frequency < maxx + 1:
            y[frequency] = frequencies[frequency]
    plt.title("How many requirements are covered by each glossary term")
    plt.xlabel('number of requirements covered per term candidate')
    plt.ylabel('number of glossary term candidates')
    plt.axis([0, maxx, 0, 50])  # TODO
    plt.plot(x, y, linestyle="solid", color="b")
    plt.savefig("number_of_requirements_per_term.pdf", format='pdf')

# main script

# main pipeline results

index, reqs, ids, tag_index = glossary_extraction(output='file')
compare_indices.analyze_index(index,reqs,ids,description="Index resulting from main pipeline")
analyze_tags.analyze_tags(index,tag_index)
full_reqs, _= reader.reqs_read(read_tags=False)
coverage, not_covered = analyze_coverage.analyze_coverage(index,full_reqs,ids)
print("Coverage: ",coverage)
print("Number of requirements not covered:",len(not_covered))
coverage, _ = analyze_coverage.analyze_coverage(index,full_reqs,ids,threshold=2)
print("Coverage by two: ",coverage)

# comparisons with alternative pipeline setups

index1,_,_,_ = glossary_extraction(threshold_coverage=5)
index2,reqs2,ids2,_ = glossary_extraction(threshold_coverage=0)
compare_indices.compare_indices(index1,index2,description="Removed threshold for minimum number of covered requirements")
indexfilter.analyze_index_filter(index2,reqs2,ids2,output='file')

index1,_,_,_ = glossary_extraction(lemmatize_mode='lemmatize')
index2,_,_,_ = glossary_extraction(lemmatize_mode='none')
compare_indices.compare_indices(index1,index2,description="Replaced lemmatization with no root reduction")

index3,_,_,_ = glossary_extraction(lemmatize_mode='lemmatize')
index4,_,_,_ = glossary_extraction(lemmatize_mode='porter')
compare_indices.compare_indices(index3,index4,description="Replaced lemmatization with Porter stemming")

index5,_,_,_ = glossary_extraction(capitalization_mode='lower')
index6,_,_,_ = glossary_extraction(capitalization_mode='none')
u5, u6 = compare_indices.compare_indices(index5,index6,description="Omitted conversion to lower case")
print("terms only present with lower case enforcement: ",u5)
print("terms only present without lower case enforcement: ",u6)

# The following comparison requires new POS tagging and thus quite some time
# Hence, it is disabled by default
"""
index8,_,_,_ = glossary_extraction(tokenize_mode='expand contractions',tag_mode="load tagger")
index7,_,_,_ = glossary_extraction(tokenize_mode="standard",tag_mode="load tagger")
u7, u8 = compare_indices.compare_indices(index7,index8,description="Changed tokenization")
print("terms only present with default tokenization: ",u7)
print("terms only present with changed tokenization: ",u8)
"""

index9,_,_,_ = glossary_extraction()
index10,_,_,_ = glossary_extraction(tag_mode="load",tagger_name="tagger")
u9, u10 = compare_indices.compare_indices(index9,index10,description="Changed POS tagging approach")
print("terms only present with default POS tagging: ",u9)
print("terms only present with changed POS tagging: ",u10)

index11,_,_,_ = glossary_extraction(chunk_mode="statistical")
index12,_,_,_ = glossary_extraction(chunk_mode="rule-based")
u11, u12 = compare_indices.compare_indices(index11,index12,description="Changed chunking approach")
print("terms only present with statistical chunking: ",u11)
print("terms only present with rule-based chunking: ",u12)

"""
from nltk.corpus import treebank
no_reqs=500
reqs = [" ".join(sent) for sent in treebank.sents()[:no_reqs]]
ids = list(range(no_reqs))
tags = ["" for sent in treebank.sents()[:no_reqs]]
index13,_,_,_ = glossary_extraction(tag_mode="load tagger", filter_mode="threshold")
"""

index13,_,_,_ = glossary_extraction(filter_mode=["threshold"])
index14,_,_,_ = glossary_extraction(filter_mode=["threshold","specificity"])
u13, u14 = compare_indices.compare_indices(index13,index14,description="Changed index filtering mode")
print("terms only present with pure threshold filtering: ",u13)
print("terms only present with threshold & specificity filtering: ",u14)

short_index, reqs, ids, tag_index = glossary_extraction(chunk_mode="rule-based",threshold_coverage=1,max_lines=100,tag_mode="load tagger")
gt = reader.gt_read(max_lines=100)
gt_index = buildindex.tag_index(gt)
analyze_tags.analyze_tags(short_index,gt_index,name="ground truth term")
list1, list2 = compare_indices.compare_indices(short_index,gt_index,description="Comparing to ground truth")
print("terms only present in generated index: ",list1)
print("terms only present in ground truth: ",list2)

index13,_,_,_ = glossary_extraction(filter_mode=[])
index14,_,_,_ = glossary_extraction(filter_mode=["specificity"])
u13, u14 = compare_indices.compare_indices(index13,index14,description="Changed index filtering mode")
#print("terms only present with filtering: ",u13)
#print("terms only present without filtering: ",u14)
23 changes: 23 additions & 0 deletions src/analyze_coverage.py
@@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 2 16:44:51 2017
@author: Tim G
"""

from collections import defaultdict

def analyze_coverage(index, reqs, ids, threshold=1):
    covered_reqs = defaultdict(int)
    for term in index:
        for reqid in index[term]:
            covered_reqs[reqid] = covered_reqs[reqid] + 1
    covered = 0
    total = len(ids)
    not_covered = []
    for i in range(len(ids)):
        if covered_reqs[i] >= threshold:
            covered = covered + 1
        else:
            not_covered = not_covered + [reqs[i]]
    return covered / total, not_covered
76 changes: 76 additions & 0 deletions src/analyze_tags.py
@@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 4 16:43:23 2018
@author: Tim G
"""

def analyze_tags(index, tag_index, name="tag"):
    print(" Number of glossary terms: " + str(len(index.keys())))
    print(" Number of " + name + ": " + str(len(tag_index.keys())))
    # how many glossary terms are tags?
    no_terms_as_tags = 0
    no_terms_as_tag_parts = 0
    no_tags_as_term_parts = 0
    global terms_as_tags
    terms_as_tags = []
    global terms_as_tag_parts
    terms_as_tag_parts = []
    for term in index.keys():
        if (term in tag_index.keys()) and (tag_index[term] is not None):
            no_terms_as_tags = no_terms_as_tags + 1
            terms_as_tags = terms_as_tags + [term]
        else:
            for tag in tag_index.keys():
                if contained(term, tag):
                    no_terms_as_tag_parts = no_terms_as_tag_parts + 1
                    terms_as_tag_parts = terms_as_tag_parts + [(term, tag)]
                    break
            for tag in tag_index.keys():
                if contained(tag, term):
                    no_tags_as_term_parts = no_tags_as_term_parts + 1
                    break
    print("Number of terms also used as " + name + "s: " + str(no_terms_as_tags))
    print("Number of terms that are part of, but not identical to " + name + "s: " + str(no_terms_as_tag_parts))
    print("Number of " + name + "s that are part of, but not identical to terms: " + str(no_tags_as_term_parts))
    no_identical_terms = 0
    global identical_terms
    identical_terms = []
    no_contained_terms = 0
    global contained_terms
    contained_terms = []
    global contained_term_tuples
    contained_term_tuples = []
    no_contained_tags = 0
    global contained_tags
    contained_tags = []
    global contained_tag_tuples
    contained_tag_tuples = []
    for term in index.keys():
        for tag in tag_index.keys():
            term_in_tag = contained(index[term], tag_index[tag])
            tag_in_term = contained(tag_index[tag], index[term])
            if term_in_tag and tag_in_term:
                no_identical_terms = no_identical_terms + 1
                identical_terms = identical_terms + [term]
                break
            else:
                if contained(index[term], tag_index[tag]):
                    no_contained_terms = no_contained_terms + 1
                    contained_terms = contained_terms + [term]
                    contained_term_tuples = contained_term_tuples + [(term, tag)]
                    break
                if contained(tag_index[tag], index[term]):
                    no_contained_tags = no_contained_tags + 1
                    contained_tags = contained_tags + [tag]
                    contained_tag_tuples = contained_tag_tuples + [(tag, term)]
                    break
    print("Number of terms where the requirements list matches exactly the requirements list of a " + name + ":" + str(no_identical_terms))
    print("Number of terms where the requirements list is contained in the requirements list of a " + name + ":" + str(no_contained_terms))
    print("Number of " + name + "s where the requirements list is contained in the requirements list of a term:" + str(no_contained_tags))

def contained(tuple1, tuple2):
    for element in tuple1:
        if element not in tuple2:
            return False
    return True
32 changes: 32 additions & 0 deletions src/buildindex.py
@@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 2 09:37:16 2017
@author: Tim G
"""


def index_from_terms(terms, term_index):
    # transform lists to tuples so that they can be used as dict indices
    term_tuples = []
    for term in terms:
        term_tuples.append(tuple(word for word in term))
    # create dictionary
    index = {}
    for i, term_tuple in enumerate(term_tuples):
        if (term_tuple in index) and (index[term_tuple] is not None):
            if not (term_index[i] in index[term_tuple]):
                index[term_tuple] = index[term_tuple] + [term_index[i]]
        else:
            index[term_tuple] = [term_index[i]]
    return index

def tag_index(tags):
    index = {}
    for i, tag_list in enumerate(tags):
        for tag in tag_list:
            if (tag in index) and (index[tag] is not None):
                index[tag] = index[tag] + [i]
            else:
                index[tag] = [i]
    return index
