Skip to content

Commit

Permalink
added additional analysis script
Browse files Browse the repository at this point in the history
  • Loading branch information
tgem committed Apr 19, 2018
1 parent 6aff4f5 commit 037842e
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 9 deletions.
63 changes: 63 additions & 0 deletions src/analysis2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 4 16:58:47 2017
@author: Tim G
"""

"""
Glossary term extraction
"""

import reader
import buildindex
import indexfilter
import analyze_coverage
import analyze_tags
import compare_indices
from pipeline import glossary_extraction
from matplotlib import pyplot as plt

"""
Analysis
"""

# main script

def key_figures(filter_mode=None, description="", threshold_coverage=1):
    """Run one glossary-extraction configuration and print key quality figures.

    Runs the extraction pipeline twice: once over the full input (reported via
    ``compare_indices.analyze_index``) and once restricted to the first 100
    requirements so the result can be scored against the 100-line ground truth.

    Parameters
    ----------
    filter_mode : list of str, optional
        Index-filter stages to apply (e.g. ``["threshold", "specificity"]``).
        Defaults to ``["threshold", "specificity"]``.
    description : str
        Label forwarded to ``compare_indices.analyze_index`` for the report.
    threshold_coverage : int
        Minimum coverage threshold forwarded to the extraction pipeline and to
        the ground-truth index filter.
    """
    # The default used to be a mutable list literal in the signature; use the
    # None-sentinel idiom so a shared default list can never be mutated.
    if filter_mode is None:
        filter_mode = ["threshold", "specificity"]
    # Exposed as a module global, presumably for interactive inspection after
    # a run — TODO confirm no other module depends on it.
    global gt_index
    index, reqs, ids, tag_index = glossary_extraction(filter_mode=filter_mode)
    compare_indices.analyze_index(index, reqs, ids, description=description)
    # Second run: only the first 100 requirements, matching the ground truth.
    short_index, reqs, ids, tag_index = glossary_extraction(threshold_coverage=threshold_coverage,
                                                            max_lines=100,
                                                            tag_mode="load tagger",
                                                            filter_mode=filter_mode)
    gt = reader.gt_read(max_lines=100)
    gt_index = buildindex.tag_index(gt)
    gt_index = indexfilter.index_filter(gt_index, 100,
                                        threshold_coverage=threshold_coverage,
                                        filter_mode=filter_mode)
    analyze_tags.analyze_tags(short_index, gt_index, name="ground truth term", silent=True)
    print("Number of glossary terms extracted from first 100 reqs:", len(short_index))
    print("Number of ground truth terms extracted from first 100 reqs:", len(gt_index))
    # Partial matches: exact term/tag matches plus containment in either
    # direction, as counted by analyze_tags (module-level counters).
    # Computed once instead of duplicating the three-term sum.
    partial_matches = (analyze_tags.no_terms_as_tags
                       + analyze_tags.no_terms_as_tag_parts
                       + analyze_tags.no_tags_as_term_parts)
    print("Recall (regarding ground truth, including partial matches): ",
          partial_matches / len(gt_index))
    print("Precision (regarding ground truth, including partial matches): ",
          partial_matches / len(short_index))

def key_figures_by_threshold(threshold_coverage=5):
    """Run key_figures for every filter configuration at one coverage threshold.

    Compares four configurations: no statistical filter, specificity only,
    relevance (threshold) only, and both combined.
    """
    print("Analysis for threshold value of", threshold_coverage)
    # (filter stages, human-readable label) for each configuration.
    configurations = [
        ([], "Only linguistic"),
        (["specificity"], "Only specificity"),
        (["threshold"], "Only relevance"),
        (["threshold", "specificity"], "Relevance & specificity"),
    ]
    for stages, label in configurations:
        key_figures(filter_mode=stages,
                    description=label,
                    threshold_coverage=threshold_coverage)
    # Blank lines separate this threshold's output from the next one's.
    print()
    print()

# Driver: repeat the whole analysis for each coverage threshold of interest.
for _threshold in (1, 3, 5):
    key_figures_by_threshold(threshold_coverage=_threshold)


27 changes: 18 additions & 9 deletions src/analyze_tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,17 @@
@author: Tim G
"""

def analyze_tags(index,tag_index,name="tag"):
print(" Number of glossary terms: "+str(len(index.keys())))
print(" Number of "+name+": "+str(len(tag_index.keys())))
def analyze_tags(index,tag_index,name="tag",silent=False):
global no_identical_terms
global no_contained_terms
global no_contained_tags
if not silent:
print(" Number of glossary terms: "+str(len(index.keys())))
print(" Number of "+name+": "+str(len(tag_index.keys())))
# how many glossary terms are tags?
global no_terms_as_tags
global no_terms_as_tag_parts
global no_tags_as_term_parts
no_terms_as_tags = 0
no_terms_as_tag_parts = 0
no_tags_as_term_parts = 0
Expand All @@ -30,9 +37,10 @@ def analyze_tags(index,tag_index,name="tag"):
if contained(tag,term):
no_tags_as_term_parts = no_tags_as_term_parts + 1
break
print("Number of terms also used as "+name+"s: "+str(no_terms_as_tags));
print("Number of terms that are part of, but not identical to "+name+"s: "+str(no_terms_as_tag_parts))
print("Number of "+name+"s that are part of, but not identical to terms: "+str(no_tags_as_term_parts))
if not silent:
print("Number of terms also used as "+name+"s: "+str(no_terms_as_tags));
print("Number of terms that are part of, but not identical to "+name+"s: "+str(no_terms_as_tag_parts))
print("Number of "+name+"s that are part of, but not identical to terms: "+str(no_tags_as_term_parts))
no_identical_terms = 0
global identical_terms
identical_terms = []
Expand Down Expand Up @@ -65,9 +73,10 @@ def analyze_tags(index,tag_index,name="tag"):
contained_tags = contained_tags + [tag]
contained_tag_tuples = contained_tag_tuples + [(tag,term)]
break
print("Number of terms where the requirements list matches exactly the requirements list of a "+name+":"+str(no_identical_terms))
print("Number of terms where the requirements list is contained in the requirements list of a "+name+":"+str(no_contained_terms))
print("Number of "+name+"s where the requirements list is contained in the requirements list of a term:"+str(no_contained_tags))
if not silent:
print("Number of terms where the requirements list matches exactly the requirements list of a "+name+":"+str(no_identical_terms))
print("Number of terms where the requirements list is contained in the requirements list of a "+name+":"+str(no_contained_terms))
print("Number of "+name+"s where the requirements list is contained in the requirements list of a term:"+str(no_contained_tags))

def contained(tuple1,tuple2):
for element in tuple1:
Expand Down

0 comments on commit 037842e

Please sign in to comment.