Skip to content

Commit

Permalink
added additional analysis script
Browse files Browse the repository at this point in the history
  • Loading branch information
tgem committed Apr 19, 2018
1 parent 6aff4f5 commit 037842e
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 9 deletions.
63 changes: 63 additions & 0 deletions src/analysis2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 4 16:58:47 2017
@author: Tim G
"""

"""
Glossary term extraction
"""

import reader
import buildindex
import indexfilter
import analyze_coverage
import analyze_tags
import compare_indices
from pipeline import glossary_extraction
from matplotlib import pyplot as plt

"""
Analysis
"""

# main script

def key_figures(filter_mode=None, description="", threshold_coverage=1):
    """Run one glossary-extraction configuration and print key quality figures.

    Runs the extraction pipeline twice: once over the full input (reported via
    ``compare_indices.analyze_index``) and once restricted to the first 100
    requirements so the result can be scored against the 100-line ground truth.

    Parameters
    ----------
    filter_mode : list of str, optional
        Index-filter stages to apply (e.g. ``["threshold", "specificity"]``).
        Defaults to ``["threshold", "specificity"]``.
    description : str
        Label forwarded to ``compare_indices.analyze_index`` for the report.
    threshold_coverage : int
        Minimum coverage threshold forwarded to the extraction pipeline and to
        the ground-truth index filter.
    """
    # The default used to be a mutable list literal in the signature; use the
    # None-sentinel idiom so a shared default list can never be mutated.
    if filter_mode is None:
        filter_mode = ["threshold", "specificity"]
    # Exposed as a module global, presumably for interactive inspection after
    # a run — TODO confirm no other module depends on it.
    global gt_index
    index, reqs, ids, tag_index = glossary_extraction(filter_mode=filter_mode)
    compare_indices.analyze_index(index, reqs, ids, description=description)
    # Second run: only the first 100 requirements, matching the ground truth.
    short_index, reqs, ids, tag_index = glossary_extraction(threshold_coverage=threshold_coverage,
                                                            max_lines=100,
                                                            tag_mode="load tagger",
                                                            filter_mode=filter_mode)
    gt = reader.gt_read(max_lines=100)
    gt_index = buildindex.tag_index(gt)
    gt_index = indexfilter.index_filter(gt_index, 100,
                                        threshold_coverage=threshold_coverage,
                                        filter_mode=filter_mode)
    analyze_tags.analyze_tags(short_index, gt_index, name="ground truth term", silent=True)
    print("Number of glossary terms extracted from first 100 reqs:", len(short_index))
    print("Number of ground truth terms extracted from first 100 reqs:", len(gt_index))
    # Partial matches: exact term/tag matches plus containment in either
    # direction, as counted by analyze_tags (module-level counters).
    # Computed once instead of duplicating the three-term sum.
    partial_matches = (analyze_tags.no_terms_as_tags
                       + analyze_tags.no_terms_as_tag_parts
                       + analyze_tags.no_tags_as_term_parts)
    print("Recall (regarding ground truth, including partial matches): ",
          partial_matches / len(gt_index))
    print("Precision (regarding ground truth, including partial matches): ",
          partial_matches / len(short_index))

def key_figures_by_threshold(threshold_coverage=5):
    """Run key_figures for every filter configuration at one coverage threshold.

    Compares four configurations: no statistical filter, specificity only,
    relevance (threshold) only, and both combined.
    """
    print("Analysis for threshold value of", threshold_coverage)
    # (filter stages, human-readable label) for each configuration.
    configurations = [
        ([], "Only linguistic"),
        (["specificity"], "Only specificity"),
        (["threshold"], "Only relevance"),
        (["threshold", "specificity"], "Relevance & specificity"),
    ]
    for stages, label in configurations:
        key_figures(filter_mode=stages,
                    description=label,
                    threshold_coverage=threshold_coverage)
    # Blank lines separate this threshold's output from the next one's.
    print()
    print()

# Driver: repeat the whole analysis for each coverage threshold of interest.
for _threshold in (1, 3, 5):
    key_figures_by_threshold(threshold_coverage=_threshold)


27 changes: 18 additions & 9 deletions src/analyze_tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,17 @@
@author: Tim G
"""

def analyze_tags(index,tag_index,name="tag"):
print(" Number of glossary terms: "+str(len(index.keys())))
print(" Number of "+name+": "+str(len(tag_index.keys())))
def analyze_tags(index,tag_index,name="tag",silent=False):
global no_identical_terms
global no_contained_terms
global no_contained_tags
if not silent:
print(" Number of glossary terms: "+str(len(index.keys())))
print(" Number of "+name+": "+str(len(tag_index.keys())))
# how many glossary terms are tags?
global no_terms_as_tags
global no_terms_as_tag_parts
global no_tags_as_term_parts
no_terms_as_tags = 0
no_terms_as_tag_parts = 0
no_tags_as_term_parts = 0
Expand All @@ -30,9 +37,10 @@ def analyze_tags(index,tag_index,name="tag"):
if contained(tag,term):
no_tags_as_term_parts = no_tags_as_term_parts + 1
break
print("Number of terms also used as "+name+"s: "+str(no_terms_as_tags));
print("Number of terms that are part of, but not identical to "+name+"s: "+str(no_terms_as_tag_parts))
print("Number of "+name+"s that are part of, but not identical to terms: "+str(no_tags_as_term_parts))
if not silent:
print("Number of terms also used as "+name+"s: "+str(no_terms_as_tags));
print("Number of terms that are part of, but not identical to "+name+"s: "+str(no_terms_as_tag_parts))
print("Number of "+name+"s that are part of, but not identical to terms: "+str(no_tags_as_term_parts))
no_identical_terms = 0
global identical_terms
identical_terms = []
Expand Down Expand Up @@ -65,9 +73,10 @@ def analyze_tags(index,tag_index,name="tag"):
contained_tags = contained_tags + [tag]
contained_tag_tuples = contained_tag_tuples + [(tag,term)]
break
print("Number of terms where the requirements list matches exactly the requirements list of a "+name+":"+str(no_identical_terms))
print("Number of terms where the requirements list is contained in the requirements list of a "+name+":"+str(no_contained_terms))
print("Number of "+name+"s where the requirements list is contained in the requirements list of a term:"+str(no_contained_tags))
if not silent:
print("Number of terms where the requirements list matches exactly the requirements list of a "+name+":"+str(no_identical_terms))
print("Number of terms where the requirements list is contained in the requirements list of a "+name+":"+str(no_contained_terms))
print("Number of "+name+"s where the requirements list is contained in the requirements list of a term:"+str(no_contained_tags))

def contained(tuple1,tuple2):
for element in tuple1:
Expand Down

0 comments on commit 037842e

Please sign in to comment.