fix: fix a bug when calculating coverage (#95)

watson-developer-cloud · Sep 1, 2020 · 06ba033 · 06ba033
1 parent 10babfe
commit 06ba033
Show file tree

Hide file tree

Showing 6 changed files with 20 additions and 15 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -1,11 +1,11 @@
 ipython
-pandas==1.1.0
+pandas==1.0.3
 bokeh==2.0.0
 tqdm==4.43.0
 matplotlib==3.2.1
 XlsxWriter==1.2.8
 ibm-watson>=4.3.0
 numpy==1.18.2
 requests>=2.18.4
-scikit-learn==0.22.2.post1
+scikit-learn>=0.21.3
 xlrd==1.2.0
diff --git a/setup.py b/setup.py
@@ -51,7 +51,7 @@ def read_md(f):
     package_dir={'': 'src'},
     packages=setuptools.find_packages('src'),
     install_requires=[
-        'pandas>=1.0.3',
+        'pandas==1.0.3',
         'bokeh==2.0.0',
         'tqdm==4.43.0',
         'scikit-learn>=0.21.3',
@@ -60,7 +60,6 @@ def read_md(f):
         'ibm-watson>=4.3.0',
         'numpy>=1.18.2',
         'requests>=2.18.4',
-        'scikit-learn==0.22.2.post1',
         'xlrd==1.2.0'
     ],
 

diff --git a/src/assistant_improve_toolkit/computation_func.py b/src/assistant_improve_toolkit/computation_func.py
@@ -21,7 +21,7 @@
 import itertools
 import numpy as np
 from IPython.display import HTML
-from tqdm.notebook import tqdm
+from tqdm import tqdm
 
 MAX_DISAMBIGUATION_LENGTH = 5
 MAX_MORE_OPTION_LENGTH = 5
@@ -174,14 +174,17 @@ def get_coverage_df(df_tbot_raw, df_coverage_nodes, conf_threshold):
     df_tbot_raw['Not Covered cause'] = None
 
     # Filter all the valid dialog node ids for non-coverage
-    df_coverage_valid = df_coverage_nodes[df_coverage_nodes['Valid']]  # ['dialog_node'].tolist()
+    df_coverage_valid = df_coverage_nodes[df_coverage_nodes['Valid']]
+    df_coverage_valid_dict = dict()
+    for idx, row in df_coverage_nodes[df_coverage_nodes['Valid']].iterrows():
+        df_coverage_valid_dict[row['Node ID']] = {row['Node ID'], row['Node Name']}
 
     # (1) Mark all messages that hit any non-coverage node including but not limited to 'anything_else' as 'Not covered'
     #  and update the 'Not Covered cause' column
-    for node in df_coverage_valid['Node ID'].tolist():
-        cause = "'{}' node".format(df_coverage_valid.loc[df_coverage_valid['Node ID'] == node, 'Condition'].values[0])
+    for node_id, name_set in df_coverage_valid_dict.items():
+        cause = "'{}' node".format(df_coverage_valid.loc[df_coverage_valid['Node ID'] == node_id, 'Condition'].values[0])
         df_tbot_raw.loc[
-            (df_tbot_raw['response.output.nodes_visited_s'].apply(lambda x: bool(intersection(x, node)))), [
+            (df_tbot_raw['response.output.nodes_visited_s'].apply(lambda x: bool(intersection(x, name_set)))), [
                 'Covered', 'Not Covered cause']] = [False, cause]
 
     # (2) Mark all messages  that did not meet confidence threshold set as 'Not covered' and update the 'Not Covered

diff --git a/src/assistant_improve_toolkit/version.py b/src/assistant_improve_toolkit/version.py
@@ -1 +1 @@
-__version__ = '__version__ = '1.1.4''
+__version__ = '1.1.4'
diff --git a/src/assistant_improve_toolkit/visualize_func.py b/src/assistant_improve_toolkit/visualize_func.py
@@ -382,7 +382,7 @@ def show_coverage_over_time(df_coverage, interval='day'):
             start_datetime -= delta
             end_datetime += delta
 
-        time_index_df = pd.DataFrame([dt for dt in datetime_range(start_datetime, end_datetime, delta)],
+        time_index_df = pd.DataFrame([dt for dt in coverage_time.response_datetime_interval],
                                      columns=['response_datetime_interval'])
 
         coverage_data = time_index_df.merge(coverage_time, how='left', on=['response_datetime_interval'])

diff --git a/src/main/python/computation_func.py b/src/main/python/computation_func.py
@@ -173,14 +173,17 @@ def get_coverage_df(df_tbot_raw, df_coverage_nodes, conf_threshold):
     df_tbot_raw['Not Covered cause'] = None
 
     # Filter all the valid dialog node ids for non-coverage
-    df_coverage_valid = df_coverage_nodes[df_coverage_nodes['Valid']]  # ['dialog_node'].tolist()
+    df_coverage_valid = df_coverage_nodes[df_coverage_nodes['Valid']]
+    df_coverage_valid_dict = dict()
+    for idx, row in df_coverage_nodes[df_coverage_nodes['Valid']].iterrows():
+        df_coverage_valid_dict[row['Node ID']] = {row['Node ID'], row['Node Name']}
 
     # (1) Mark all messages that hit any non-coverage node including but not limited to 'anything_else' as 'Not covered'
     #  and update the 'Not Covered cause' column
-    for node in df_coverage_valid['Node ID'].tolist():
-        cause = "'{}' node".format(df_coverage_valid.loc[df_coverage_valid['Node ID'] == node, 'Condition'].values[0])
+    for node_id, name_set in df_coverage_valid_dict.items():
+        cause = "'{}' node".format(df_coverage_valid.loc[df_coverage_valid['Node ID'] == node_id, 'Condition'].values[0])
         df_tbot_raw.loc[
-            (df_tbot_raw['response.output.nodes_visited_s'].apply(lambda x: bool(intersection(x, node)))), [
+            (df_tbot_raw['response.output.nodes_visited_s'].apply(lambda x: bool(intersection(x, name_set)))), [
                 'Covered', 'Not Covered cause']] = [False, cause]
 
     # (2) Mark all messages  that did not meet confidence threshold set as 'Not covered' and update the 'Not Covered
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-__version__ = '__version__ = '1.1.4''
		+__version__ = '1.1.4'