From 06ba03380c1f309309f3e7cb484d736b7d3df1c3 Mon Sep 17 00:00:00 2001 From: Zhe Zhang <43827532+zzhang13@users.noreply.github.com> Date: Tue, 1 Sep 2020 18:09:50 -0400 Subject: [PATCH] fix: fix a bug when calculating coverage (#95) --- requirements.txt | 4 ++-- setup.py | 3 +-- src/assistant_improve_toolkit/computation_func.py | 13 ++++++++----- src/assistant_improve_toolkit/version.py | 2 +- src/assistant_improve_toolkit/visualize_func.py | 2 +- src/main/python/computation_func.py | 11 +++++++---- 6 files changed, 20 insertions(+), 15 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0fed5c0..bff1ec5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ ipython -pandas==1.1.0 +pandas==1.0.3 bokeh==2.0.0 tqdm==4.43.0 matplotlib==3.2.1 @@ -7,5 +7,5 @@ XlsxWriter==1.2.8 ibm-watson>=4.3.0 numpy==1.18.2 requests>=2.18.4 -scikit-learn==0.22.2.post1 +scikit-learn>=0.21.3 xlrd==1.2.0 \ No newline at end of file diff --git a/setup.py b/setup.py index 6713911..dfa6119 100644 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ def read_md(f): package_dir={'': 'src'}, packages=setuptools.find_packages('src'), install_requires=[ - 'pandas>=1.0.3', + 'pandas==1.0.3', 'bokeh==2.0.0', 'tqdm==4.43.0', 'scikit-learn>=0.21.3', @@ -60,7 +60,6 @@ def read_md(f): 'ibm-watson>=4.3.0', 'numpy>=1.18.2', 'requests>=2.18.4', - 'scikit-learn==0.22.2.post1', 'xlrd==1.2.0' ], diff --git a/src/assistant_improve_toolkit/computation_func.py b/src/assistant_improve_toolkit/computation_func.py index a566e95..1e1c628 100644 --- a/src/assistant_improve_toolkit/computation_func.py +++ b/src/assistant_improve_toolkit/computation_func.py @@ -21,7 +21,7 @@ import itertools import numpy as np from IPython.display import HTML -from tqdm.notebook import tqdm +from tqdm import tqdm MAX_DISAMBIGUATION_LENGTH = 5 MAX_MORE_OPTION_LENGTH = 5 @@ -174,14 +174,17 @@ def get_coverage_df(df_tbot_raw, df_coverage_nodes, conf_threshold): df_tbot_raw['Not Covered cause'] = None # Filter all the valid dialog node ids for non-coverage - df_coverage_valid = df_coverage_nodes[df_coverage_nodes['Valid']] # ['dialog_node'].tolist() + df_coverage_valid = df_coverage_nodes[df_coverage_nodes['Valid']] + df_coverage_valid_dict = dict() + for idx, row in df_coverage_nodes[df_coverage_nodes['Valid']].iterrows(): + df_coverage_valid_dict[row['Node ID']] = {row['Node ID'], row['Node Name']} # (1) Mark all messages that hit any non-coverage node including but not limited to 'anything_else' as 'Not covered' # and update the 'Not Covered cause' column - for node in df_coverage_valid['Node ID'].tolist(): - cause = "'{}' node".format(df_coverage_valid.loc[df_coverage_valid['Node ID'] == node, 'Condition'].values[0]) + for node_id, name_set in df_coverage_valid_dict.items(): + cause = "'{}' node".format(df_coverage_valid.loc[df_coverage_valid['Node ID'] == node_id, 'Condition'].values[0]) df_tbot_raw.loc[ - (df_tbot_raw['response.output.nodes_visited_s'].apply(lambda x: bool(intersection(x, node)))), [ + (df_tbot_raw['response.output.nodes_visited_s'].apply(lambda x: bool(intersection(x, name_set)))), [ 'Covered', 'Not Covered cause']] = [False, cause] # (2) Mark all messages that did not meet confidence threshold set as 'Not covered' and update the 'Not Covered diff --git a/src/assistant_improve_toolkit/version.py b/src/assistant_improve_toolkit/version.py index 59f94b2..64778fc 100644 --- a/src/assistant_improve_toolkit/version.py +++ b/src/assistant_improve_toolkit/version.py @@ -1 +1 @@ -__version__ = '__version__ = '1.1.4'' \ No newline at end of file +__version__ = '1.1.4' \ No newline at end of file diff --git a/src/assistant_improve_toolkit/visualize_func.py b/src/assistant_improve_toolkit/visualize_func.py index bcf15e5..72f20ab 100644 --- a/src/assistant_improve_toolkit/visualize_func.py +++ b/src/assistant_improve_toolkit/visualize_func.py @@ -382,7 +382,7 @@ def show_coverage_over_time(df_coverage, interval='day'): start_datetime -= delta end_datetime += delta - time_index_df = pd.DataFrame([dt for dt in datetime_range(start_datetime, end_datetime, delta)], + time_index_df = pd.DataFrame([dt for dt in coverage_time.response_datetime_interval], columns=['response_datetime_interval']) coverage_data = time_index_df.merge(coverage_time, how='left', on=['response_datetime_interval']) diff --git a/src/main/python/computation_func.py b/src/main/python/computation_func.py index 447e4a3..5567974 100644 --- a/src/main/python/computation_func.py +++ b/src/main/python/computation_func.py @@ -173,14 +173,17 @@ def get_coverage_df(df_tbot_raw, df_coverage_nodes, conf_threshold): df_tbot_raw['Not Covered cause'] = None # Filter all the valid dialog node ids for non-coverage - df_coverage_valid = df_coverage_nodes[df_coverage_nodes['Valid']] # ['dialog_node'].tolist() + df_coverage_valid = df_coverage_nodes[df_coverage_nodes['Valid']] + df_coverage_valid_dict = dict() + for idx, row in df_coverage_nodes[df_coverage_nodes['Valid']].iterrows(): + df_coverage_valid_dict[row['Node ID']] = {row['Node ID'], row['Node Name']} # (1) Mark all messages that hit any non-coverage node including but not limited to 'anything_else' as 'Not covered' # and update the 'Not Covered cause' column - for node in df_coverage_valid['Node ID'].tolist(): - cause = "'{}' node".format(df_coverage_valid.loc[df_coverage_valid['Node ID'] == node, 'Condition'].values[0]) + for node_id, name_set in df_coverage_valid_dict.items(): + cause = "'{}' node".format(df_coverage_valid.loc[df_coverage_valid['Node ID'] == node_id, 'Condition'].values[0]) df_tbot_raw.loc[ - (df_tbot_raw['response.output.nodes_visited_s'].apply(lambda x: bool(intersection(x, node)))), [ + (df_tbot_raw['response.output.nodes_visited_s'].apply(lambda x: bool(intersection(x, name_set)))), [ 'Covered', 'Not Covered cause']] = [False, cause] # (2) Mark all messages that did not meet confidence threshold set as 'Not covered' and update the 'Not Covered