diff --git a/examples/featurize.py b/examples/featurize.py index 8bb3c232..2d08ad46 100644 --- a/examples/featurize.py +++ b/examples/featurize.py @@ -18,6 +18,9 @@ juries_df = pd.read_csv("./example_data/full_empirical_datasets/jury_conversations_with_outcome_var.csv", encoding='utf-8') csop_df = pd.read_csv("./example_data/full_empirical_datasets/csop_conversations_withblanks.csv", encoding='utf-8') csopII_df = pd.read_csv("./example_data/full_empirical_datasets/csopII_conversations_withblanks.csv", encoding='utf-8') + test_df = pd.read_csv("C:/Users/amyta/Documents/GitHub/team_comm_tools/tests/data/cleaned_data/test_package_aggregation.csv", encoding='utf-8') + + # C:/Users/amyta/Documents/GitHub/team_comm_tools/tests/data/cleaned_data/test_package_aggregation.csv """ TINY / TEST DATASETS ------------------------------- diff --git a/src/team_comm_tools/feature_builder.py b/src/team_comm_tools/feature_builder.py index 3fc1f30e..5c19aaf7 100644 --- a/src/team_comm_tools/feature_builder.py +++ b/src/team_comm_tools/feature_builder.py @@ -401,6 +401,15 @@ def __init__( self.original_vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name + if custom_vect_path is not None: + print("Detected that user has requested custom vectors...") + print("We will generate features using custom vectors rather than default SBERT") + self.vect_path = custom_vect_path + else: + self.vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name + + self.original_vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name + self.bert_path = vector_directory + "sentiment/" + ("turns" if self.turns else "chats") + "/" + base_file_name # Check + generate embeddings @@ -417,6 +426,8 @@ def __init__( if(not need_sentiment and feature_dict[feature]["bert_sentiment_data"]): need_sentiment = True + # preprocess chat data again + self.preprocess_chat_data() # preprocess chat data again self.preprocess_chat_data() check_embeddings(self.chat_data, self.vect_path, self.bert_path, need_sentence, need_sentiment, self.regenerate_vectors, self.message_col) diff --git a/src/team_comm_tools/utils/calculate_conversation_level_features.py b/src/team_comm_tools/utils/calculate_conversation_level_features.py index 95262f27..158c9d67 100644 --- a/src/team_comm_tools/utils/calculate_conversation_level_features.py +++ b/src/team_comm_tools/utils/calculate_conversation_level_features.py @@ -8,6 +8,7 @@ from team_comm_tools.utils.gini_coefficient import * from team_comm_tools.utils.preprocess import * from fuzzywuzzy import process +from fuzzywuzzy import process class ConversationLevelFeaturesCalculator: """ @@ -245,6 +246,8 @@ def __init__(self, chat_data: pd.DataFrame, self.summable_columns = ["num_words", "num_chars", "num_messages"] + + def calculate_conversation_level_features(self, feature_methods: list) -> pd.DataFrame: """ Main driver function for creating conversation-level features. @@ -317,6 +320,18 @@ def get_conversation_level_aggregates(self) -> None: :rtype: None """ + if self.convo_aggregation == True: + # For each summarizable feature + for column in self.columns_to_summarize: + + # Average/Mean of feature across the Conversation + if 'mean' in self.convo_methods: + self.conv_data = pd.merge( + left=self.conv_data, + right=get_mean(self.chat_data.copy(), column, 'mean_'+column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) if self.convo_aggregation == True: # For each summarizable feature for column in self.columns_to_summarize: @@ -331,6 +346,14 @@ def get_conversation_level_aggregates(self) -> None: ) # Standard Deviation of feature across the Conversation + if 'stdev' in self.convo_methods: + self.conv_data = pd.merge( + left=self.conv_data, + right=get_stdev(self.chat_data.copy(), column, 'stdev_'+column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) + # Standard Deviation of feature across the Conversation if 'stdev' in self.convo_methods: self.conv_data = pd.merge( left=self.conv_data, @@ -347,7 +370,34 @@ def get_conversation_level_aggregates(self) -> None: on=[self.conversation_id_col], how="inner" ) + # Minima for the feature across the Conversation + if 'min' in self.convo_methods: + self.conv_data = pd.merge( + left=self.conv_data, + right=get_min(self.chat_data.copy(), column, 'min_'+column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) + + # Maxima for the feature across the Conversation + if 'max' in self.convo_methods: + self.conv_data = pd.merge( + left=self.conv_data, + right=get_max(self.chat_data.copy(), column, 'max_'+column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) + + # Median for the feature across the Conversation + if 'median' in self.convo_methods: + self.conv_data = pd.merge( + left=self.conv_data, + right=get_median(self.chat_data.copy(), column, 'median_'+column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) + # Do this only for the columns that make sense (e.g., countable things); we do this regardless of aggregation, as it's necessary for gini. # Maxima for the feature across the Conversation if 'max' in self.convo_methods: self.conv_data = pd.merge( @@ -376,6 +426,7 @@ def get_conversation_level_aggregates(self) -> None: how="inner" ) + def get_user_level_aggregates(self) -> None: """ Aggregate summary statistics from user-level features to conversation-level features. @@ -391,13 +442,26 @@ def get_user_level_aggregates(self) -> None: - Maximum of averaged user-level features + :return: None :rtype: None """ + if self.convo_aggregation == True and self.user_aggregation == True: if self.convo_aggregation == True and self.user_aggregation == True: # aggregates from the user level based on conversation methods + if 'mean' in self.convo_methods: + for user_column in self.user_columns: + for user_method in self.user_methods: + # Average/Mean of User-Level Feature + self.conv_data = pd.merge( + left=self.conv_data, + right=get_mean(self.user_data.copy(), user_method + "_" +user_column, "mean_user_" + user_method + "_" +user_column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) + # aggregates from the user level based on conversation methods if 'mean' in self.convo_methods: for user_column in self.user_columns: for user_method in self.user_methods: @@ -409,6 +473,16 @@ def get_user_level_aggregates(self) -> None: how="inner" ) + if 'stdev' in self.convo_methods: + for user_column in self.user_columns: + for user_method in self.user_methods: + # Standard Deviation of User-Level Feature + self.conv_data = pd.merge( + left=self.conv_data, + right=get_stdev(self.user_data.copy(), user_method + "_" + user_column, 'stdev_user_' + user_method + "_" + user_column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) if 'stdev' in self.convo_methods: for user_column in self.user_columns: for user_method in self.user_methods: @@ -452,6 +526,38 @@ def get_user_level_aggregates(self) -> None: on=[self.conversation_id_col], how="inner" ) + if 'min' in self.convo_methods: + for user_column in self.user_columns: + for user_method in self.user_methods: + # Minima of User-Level Feature + self.conv_data = pd.merge( + left=self.conv_data, + right=get_min(self.user_data.copy(), user_method + "_" + user_column, 'min_user_' + user_method + "_" + user_column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) + + if 'max' in self.convo_methods: + for user_column in self.user_columns: + for user_method in self.user_methods: + # Maxima of User-Level Feature + self.conv_data = pd.merge( + left=self.conv_data, + right=get_max(self.user_data.copy(), user_method + "_" + user_column, 'max_user_' + user_method + "_" + user_column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) + + if 'median' in self.convo_methods: + for user_column in self.user_columns: + for user_method in self.user_methods: + # Median of User-Level Feature + self.conv_data = pd.merge( + left=self.conv_data, + right=get_median(self.user_data.copy(), user_method + "_" + user_column, 'median_user_' + user_method + "_" + user_column, self.conversation_id_col), + on=[self.conversation_id_col], + how="inner" + ) def get_discursive_diversity_features(self) -> None: diff --git a/src/team_comm_tools/utils/calculate_user_level_features.py b/src/team_comm_tools/utils/calculate_user_level_features.py index 5112d053..9a66e873 100644 --- a/src/team_comm_tools/utils/calculate_user_level_features.py +++ b/src/team_comm_tools/utils/calculate_user_level_features.py @@ -1,8 +1,10 @@ # Importing modules from features from team_comm_tools.utils.summarize_features import get_user_sum_dataframe, get_user_mean_dataframe, get_user_max_dataframe, get_user_min_dataframe, get_user_stdev_dataframe, get_user_median_dataframe +from team_comm_tools.utils.summarize_features import get_user_sum_dataframe, get_user_mean_dataframe, get_user_max_dataframe, get_user_min_dataframe, get_user_stdev_dataframe, get_user_median_dataframe from team_comm_tools.features.get_user_network import * from team_comm_tools.features.user_centroids import * from fuzzywuzzy import process +from fuzzywuzzy import process class UserLevelFeaturesCalculator: """ @@ -148,6 +150,8 @@ def calculate_user_level_features(self) -> pd.DataFrame: :rtype: pd.DataFrame """ + # Get mean features for all features + # self.get_user_level_mean_features() # Get mean features for all features # self.get_user_level_mean_features() @@ -157,6 +161,9 @@ def calculate_user_level_features(self) -> pd.DataFrame: # Get user summary statistics for all features (e.g. mean, min, max, stdev) self.get_user_level_summary_statistics_features() + # Get user summary statistics for all features (e.g. mean, min, max, stdev) + self.get_user_level_summary_statistics_features() + # Get 4 discursive features (discursive diversity, variance in DD, incongruent modulation, within-person discursive range) # self.get_centroids() @@ -227,6 +234,55 @@ def get_user_level_summary_statistics_features(self) -> None: on=[self.conversation_id_col, self.speaker_id_col], how="inner" ) + + if self.user_aggregation == True: + # For each summarizable feature + for column in self.columns_to_summarize: + + # Average/Mean of feature across the User + if 'mean' in self.user_methods: + self.user_data = pd.merge( + left=self.user_data, + right=get_user_mean_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) + + # Maxima for the feature across the User + if 'max' in self.user_methods: + self.user_data = pd.merge( + left=self.user_data, + right=get_user_max_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) + + # Minima for the feature across the User + if 'min' in self.user_methods: + self.user_data = pd.merge( + left=self.user_data, + right=get_user_min_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) + + # Standard Deviation of feature across the User + if 'stdev' in self.user_methods: + self.user_data = pd.merge( + left=self.user_data, + right=get_user_stdev_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) + + # Median of feature across the User + if 'median' in self.user_methods: + self.user_data = pd.merge( + left=self.user_data, + right=get_user_median_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) def get_user_level_summed_features(self) -> None: """ @@ -244,7 +300,10 @@ def get_user_level_summed_features(self) -> None: :rtype: None """ + # For each summarizable feature + for column in self.summable_columns: + for column in self.summable_columns: # Sum of feature across the Conversation @@ -278,10 +337,36 @@ def get_user_level_summed_features(self) -> None: # how="inner" # ) + def get_user_level_mean_features(self) -> None: + # if self.user_aggregation == True: + + # # For each summarizable feature + # for column in self.summable_columns: + + # # Sum of feature across the Conversation + # self.user_data = pd.merge( + # left=self.user_data, + # right=get_user_sum_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + # on=[self.conversation_id_col, self.speaker_id_col], + # how="inner" + # ) + + # for column in self.columns_to_summarize: # TODO --- Gini depends on the summation happening; something is happening here where it's causing Gini to break. + # if column not in self.summable_columns: + # # Sum of feature across the Conversation + # self.user_data = pd.merge( + # left=self.user_data, + # right=get_user_sum_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + # on=[self.conversation_id_col, self.speaker_id_col], + # how="inner" + # ) + def get_user_level_mean_features(self) -> None: """ Aggregate summary statistics by calculating mean user-level features from chat-level features. + Aggregate summary statistics by calculating mean user-level features from chat-level features. + This function calculates and merges the mean features into the user-level data. This function calculates and merges the mean features into the user-level data. :return: None @@ -301,6 +386,20 @@ def get_user_level_mean_features(self) -> None: how="inner" ) + + if self.user_aggregation == True: + # For each summarizable feature + for column in self.columns_to_summarize: + + if 'mean' in self.user_methods: + # Average/Mean of feature across the User + self.user_data = pd.merge( + left=self.user_data, + right=get_user_mean_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col), + on=[self.conversation_id_col, self.speaker_id_col], + how="inner" + ) + def get_centroids(self) -> None: """ diff --git a/src/team_comm_tools/utils/check_embeddings.py b/src/team_comm_tools/utils/check_embeddings.py index 2c56b3b6..2326cba6 100644 --- a/src/team_comm_tools/utils/check_embeddings.py +++ b/src/team_comm_tools/utils/check_embeddings.py @@ -15,6 +15,8 @@ from scipy.special import softmax from transformers import logging +from team_comm_tools.utils.preprocess import * + logging.set_verbosity(40) # only log errors model_vect = SentenceTransformer('all-MiniLM-L6-v2') @@ -24,7 +26,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false" # Check if embeddings exist -def check_embeddings(chat_data, vect_path, bert_path, need_sentence, need_sentiment, regenerate_vectors, message_col = "message"): +def check_embeddings(chat_data, vect_path, bert_path, need_sentence, need_sentiment, regenerate_vectors, original_vect_path, message_col = "message"): """ Check if embeddings and required lexicons exist, and generate them if they don't. @@ -43,6 +45,8 @@ def check_embeddings(chat_data, vect_path, bert_path, need_sentence, need_sentim :type need_sentiment: bool :param regenerate_vectors: If true, will regenerate vector data even if it already exists :type regenerate_vectors: bool, optional + :param original_vect_path: Default vector path if users do not provide custom vectors + :type original_vect_path: str, optional :param message_col: A string representing the column name that should be selected as the message. Defaults to "message". :type message_col: str, optional @@ -59,10 +63,33 @@ def check_embeddings(chat_data, vect_path, bert_path, need_sentence, need_sentim # check whether the given vector and bert data matches length of chat data if len(vector_df) != len(chat_data): print("ERROR: The length of the vector data does not match the length of the chat data. Regenerating...") - generate_vect(chat_data, vect_path, message_col) + if vect_path == original_vect_path: + print("WARNING: custom_vect_path and vector_directory are the same. Overwriting custom_vect_path...") + generate_vect(chat_data, original_vect_path, message_col) + + if "message_embedding" not in vector_df.columns: + print("ERROR: The provided vectors do not contain a message_embedding column. Regenerating...") + if vect_path == original_vect_path: + print("WARNING: custom_vect_path and vector_directory are the same. Overwriting custom_vect_path...") + generate_vect(chat_data, original_vect_path, message_col) + else: + if vector_df['message_embedding'].apply(lambda x: isinstance(x, list) and all(isinstance(i, (int, float, np.number)) for i in x)).all() == False: + print("ERROR: message_embedding columns does not only contain numeric lists. Regenerating...") + if vect_path == original_vect_path: + print("WARNING: custom_vect_path and vector_directory are the same. Overwriting custom_vect_path...") + generate_vect(chat_data, original_vect_path, message_col) + + preprocess_vect_data(vector_df, message_col) + + if vector_df["message"].equals(chat_data["message"]) == False: + print("ERROR: The provided vectors do not match the chat data. Regenerating...") + if vect_path == original_vect_path: + print("WARNING: custom_vect_path and vector_directory are the same. Overwriting custom_vect_path...") + generate_vect(chat_data, original_vect_path, message_col) + except FileNotFoundError: # It's OK if we don't have the path, if the sentence vectors are not necessary if need_sentence: - generate_vect(chat_data, vect_path, message_col) + generate_vect(chat_data, original_vect_path, message_col) try: bert_df = pd.read_csv(bert_path) @@ -256,4 +283,4 @@ def get_sentiment(texts): sent_df = pd.DataFrame(np.nan, index=texts_series.index, columns=['positive_bert', 'negative_bert', 'neutral_bert']) sent_df.loc[texts_series.apply(lambda x: pd.notnull(x) and x.strip() != ''), ['positive_bert', 'negative_bert', 'neutral_bert']] = non_null_sent_df.values - return sent_df \ No newline at end of file + return sent_df