Skip to content

Commit

Permalink
Merge branch 'amy/package_v2' of https://github.com/Watts-Lab/team_comm_tools into amy/package_v2
Browse files Browse the repository at this point in the history
  • Loading branch information
amytangzheng committed Nov 6, 2024
2 parents b10bdee + 653e386 commit d007ae8
Show file tree
Hide file tree
Showing 5 changed files with 250 additions and 4 deletions.
3 changes: 3 additions & 0 deletions examples/featurize.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
juries_df = pd.read_csv("./example_data/full_empirical_datasets/jury_conversations_with_outcome_var.csv", encoding='utf-8')
csop_df = pd.read_csv("./example_data/full_empirical_datasets/csop_conversations_withblanks.csv", encoding='utf-8')
csopII_df = pd.read_csv("./example_data/full_empirical_datasets/csopII_conversations_withblanks.csv", encoding='utf-8')
test_df = pd.read_csv("C:/Users/amyta/Documents/GitHub/team_comm_tools/tests/data/cleaned_data/test_package_aggregation.csv", encoding='utf-8')

# C:/Users/amyta/Documents/GitHub/team_comm_tools/tests/data/cleaned_data/test_package_aggregation.csv

"""
TINY / TEST DATASETS -------------------------------
Expand Down
11 changes: 11 additions & 0 deletions src/team_comm_tools/feature_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,15 @@ def __init__(

self.original_vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name

if custom_vect_path is not None:
print("Detected that user has requested custom vectors...")
print("We will generate features using custom vectors rather than default SBERT")
self.vect_path = custom_vect_path
else:
self.vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name

self.original_vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name

self.bert_path = vector_directory + "sentiment/" + ("turns" if self.turns else "chats") + "/" + base_file_name

# Check + generate embeddings
Expand All @@ -417,6 +426,8 @@ def __init__(
if(not need_sentiment and feature_dict[feature]["bert_sentiment_data"]):
need_sentiment = True

# preprocess chat data again
self.preprocess_chat_data()
# preprocess chat data again
self.preprocess_chat_data()
check_embeddings(self.chat_data, self.vect_path, self.bert_path, need_sentence, need_sentiment, self.regenerate_vectors, self.message_col)
Expand Down
106 changes: 106 additions & 0 deletions src/team_comm_tools/utils/calculate_conversation_level_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from team_comm_tools.utils.gini_coefficient import *
from team_comm_tools.utils.preprocess import *
from fuzzywuzzy import process
from fuzzywuzzy import process

class ConversationLevelFeaturesCalculator:
"""
Expand Down Expand Up @@ -245,6 +246,8 @@ def __init__(self, chat_data: pd.DataFrame,
self.summable_columns = ["num_words", "num_chars", "num_messages"]




def calculate_conversation_level_features(self, feature_methods: list) -> pd.DataFrame:
"""
Main driver function for creating conversation-level features.
Expand Down Expand Up @@ -317,6 +320,18 @@ def get_conversation_level_aggregates(self) -> None:
:rtype: None
"""

if self.convo_aggregation == True:
# For each summarizable feature
for column in self.columns_to_summarize:

# Average/Mean of feature across the Conversation
if 'mean' in self.convo_methods:
self.conv_data = pd.merge(
left=self.conv_data,
right=get_mean(self.chat_data.copy(), column, 'mean_'+column, self.conversation_id_col),
on=[self.conversation_id_col],
how="inner"
)
if self.convo_aggregation == True:
# For each summarizable feature
for column in self.columns_to_summarize:
Expand All @@ -331,6 +346,14 @@ def get_conversation_level_aggregates(self) -> None:
)

# Standard Deviation of feature across the Conversation
if 'stdev' in self.convo_methods:
self.conv_data = pd.merge(
left=self.conv_data,
right=get_stdev(self.chat_data.copy(), column, 'stdev_'+column, self.conversation_id_col),
on=[self.conversation_id_col],
how="inner"
)
# Standard Deviation of feature across the Conversation
if 'stdev' in self.convo_methods:
self.conv_data = pd.merge(
left=self.conv_data,
Expand All @@ -347,7 +370,34 @@ def get_conversation_level_aggregates(self) -> None:
on=[self.conversation_id_col],
how="inner"
)
# Minima for the feature across the Conversation
if 'min' in self.convo_methods:
self.conv_data = pd.merge(
left=self.conv_data,
right=get_min(self.chat_data.copy(), column, 'min_'+column, self.conversation_id_col),
on=[self.conversation_id_col],
how="inner"
)

# Maxima for the feature across the Conversation
if 'max' in self.convo_methods:
self.conv_data = pd.merge(
left=self.conv_data,
right=get_max(self.chat_data.copy(), column, 'max_'+column, self.conversation_id_col),
on=[self.conversation_id_col],
how="inner"
)

# Median for the feature across the Conversation
if 'median' in self.convo_methods:
self.conv_data = pd.merge(
left=self.conv_data,
right=get_median(self.chat_data.copy(), column, 'median_'+column, self.conversation_id_col),
on=[self.conversation_id_col],
how="inner"
)

# Do this only for the columns that make sense (e.g., countable things); we do this regardless of aggregation, as it's necessary for gini.
# Maxima for the feature across the Conversation
if 'max' in self.convo_methods:
self.conv_data = pd.merge(
Expand Down Expand Up @@ -376,6 +426,7 @@ def get_conversation_level_aggregates(self) -> None:
how="inner"
)


def get_user_level_aggregates(self) -> None:
"""
Aggregate summary statistics from user-level features to conversation-level features.
Expand All @@ -391,13 +442,26 @@ def get_user_level_aggregates(self) -> None:
- Maximum of averaged user-level features
:return: None
:rtype: None
"""

if self.convo_aggregation == True and self.user_aggregation == True:
if self.convo_aggregation == True and self.user_aggregation == True:

# aggregates from the user level based on conversation methods
if 'mean' in self.convo_methods:
for user_column in self.user_columns:
for user_method in self.user_methods:
# Average/Mean of User-Level Feature
self.conv_data = pd.merge(
left=self.conv_data,
right=get_mean(self.user_data.copy(), user_method + "_" +user_column, "mean_user_" + user_method + "_" +user_column, self.conversation_id_col),
on=[self.conversation_id_col],
how="inner"
)
# aggregates from the user level based on conversation methods
if 'mean' in self.convo_methods:
for user_column in self.user_columns:
for user_method in self.user_methods:
Expand All @@ -409,6 +473,16 @@ def get_user_level_aggregates(self) -> None:
how="inner"
)

if 'stdev' in self.convo_methods:
for user_column in self.user_columns:
for user_method in self.user_methods:
# Standard Deviation of User-Level Feature
self.conv_data = pd.merge(
left=self.conv_data,
right=get_stdev(self.user_data.copy(), user_method + "_" + user_column, 'stdev_user_' + user_method + "_" + user_column, self.conversation_id_col),
on=[self.conversation_id_col],
how="inner"
)
if 'stdev' in self.convo_methods:
for user_column in self.user_columns:
for user_method in self.user_methods:
Expand Down Expand Up @@ -452,6 +526,38 @@ def get_user_level_aggregates(self) -> None:
on=[self.conversation_id_col],
how="inner"
)
if 'min' in self.convo_methods:
for user_column in self.user_columns:
for user_method in self.user_methods:
# Minima of User-Level Feature
self.conv_data = pd.merge(
left=self.conv_data,
right=get_min(self.user_data.copy(), user_method + "_" + user_column, 'min_user_' + user_method + "_" + user_column, self.conversation_id_col),
on=[self.conversation_id_col],
how="inner"
)

if 'max' in self.convo_methods:
for user_column in self.user_columns:
for user_method in self.user_methods:
# Maxima of User-Level Feature
self.conv_data = pd.merge(
left=self.conv_data,
right=get_max(self.user_data.copy(), user_method + "_" + user_column, 'max_user_' + user_method + "_" + user_column, self.conversation_id_col),
on=[self.conversation_id_col],
how="inner"
)

if 'median' in self.convo_methods:
for user_column in self.user_columns:
for user_method in self.user_methods:
# Median of User-Level Feature
self.conv_data = pd.merge(
left=self.conv_data,
right=get_median(self.user_data.copy(), user_method + "_" + user_column, 'median_user_' + user_method + "_" + user_column, self.conversation_id_col),
on=[self.conversation_id_col],
how="inner"
)


def get_discursive_diversity_features(self) -> None:
Expand Down
99 changes: 99 additions & 0 deletions src/team_comm_tools/utils/calculate_user_level_features.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
# Importing modules from features
from team_comm_tools.utils.summarize_features import get_user_sum_dataframe, get_user_mean_dataframe, get_user_max_dataframe, get_user_min_dataframe, get_user_stdev_dataframe, get_user_median_dataframe
from team_comm_tools.utils.summarize_features import get_user_sum_dataframe, get_user_mean_dataframe, get_user_max_dataframe, get_user_min_dataframe, get_user_stdev_dataframe, get_user_median_dataframe
from team_comm_tools.features.get_user_network import *
from team_comm_tools.features.user_centroids import *
from fuzzywuzzy import process
from fuzzywuzzy import process

class UserLevelFeaturesCalculator:
"""
Expand Down Expand Up @@ -148,6 +150,8 @@ def calculate_user_level_features(self) -> pd.DataFrame:
:rtype: pd.DataFrame
"""

# Get mean features for all features
# self.get_user_level_mean_features()
# Get mean features for all features
# self.get_user_level_mean_features()

Expand All @@ -157,6 +161,9 @@ def calculate_user_level_features(self) -> pd.DataFrame:
# Get user summary statistics for all features (e.g. mean, min, max, stdev)
self.get_user_level_summary_statistics_features()

# Get user summary statistics for all features (e.g. mean, min, max, stdev)
self.get_user_level_summary_statistics_features()

# Get 4 discursive features (discursive diversity, variance in DD, incongruent modulation, within-person discursive range)
# self.get_centroids()

Expand Down Expand Up @@ -227,6 +234,55 @@ def get_user_level_summary_statistics_features(self) -> None:
on=[self.conversation_id_col, self.speaker_id_col],
how="inner"
)

if self.user_aggregation == True:
# For each summarizable feature
for column in self.columns_to_summarize:

# Average/Mean of feature across the User
if 'mean' in self.user_methods:
self.user_data = pd.merge(
left=self.user_data,
right=get_user_mean_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col),
on=[self.conversation_id_col, self.speaker_id_col],
how="inner"
)

# Maxima for the feature across the User
if 'max' in self.user_methods:
self.user_data = pd.merge(
left=self.user_data,
right=get_user_max_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col),
on=[self.conversation_id_col, self.speaker_id_col],
how="inner"
)

# Minima for the feature across the User
if 'min' in self.user_methods:
self.user_data = pd.merge(
left=self.user_data,
right=get_user_min_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col),
on=[self.conversation_id_col, self.speaker_id_col],
how="inner"
)

# Standard Deviation of feature across the User
if 'stdev' in self.user_methods:
self.user_data = pd.merge(
left=self.user_data,
right=get_user_stdev_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col),
on=[self.conversation_id_col, self.speaker_id_col],
how="inner"
)

# Median of feature across the User
if 'median' in self.user_methods:
self.user_data = pd.merge(
left=self.user_data,
right=get_user_median_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col),
on=[self.conversation_id_col, self.speaker_id_col],
how="inner"
)

def get_user_level_summed_features(self) -> None:
"""
Expand All @@ -244,7 +300,10 @@ def get_user_level_summed_features(self) -> None:
:rtype: None
"""


# For each summarizable feature
for column in self.summable_columns:

for column in self.summable_columns:

# Sum of feature across the Conversation
Expand Down Expand Up @@ -278,10 +337,36 @@ def get_user_level_summed_features(self) -> None:
# how="inner"
# )

def get_user_level_mean_features(self) -> None:
# if self.user_aggregation == True:

# # For each summarizable feature
# for column in self.summable_columns:

# # Sum of feature across the Conversation
# self.user_data = pd.merge(
# left=self.user_data,
# right=get_user_sum_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col),
# on=[self.conversation_id_col, self.speaker_id_col],
# how="inner"
# )

# for column in self.columns_to_summarize: # TODO --- Gini depends on the summation happening; something is happening here where it's causing Gini to break.
# if column not in self.summable_columns:
# # Sum of feature across the Conversation
# self.user_data = pd.merge(
# left=self.user_data,
# right=get_user_sum_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col),
# on=[self.conversation_id_col, self.speaker_id_col],
# how="inner"
# )

def get_user_level_mean_features(self) -> None:
"""
Aggregate summary statistics by calculating mean user-level features from chat-level features.
Aggregate summary statistics by calculating mean user-level features from chat-level features.
This function calculates and merges the mean features into the user-level data.
This function calculates and merges the mean features into the user-level data.
:return: None
Expand All @@ -301,6 +386,20 @@ def get_user_level_mean_features(self) -> None:
how="inner"
)


if self.user_aggregation == True:
# For each summarizable feature
for column in self.columns_to_summarize:

if 'mean' in self.user_methods:
# Average/Mean of feature across the User
self.user_data = pd.merge(
left=self.user_data,
right=get_user_mean_dataframe(self.chat_data, column, self.conversation_id_col, self.speaker_id_col),
on=[self.conversation_id_col, self.speaker_id_col],
how="inner"
)


def get_centroids(self) -> None:
"""
Expand Down
Loading

0 comments on commit d007ae8

Please sign in to comment.