From 006bb42ed00eefea98d2b062f737290bbc2c0a8f Mon Sep 17 00:00:00 2001 From: zhouhelena Date: Fri, 26 Apr 2024 00:17:53 -0400 Subject: [PATCH] encoding & vect/bert paths --- feature_engine/feature_builder.py | 7 ++++--- feature_engine/featurize.py | 11 +++++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/feature_engine/feature_builder.py b/feature_engine/feature_builder.py index 4a32be2e..43518988 100644 --- a/feature_engine/feature_builder.py +++ b/feature_engine/feature_builder.py @@ -18,6 +18,7 @@ import re import numpy as np from pathlib import Path +import time # Imports from feature files and classes # from utils.summarize_chat_level_features import * @@ -92,9 +93,9 @@ def __init__( df_type = df_type + "/cumulative/within_task/" df_type = df_type + "/cumulative/" - file_name = re.findall("\/([^\/]+)$", self.input_file_path)[0] - self.vect_path = self.vector_directory + "sentence/" + df_type + "/"+ file_name - self.bert_path = self.vector_directory + "sentiment/" + df_type + "/"+ file_name + base_file_name = f"features_{int(time.time())}.csv" + self.vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name + self.bert_path = vector_directory + "sentiment/" + ("turns" if self.turns else "chats") + "/" + base_file_name self.output_file_path_chat_level = re.sub('chat', 'turn', output_file_path_chat_level) if self.turns else output_file_path_chat_level # Check + generate embeddings diff --git a/feature_engine/featurize.py b/feature_engine/featurize.py index 48e4a719..b2edf666 100644 --- a/feature_engine/featurize.py +++ b/feature_engine/featurize.py @@ -8,11 +8,18 @@ # Importing the Feature Generating Class from feature_builder import FeatureBuilder import pandas as pd +import chardet # Main Function if __name__ == "__main__": - chat_df = pd.read_csv("../feature_engine/testing/data/cleaned_data/test_chat_level.csv") - conv_df = pd.read_csv("../feature_engine/testing/data/cleaned_data/test_conv_level.csv") + with open("../feature_engine/testing/data/cleaned_data/test_chat_level.csv", 'rb') as file: + chat_encoding = chardet.detect(file.read()) + + with open("../feature_engine/testing/data/cleaned_data/test_conv_level.csv", 'rb') as file: + conv_encoding = chardet.detect(file.read()) + + chat_df = pd.read_csv("../feature_engine/testing/data/cleaned_data/test_chat_level.csv", encoding=chat_encoding['encoding']) + conv_df = pd.read_csv("../feature_engine/testing/data/cleaned_data/test_conv_level.csv", encoding=conv_encoding['encoding']) # Instantiating the Feature Generating Class # Calling the "engine"/"driver" function of the FeatureBuilder class