diff --git a/feature_engine/featurize.py b/feature_engine/featurize.py index c9ed1dcd..52421316 100644 --- a/feature_engine/featurize.py +++ b/feature_engine/featurize.py @@ -28,7 +28,7 @@ # ) # feature_builder.featurize(col="message") - # # Tiny multi-task + # Tiny multi-task # tiny_multi_task_feature_builder = FeatureBuilder( # input_file_path = "../feature_engine/tpm-data/cleaned_data/test_data/multi_task_TINY.csv", # vector_directory = "../feature_engine/tpm-data/vector_data/", diff --git a/feature_engine/utils/preprocess.py b/feature_engine/utils/preprocess.py index 663e4fdd..0f8ded8b 100644 --- a/feature_engine/utils/preprocess.py +++ b/feature_engine/utils/preprocess.py @@ -25,6 +25,10 @@ def assert_key_columns_present(df): # Assert that key columns are present if {'conversation_num', 'message', 'speaker_nickname'}.issubset(df.columns): print("Confirmed that data has `conversation_num`, `message`, and `speaker_nickname` columns!") + # ensure no NA's in essential columns + df['message'] = df['message'].fillna('') + df['conversation_num'] = df['conversation_num'].fillna(0) + df['speaker_nickname'] = df['speaker_nickname'].fillna(0) else: print("One of `conversation_num`, `message`, or `speaker_nickname` is missing! Raising error...") print("Columns available: ")