diff --git a/feature_engine/features/politeness_v2.py b/feature_engine/features/politeness_v2.py
index 8b9f85da..a2c90190 100644
--- a/feature_engine/features/politeness_v2.py
+++ b/feature_engine/features/politeness_v2.py
@@ -1,5 +1,5 @@
 import pandas as pd
-from features import feature_extraction
+from features.politeness_v2_helper import *

 def get_politeness_v2(df,on_column):
     """
@@ -9,25 +9,13 @@ def get_politeness_v2(df,on_column):
         The dataframe after adding the politness v2 features
     """

-    #extract the column headers by running the script for a random text. We sort the names of the features in alphabetical order.
-
-    '''
-    TODO --- this code should be fixed stylistically; is there a cleaner way of doing this?
-    '''
-    # # This is done because the original package sorts the features by counts. It is not possible do so if we have a number of rows, as each row may have different counts for different features
-    column_headers = feature_extraction.feat_counts("hahaha",feature_extraction.kw).sort_values(by='Features')['Features'].tolist()
+    # Extract column headers by running script on first row; we sort feature names alphabetically
+    column_headers = feat_counts(df.iloc[0][on_column],kw).sort_values(by='Features')['Features'].tolist()

-    # Apply the function to each row in 'text_column' and store the result in a new column 'output_column'. We sort the names of the features in alphabetical order
-    df_output = df[on_column].apply(lambda x: feature_extraction.feat_counts(x,feature_extraction.kw).sort_values(by='Features')['Counts'])
-
-    '''
-    TODO -- this code breaks for me:
-    df_output = df[on_column].apply(lambda x: feature_extraction.feat_counts(x,feature_extraction.kw).sort_values(by='Features')['Counts'])
-                                               ^^^^^^^^^^^^^^^^^^^^^
-    AttributeError: module 'features.feature_extraction' has no attribute 'kw'
-    '''
+    # Apply the function to each row in text dataframe and store the result in a new output dataframe
+    df_output = df[on_column].apply(lambda x: feat_counts(x,kw).sort_values(by='Features')['Counts'])

-    #add the column headers
+    # Add column headers
     df_output.columns = column_headers

     return df_output
diff --git a/feature_engine/features/feature_extraction.py b/feature_engine/features/politeness_v2_helper.py
similarity index 92%
rename from feature_engine/features/feature_extraction.py
rename to feature_engine/features/politeness_v2_helper.py
index 839af3a6..31522735 100644
--- a/feature_engine/features/feature_extraction.py
+++ b/feature_engine/features/politeness_v2_helper.py
@@ -1,17 +1,17 @@
 import os
 import pandas as pd
-#import prep
 import spacy
 import en_core_web_sm
 import re
 import numpy as np
-from features import keywords
+import features.keywords as keywords
 import regex
 import pickle
 import errno

 nlp = en_core_web_sm.load()
 nlp.enable_pipe("senter")
+kw = keywords.kw

 import nltk
 from nltk.corpus import stopwords
@@ -19,12 +19,6 @@

 def sentence_split(doc):

-    '''
-    TODO --- there are quite a few randomly commented out lines here. I removed some code that seemed
-    obviously redunant (e.g., print statements), but I do not know why doc = nlp(text) and other commented-out
-    code is not being included. Can we clean this up?
-    '''
-    # doc = nlp(text)
     sentences = [str(sent) for sent in doc.sents]
     sentences = [' ' + prep_simple(str(s)) + ' ' for s in sentences]
@@ -52,8 +46,6 @@ def count_matches(keywords, doc):

     text = sentence_pad(doc)

-    # print(text)
-
     key_res = []
     phrase2_count = []
@@ -138,8 +130,6 @@ def count_spacy_matches(keywords, dep_pairs):
     phrase2_count = []

     for key in keywords:
-        # print(key)
-
         key_res.append(key)
         counter = 0
@@ -179,10 +169,6 @@ def bare_command(doc):

     keywords = set([' be ', ' do ', ' please ', ' have ', ' thank ', ' hang ', ' let '])

-    # nlp.enable_pipe("senter")
-    # doc = nlp(text)
-
-    # Returns first word of every sentence along with the corresponding POS
     first_words = [' ' + prep_simple(str(sent[0])) + ' ' for sent in doc.sents]
     POS_fw = [sent[0].tag_ for sent in doc.sents]
@@ -268,7 +254,6 @@ def feat_counts(text, kw):
     doc_text = nlp(text)
     doc_clean_text = nlp(clean_text)

-    # Count key words and dependency pairs with negation
     kw_matches = count_matches(kw['word_matches'], doc_text)
     dep_pairs, negations = get_dep_pairs(doc_clean_text)
@@ -287,7+272,6 @@ def feat_counts(text, kw):
     scores = scores.groupby('Features').sum().sort_values(by='Counts', ascending=False)
     scores = scores.reset_index()

-    # add remaining features
     bc = bare_command(doc_text)
     scores.loc[len(scores)] = ['Bare_Command', bc]
@@ -335,8 +319,6 @@ def load_to_lists(path, words):
                 all_lines.append(splitLine)
                 feature_names.append(all_filenames[i])
-
-                # print(keywords[all_filenames[i]])
         except IOError as exc:
             if exc.errno != errno.EISDIR:
                 raise
@@ -442,8 +424,8 @@ def prep_simple(text):

     t = text.lower()
     t = clean_text(t)
-    t = re.sub(r"[.?!]+\ *", "", t) # spcifially replace punctuations with nothing
-    t = re.sub('[^A-Za-z,]', ' ', t) # all other special chracters are replaced with blanks
+    t = re.sub(r"[.?!]+\ *", "", t)
+    t = re.sub('[^A-Za-z,]', ' ', t)

     return t
@@ -464,7 +446,6 @@ def prep_whole(text):

 def sentenciser(text):

-    #nlp = spacy.load("en_core_web_sm", exclude=["parser"])
     nlp.enable_pipe("senter")
     doc = nlp(text)
@@ -476,9 +457,6 @@

 def punctuation_seperator(text):

-    #x = tokenize.sent_tokenize(self.text)
-
-    # split string by punctuation
     PUNCT_RE = regex.compile(r'(\p{Punctuation})')
     split_punct = PUNCT_RE.split(text)
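
The repeated sort_values(by='Features') is the heart of the politeness_v2 change: feat_counts orders its output by counts, which differ from row to row, so only a sort by feature name gives every row the same column order. A minimal sketch of that idea, using made-up feature names rather than the real keyword lists:

    import pandas as pd

    # Hypothetical per-row outputs of feat_counts: same features, but ordered
    # by their counts, so the raw order cannot serve as shared column headers.
    row_a = pd.DataFrame({'Features': ['Please', 'Gratitude'], 'Counts': [3, 1]})
    row_b = pd.DataFrame({'Features': ['Gratitude', 'Please'], 'Counts': [2, 0]})

    # Sorting by feature name yields one stable order shared by all rows.
    for row in (row_a, row_b):
        print(row.sort_values(by='Features')['Counts'].tolist())
    # -> [1, 3] and [2, 0], both in (Gratitude, Please) order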
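
For reference, a sketch of how the refactored get_politeness_v2 would be called; the import path follows the repository layout above, while the dataframe and its 'message' column are illustrative assumptions:

    import pandas as pd

    from features.politeness_v2 import get_politeness_v2

    # Hypothetical input: one text per row ('message' is an assumed column name).
    chat_df = pd.DataFrame({'message': ['Could you please take a look?',
                                        'Do it now.']})

    # One row per input text, one alphabetically ordered column per politeness
    # feature (e.g., Bare_Command), assigned from column_headers in the diff above.
    features_df = get_politeness_v2(chat_df, 'message')
    print(features_df.head())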