updates to politeness v2
Fixing style + bug with imports
kumarnik1 committed Apr 10, 2024
1 parent d5cdc65 commit 48603f1
Showing 2 changed files with 10 additions and 44 deletions.
24 changes: 6 additions & 18 deletions feature_engine/features/politeness_v2.py
@@ -1,5 +1,5 @@
import pandas as pd
from features import feature_extraction
from features.politeness_v2_helper import *

def get_politeness_v2(df,on_column):
"""
@@ -9,25 +9,13 @@ def get_politeness_v2(df,on_column):
The dataframe after adding the politeness v2 features
"""

#extract the column headers by running the script for a random text. We sort the names of the features in alphabetical order.

'''
TODO --- this code should be fixed stylistically; is there a cleaner way of doing this?
'''
# # This is done because the original package sorts the features by counts. It is not possible to do so if we have a number of rows, as each row may have different counts for different features
column_headers = feature_extraction.feat_counts("hahaha",feature_extraction.kw).sort_values(by='Features')['Features'].tolist()
# Extract column headers by running script on first row; we sort feature names alphabetically
column_headers = feat_counts(df.iloc[0][on_column],kw).sort_values(by='Features')['Features'].tolist()

# Apply the function to each row in 'text_column' and store the result in a new column 'output_column'. We sort the names of the features in alphabetical order
df_output = df[on_column].apply(lambda x: feature_extraction.feat_counts(x,feature_extraction.kw).sort_values(by='Features')['Counts'])

'''
TODO -- this code breaks for me:
df_output = df[on_column].apply(lambda x: feature_extraction.feat_counts(x,feature_extraction.kw).sort_values(by='Features')['Counts'])
^^^^^^^^^^^^^^^^^^^^^
AttributeError: module 'features.feature_extraction' has no attribute 'kw'
'''
# Apply the function to each row in text dataframe and store the result in a new output dataframe
df_output = df[on_column].apply(lambda x: feat_counts(x,kw).sort_values(by='Features')['Counts'])

#add the column headers
# Add column headers
df_output.columns = column_headers

return df_output
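
A minimal usage sketch of the function above (assumptions: the package root makes the features package importable, as the module's own imports suggest, and the column name "message" is purely illustrative):

import pandas as pd
from features.politeness_v2 import get_politeness_v2

df = pd.DataFrame({"message": ["Could you please take a look?", "Fix this now."]})
features = get_politeness_v2(df, "message")  # one row per message, one column per feature
print(features.columns.tolist())             # feature names, sorted alphabetically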
30 changes: 4 additions & 26 deletions feature_engine/features/politeness_v2_helper.py
@@ -1,30 +1,24 @@
import os
import pandas as pd
#import prep
import spacy
import en_core_web_sm
import re
import numpy as np
from features import keywords
import features.keywords as keywords
import regex
import pickle
import errno

nlp = en_core_web_sm.load()
nlp.enable_pipe("senter")
kw = keywords.kw

import nltk
from nltk.corpus import stopwords
from nltk import tokenize

def sentence_split(doc):

'''
TODO --- there are quite a few randomly commented out lines here. I removed some code that seemed
obviously redundant (e.g., print statements), but I do not know why doc = nlp(text) and other commented-out
code is not being included. Can we clean this up?
'''
# doc = nlp(text)
sentences = [str(sent) for sent in doc.sents]
sentences = [' ' + prep_simple(str(s)) + ' ' for s in sentences]
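
For reference, the sentence-splitting step in isolation, a sketch assuming only the en_core_web_sm model (plain lower-casing stands in for prep_simple here):

import en_core_web_sm

nlp = en_core_web_sm.load()
doc = nlp("Thanks for the patch. Could you add a test?")
# mirror sentence_split: stringify each sentence, then pad with spaces
sentences = [' ' + str(sent).lower() + ' ' for sent in doc.sents]
print(sentences)  # [' thanks for the patch. ', ' could you add a test? ']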

@@ -52,8 +46,6 @@ def count_matches(keywords, doc):

text = sentence_pad(doc)

# print(text)

key_res = []
phrase2_count = []

@@ -138,8 +130,6 @@ def count_spacy_matches(keywords, dep_pairs):
phrase2_count = []

for key in keywords:
# print(key)

key_res.append(key)
counter = 0

@@ -179,10 +169,6 @@ def bare_command(doc):

keywords = set([' be ', ' do ', ' please ', ' have ', ' thank ', ' hang ', ' let '])

# nlp.enable_pipe("senter")
# doc = nlp(text)

# Returns first word of every sentence along with the corresponding POS
first_words = [' ' + prep_simple(str(sent[0])) + ' ' for sent in doc.sents]

POS_fw = [sent[0].tag_ for sent in doc.sents]
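
The first-word/POS extraction that bare_command builds on, sketched on toy input (tag values are spaCy's fine-grained Penn Treebank tags):

import en_core_web_sm

nlp = en_core_web_sm.load()
doc = nlp("Hold the line. We will wait.")
first_words = [str(sent[0]).lower() for sent in doc.sents]  # ['hold', 'we']
pos_fw = [sent[0].tag_ for sent in doc.sents]               # e.g. ['VB', 'PRP']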
@@ -268,7 +254,6 @@ def feat_counts(text, kw):
doc_text = nlp(text)
doc_clean_text = nlp(clean_text)

# Count key words and dependency pairs with negation
kw_matches = count_matches(kw['word_matches'], doc_text)

dep_pairs, negations = get_dep_pairs(doc_clean_text)
@@ -287,7 +272,6 @@ def feat_counts(text, kw):
scores = scores.groupby('Features').sum().sort_values(by='Counts', ascending=False)
scores = scores.reset_index()

# add remaining features
bc = bare_command(doc_text)
scores.loc[len(scores)] = ['Bare_Command', bc]
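
The score-aggregation pattern used above, standalone (feature names and counts are made up):

import pandas as pd

scores = pd.DataFrame({'Features': ['Hedges', 'Hedges', 'Gratitude'],
                       'Counts':   [1, 2, 1]})
scores = scores.groupby('Features').sum().sort_values(by='Counts', ascending=False)
scores = scores.reset_index()                  # Features: ['Hedges', 'Gratitude']
scores.loc[len(scores)] = ['Bare_Command', 0]  # append a feature computed separately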

@@ -335,8 +319,6 @@ def load_to_lists(path, words):
all_lines.append(splitLine)

feature_names.append(all_filenames[i])

# print(keywords[all_filenames[i]])
except IOError as exc:
if exc.errno != errno.EISDIR:
raise
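
The try/except idiom in load_to_lists, standalone (the file path is hypothetical):

import errno

try:
    with open('keywords/hedges.txt') as f:  # hypothetical keyword file
        lines = f.read().splitlines()
except IOError as exc:
    if exc.errno != errno.EISDIR:  # skip directories silently, re-raise real I/O errors
        raise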
@@ -442,8 +424,8 @@ def prep_simple(text):

t = text.lower()
t = clean_text(t)
t = re.sub(r"[.?!]+\ *", "", t) # spcifially replace punctuations with nothing
t = re.sub('[^A-Za-z,]', ' ', t) # all other special chracters are replaced with blanks
t = re.sub(r"[.?!]+\ *", "", t)
t = re.sub('[^A-Za-z,]', ' ', t)

return t
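
What the two substitutions do, traced on a toy string:

import re

t = "wow!! ok, fine... see #42".lower()
t = re.sub(r"[.?!]+\ *", "", t)   # drop sentence punctuation plus any following spaces
t = re.sub('[^A-Za-z,]', ' ', t)  # blank out everything except letters and commas
print(t)  # 'wowok, finesee    ' -- note the punctuation removal glues adjacent words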

@@ -464,7 +446,6 @@ def prep_whole(text):

def sentenciser(text):

#nlp = spacy.load("en_core_web_sm", exclude=["parser"])
nlp.enable_pipe("senter")

doc = nlp(text)
@@ -476,9 +457,6 @@ def punctuation_seperator(text):

def punctuation_seperator(text):

#x = tokenize.sent_tokenize(self.text)

# split string by punctuation
PUNCT_RE = regex.compile(r'(\p{Punctuation})')
split_punct = PUNCT_RE.split(text)
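
The split on its own (regex is the third-party package that supports the \p{Punctuation} property, unlike the stdlib re module):

import regex

PUNCT_RE = regex.compile(r'(\p{Punctuation})')
print(PUNCT_RE.split("Hi, there! Ready?"))
# ['Hi', ',', ' there', '!', ' Ready', '?', '']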

