Skip to content

Commit 0237b5a

Browse files
authored
Merge pull request #21 from haohaom1/haohaom1-patch-2
Add files via upload
2 parents 8dee87b + 8d0868c commit 0237b5a

File tree

6 files changed

+160
-55
lines changed

6 files changed

+160
-55
lines changed

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,14 @@ WINDOWS VERSION
2121
-FIXED BUG binds not working: set focus AFTER displaying graph
2222

2323
- Changed Scoring Metric: words now have to be at least length 3
24+
25+
26+
Version 1.3.0
27+
10-18-18
28+
29+
WINDOWS VERSION
30+
31+
- decoded email id into string data type
32+
- it will be encoded back to bytes before being sent to the IMAP server
33+
- made constituent id into int data type
34+
- Added GUI options to toggle automated email moving and to set the classification threshold

classifier.py

Lines changed: 42 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@
1111
scraper = Scraper()
1212
reader = Emailreader()
1313

14-
constituent_df = pd.read_csv(pathlib.PureWindowsPath(pathlib.Path('datasets/OrganizationRelationships_NickNamesAdded_5.24.2018.csv'))
15-
)
16-
14+
# constituent_df = pd.read_csv(pathlib.PureWindowsPath(pathlib.Path('datasets/OrganizationRelationships_NickNamesAdded_5.24.2018.csv'))
15+
# )
16+
constituent_df = 'datasets/OrganizationRelationships_NickNamesAdded_5.24.2018.csv'
1717

1818

1919
def score(df, clf, return_proba=False, remove_nan=True):
@@ -119,8 +119,6 @@ def classify_mails(mail, folder, clf=None, cap_at=None, latest_first=True, thres
119119
:return: dataframe containing UID of the emails, Scores, probability,
120120
and confidence, decision, and timestamp, constituent info, sorted by confidence,
121121
'''
122-
from datetime import datetime
123-
print('starting classify', datetime.now())
124122

125123
if not clf:
126124
clf = joblib.load('Classifiers/LR_7_30.pkl')
@@ -150,10 +148,17 @@ def classify_mails(mail, folder, clf=None, cap_at=None, latest_first=True, thres
150148

151149
df = classify_mails_from_data(mail=mail, df=scores_df, folder=folder, threshold=threshold, move=move)
152150

151+
# turn constituent_id to int
152+
df['constituent_id'] = df['constituent_id'].astype(np.int64)
153+
153154
# gets the actual words from the urls
154155
# IF TIME PERMITTED USE WORDS FROM THE START TO BE MORE EFFICIENT
155156
df['text'] = df['url'].apply(lambda x: ' '.join(scraper.get_text_from_url(x, clean=False)))
156157

158+
# documents the mail source
159+
df['folder'] = folder
160+
161+
157162
# sorts df by probability
158163
# df.sort_values(['proba'], inplace=True)
159164

@@ -166,7 +171,8 @@ def classify_mails(mail, folder, clf=None, cap_at=None, latest_first=True, thres
166171
date = datetime.strftime(datetime.now(), '%Y-%m-%d %H.%M.%S')
167172

168173
# saves to the logs
169-
windows_path = pathlib.PureWindowsPath(pathlib.Path('logs/{}_logs.csv'.format(date)))
174+
# windows_path = pathlib.PureWindowsPath(pathlib.Path('logs/{}_logs.csv'.format(date)))
175+
windows_path = 'logs/{}_logs.csv'.format(date)
170176
df.to_csv(windows_path, index=False)
171177

172178
# if to_raiser is true AND there is an available data from logs, then return the data to be
@@ -195,6 +201,8 @@ def create_csv_for_raiser(logs=None, df=None, return_merged_df=False):
195201

196202
# returns null if there are no values in the log to be moved
197203
if df.empty:
204+
if return_merged_df:
205+
return df, df
198206
return df
199207

200208
dates = datetime.strftime(datetime.now(), '%m/%d/%Y')
@@ -232,6 +240,33 @@ def get_description(fname, lname, arg):
232240

233241
if return_merged_df:
234242

235-
return df
243+
return raisers_df, df
236244

237245
return raisers_df
246+
247+
248+
def move_emails(mail, df):
    '''
    Uses the Raiser CSV to determine which emails to move to which folder.

    :param mail: authenticated IMAP connection passed through to the email reader
    :param df: dataframe with columns 'folder', 'id', 'label', and 'moved';
               'moved' arrives as the strings 'True'/'False' when read back from CSV
    :return: None (side effect: moves each email via reader.move_email_to_folder)
    '''

    # BUG FIX: Series.apply returns a NEW Series and does not modify in place.
    # The original discarded the result, so 'moved' was never converted and the
    # string 'False' (truthy) routed emails as if they had been moved.
    df['moved'] = df['moved'].apply(lambda x: x == 'True')

    for _, row in df.iterrows():
        folder = row['folder']
        # imaplib expects the UID as bytes, so encode the id back from str
        email_uid = str.encode(str(row['id']))

        # label 0 -> 'Received', label 1 -> 'Completed'; anything else, or any
        # row not flagged as moved, falls through to manual review
        if row['label'] == 0 and row['moved']:
            target_folder = 'Received'
        elif row['label'] == 1 and row['moved']:
            target_folder = 'Completed'
        else:
            target_folder = 'Further Review Needed'

        reader.move_email_to_folder(mail=mail, orig_folder=folder, target_folder=target_folder, email_uid=email_uid)

emailreader.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,6 @@ def login_email(self, username=None, password=None):
3131
return mail
3232

3333

34-
# In[4]:
35-
3634
def switch_folders(self, mail, folder_name):
3735
'''
3836
Switches the folders of the email
@@ -45,7 +43,11 @@ def get_emails_from_folder(self, mail, folder_name, latest_first=True, cap_at=No
4543
params: folder_name
4644
return: pandas series containing the email uid as the index and the email_message object as the data
4745
'''
48-
mail.select('"Google Alerts/{}"'.format(folder_name))
46+
47+
if folder_name == 'INBOX':
48+
mail.select('INBOX')
49+
else:
50+
mail.select('"Google Alerts/{}"'.format(folder_name))
4951
result, data = mail.uid('search', None, "ALL") # search and return uids instead
5052

5153
ids = data[0].split()
@@ -61,9 +63,13 @@ def get_emails_from_folder(self, mail, folder_name, latest_first=True, cap_at=No
6163
raw_emails = [str(d[0][1], 'utf-8') for d in data]
6264
email_messages = [email.message_from_string(raw) for raw in raw_emails]
6365

66+
# decodes the ids into string
67+
ids = [x.decode() for x in ids]
68+
6469
df = pd.DataFrame({'mail': email_messages,
6570
'id': ids})
6671

72+
6773
print('You have {} messages in folder {}'.format(num_emails, folder_name))
6874

6975
return df

scraper.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -204,8 +204,6 @@ def scrape_words_from_urls(self, urls, split_up_links):
204204
# scrapes words from a list of urls
205205
# returns a list of list of words
206206

207-
print('starting scrape', datetime.now())
208-
209207
if split_up_links:
210208
urls = [urls]
211209

@@ -233,12 +231,10 @@ def scrape_words_from_urls(self, urls, split_up_links):
233231
except (URLError, HTTPError, KeyboardInterrupt) as error:
234232
warnings.warn('Unable to load {}'.format(url))
235233
print(error)
236-
print('reached', datetime.now())
237234
if split_up_links:
238235
return None
239236
list_of_words.append('')
240237

241-
print('end scrape', datetime.now())
242238
return list_of_words
243239

244240

test.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,23 @@
1616
import os
1717
import warnings
1818
import webbrowser
19-
# nltk.download('punkt')
20-
#%%
19+
##%%
2120
scraper = Scraper()
2221
reader = Emailreader()
2322
username = 'prospectstudent@colby.edu'
2423
password = 'Student.2017'
2524
mail = reader.login_email(username, password)
25+
#%%
26+
a = 'b\'123'
27+
str.encode(a)
28+
print(a)
29+
a.decode()
30+
31+
#%%
32+
mail.list()
33+
mail.select('INBOX')
34+
#%%
35+
df['id'].apply(lambda s: s.decode())
2636

2737
#%%
2838
# x = zip(np.random.choice(a=[1], size=10), np.random.randn(10))

0 commit comments

Comments
 (0)