Skip to content

Commit 0237b5a

Browse files
authored
Merge pull request #21 from haohaom1/haohaom1-patch-2
Add files via upload
2 parents 8dee87b + 8d0868c commit 0237b5a

File tree

6 files changed

+160
-55
lines changed

6 files changed

+160
-55
lines changed

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,14 @@ WINDOWS VERSION
2121
-FIXED BUG binds not working: set focus AFTER displaying graph
2222

2323
- Changed Scoring Metric: words now have to be at least length 3
24+
25+
26+
Version 1.3.0
27+
10-18-18
28+
29+
WINDOWS VERSION
30+
31+
- decoded email id into string data type
32+
- it will be encoded back to bytes before being sent to the IMAP server
33+
- made constituent id into int data type
34+
- Added GUI options to toggle automated email moving and to set the classification threshold

classifier.py

Lines changed: 42 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@
1111
scraper = Scraper()
1212
reader = Emailreader()
1313

14-
constituent_df = pd.read_csv(pathlib.PureWindowsPath(pathlib.Path('datasets/OrganizationRelationships_NickNamesAdded_5.24.2018.csv'))
15-
)
16-
14+
# constituent_df = pd.read_csv(pathlib.PureWindowsPath(pathlib.Path('datasets/OrganizationRelationships_NickNamesAdded_5.24.2018.csv'))
15+
# )
16+
constituent_df = 'datasets/OrganizationRelationships_NickNamesAdded_5.24.2018.csv'
1717

1818

1919
def score(df, clf, return_proba=False, remove_nan=True):
@@ -119,8 +119,6 @@ def classify_mails(mail, folder, clf=None, cap_at=None, latest_first=True, thres
119119
:return: dataframe containing UID of the emails, Scores, probability,
120120
and confidence, decision, and timestamp, constituent info, sorted by confidence,
121121
'''
122-
from datetime import datetime
123-
print('starting classify', datetime.now())
124122

125123
if not clf:
126124
clf = joblib.load('Classifiers/LR_7_30.pkl')
@@ -150,10 +148,17 @@ def classify_mails(mail, folder, clf=None, cap_at=None, latest_first=True, thres
150148

151149
df = classify_mails_from_data(mail=mail, df=scores_df, folder=folder, threshold=threshold, move=move)
152150

151+
# turn constituent_id to int
152+
df['constituent_id'] = df['constituent_id'].astype(np.int64)
153+
153154
# gets the actual words from the urls
154155
# IF TIME PERMITTED USE WORDS FROM THE START TO BE MORE EFFICIENT
155156
df['text'] = df['url'].apply(lambda x: ' '.join(scraper.get_text_from_url(x, clean=False)))
156157

158+
# documents the mail source
159+
df['folder'] = folder
160+
161+
157162
# sorts df by probability
158163
# df.sort_values(['proba'], inplace=True)
159164

@@ -166,7 +171,8 @@ def classify_mails(mail, folder, clf=None, cap_at=None, latest_first=True, thres
166171
date = datetime.strftime(datetime.now(), '%Y-%m-%d %H.%M.%S')
167172

168173
# saves to the logs
169-
windows_path = pathlib.PureWindowsPath(pathlib.Path('logs/{}_logs.csv'.format(date)))
174+
# windows_path = pathlib.PureWindowsPath(pathlib.Path('logs/{}_logs.csv'.format(date)))
175+
windows_path = 'logs/{}_logs.csv'.format(date)
170176
df.to_csv(windows_path, index=False)
171177

172178
# if to_raiser is true AND there is an available data from logs, then return the data to be
@@ -195,6 +201,8 @@ def create_csv_for_raiser(logs=None, df=None, return_merged_df=False):
195201

196202
# returns null if there are no values in the log to be moved
197203
if df.empty:
204+
if return_merged_df:
205+
return df, df
198206
return df
199207

200208
dates = datetime.strftime(datetime.now(), '%m/%d/%Y')
@@ -232,6 +240,33 @@ def get_description(fname, lname, arg):
232240

233241
if return_merged_df:
234242

235-
return df
243+
return raisers_df, df
236244

237245
return raisers_df
246+
247+
248+
def move_emails(mail, df):
    '''
    Uses the Raiser CSV to determine which emails to move to which folder.

    :param mail: authenticated IMAP connection passed through to the email reader
    :param df: dataframe with columns 'folder', 'id', 'label', and 'moved';
               'moved' arrives as the strings 'True'/'False' when read back from CSV
    :return: None (side effect: moves each email via reader.move_email_to_folder)
    '''

    # BUG FIX: Series.apply returns a NEW Series and does not modify in place.
    # The original discarded the result, so 'moved' was never converted and the
    # string 'False' (truthy) routed emails as if they had been moved.
    df['moved'] = df['moved'].apply(lambda x: x == 'True')

    for _, row in df.iterrows():
        folder = row['folder']
        # imaplib expects the UID as bytes, so encode the id back from str
        email_uid = str.encode(str(row['id']))

        # label 0 -> 'Received', label 1 -> 'Completed'; anything else, or any
        # row not flagged as moved, falls through to manual review
        if row['label'] == 0 and row['moved']:
            target_folder = 'Received'
        elif row['label'] == 1 and row['moved']:
            target_folder = 'Completed'
        else:
            target_folder = 'Further Review Needed'

        reader.move_email_to_folder(mail=mail, orig_folder=folder, target_folder=target_folder, email_uid=email_uid)

emailreader.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,6 @@ def login_email(self, username=None, password=None):
3131
return mail
3232

3333

34-
# In[4]:
35-
3634
def switch_folders(self, mail, folder_name):
3735
'''
3836
Switches the folders of the email
@@ -45,7 +43,11 @@ def get_emails_from_folder(self, mail, folder_name, latest_first=True, cap_at=No
4543
params: folder_name
4644
return: pandas series containing the email uid as the index and the email_message object as the data
4745
'''
48-
mail.select('"Google Alerts/{}"'.format(folder_name))
46+
47+
if folder_name == 'INBOX':
48+
mail.select('INBOX')
49+
else:
50+
mail.select('"Google Alerts/{}"'.format(folder_name))
4951
result, data = mail.uid('search', None, "ALL") # search and return uids instead
5052

5153
ids = data[0].split()
@@ -61,9 +63,13 @@ def get_emails_from_folder(self, mail, folder_name, latest_first=True, cap_at=No
6163
raw_emails = [str(d[0][1], 'utf-8') for d in data]
6264
email_messages = [email.message_from_string(raw) for raw in raw_emails]
6365

66+
# decodes the ids into string
67+
ids = [x.decode() for x in ids]
68+
6469
df = pd.DataFrame({'mail': email_messages,
6570
'id': ids})
6671

72+
6773
print('You have {} messages in folder {}'.format(num_emails, folder_name))
6874

6975
return df

scraper.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -204,8 +204,6 @@ def scrape_words_from_urls(self, urls, split_up_links):
204204
# scrapes words from a list of urls
205205
# returns a list of list of words
206206

207-
print('starting scrape', datetime.now())
208-
209207
if split_up_links:
210208
urls = [urls]
211209

@@ -233,12 +231,10 @@ def scrape_words_from_urls(self, urls, split_up_links):
233231
except (URLError, HTTPError, KeyboardInterrupt) as error:
234232
warnings.warn('Unable to load {}'.format(url))
235233
print(error)
236-
print('reached', datetime.now())
237234
if split_up_links:
238235
return None
239236
list_of_words.append('')
240237

241-
print('end scrape', datetime.now())
242238
return list_of_words
243239

244240

test.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,23 @@
1616
import os
1717
import warnings
1818
import webbrowser
19-
# nltk.download('punkt')
20-
#%%
19+
##%%
2120
scraper = Scraper()
2221
reader = Emailreader()
2322
username = 'prospectstudent@colby.edu'
2423
password = 'Student.2017'
2524
mail = reader.login_email(username, password)
25+
#%%
26+
a = 'b\'123'
27+
str.encode(a)
28+
print(a)
29+
a.decode()
30+
31+
#%%
32+
mail.list()
33+
mail.select('INBOX')
34+
#%%
35+
df['id'].apply(lambda s: s.decode())
2636

2737
#%%
2838
# x = zip(np.random.choice(a=[1], size=10), np.random.randn(10))

0 commit comments

Comments
 (0)