Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ def move_emails(mail, df):
'''

# converts str to boolean
df['moved'].apply(lambda x: x == 'True')
df['moved_bool'] = df['moved'].apply(lambda x: x == 'True')

for _, row in df.iterrows():
folder = row['folder']
Expand All @@ -269,4 +269,6 @@ def move_emails(mail, df):
# byte_id = str.encode('1779')
# print(byte_id, type(byte_id))

### COMMENT OUT the line below to prevent emails from being moved
reader.move_email_to_folder(mail=mail, orig_folder=folder, target_folder=target_folder, email_uid=email_uid)
print('moved emails')
3 changes: 3 additions & 0 deletions emailreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,9 @@ def get_links(self, email_df):
# Method for moving an email
def move_email_to_folder(self, mail, orig_folder, target_folder, email_uid):

if not mail:
mail = self.mail

if orig_folder.lower() == 'inbox':
mail.select('"INBOX"')
else:
Expand Down
10 changes: 5 additions & 5 deletions scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,9 @@ def __init__(self):

self.num_features = 3 # determines the number of features to use

# exits the function after 60 seconds; throws keyboard interrupt error if happens.
@threading_timer.exit_after(60)
# @threading_timer.exit_after(60)
def get_text_from_url(self, url, clean=True):
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A'}
hdr = {'User-Agent': 'Mozilla/5.0'}

try:

Expand Down Expand Up @@ -212,6 +211,7 @@ def scrape_words_from_urls(self, urls, split_up_links):
# urls is a 1D list of urls
# returns a list of list of words
for url in urls:
print('url', url)

try:
words = self.get_text_from_url(url, clean=False)
Expand All @@ -228,9 +228,9 @@ def scrape_words_from_urls(self, urls, split_up_links):
return words
list_of_words.append(words)

except (URLError, HTTPError, KeyboardInterrupt) as error:
except:# (URLError, HTTPError, KeyboardInterrupt, TimeoutError) as error:
warnings.warn('Unable to load {}'.format(url))
print(error)
# print(error)
if split_up_links:
return None
list_of_words.append('')
Expand Down
44 changes: 44 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,56 @@
import os
import warnings
import webbrowser
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import requests

##%%
scraper = Scraper()
reader = Emailreader()
# credentials are read from the environment so that no password is stored in the repository
username = os.environ.get('EMAIL_USERNAME', 'prospectstudent@colby.edu')
password = os.environ['EMAIL_PASSWORD']  # raises KeyError when the variable is not set
mail = reader.login_email(username, password)
#%%
df = pd.read_csv(r'logs\2018-11-08 13.09.54_logs.csv')
df.head(15)
#%%
# print(df.columns)
classifier.move_emails(mail, df)
# reader.move_email_to_folder(mail, 'Priority Mail', 'Completed', b'1896')
#%%


url = 'http://www.ifre.com/bobs-big-bet-barclays-purchase-of-lehman-brothers-10-years-later/21356030.fullarticle'

hdr = {'User-Agent': 'Mozilla/5.0'}

# fetches the page and parses it; only request failures are treated as expected
try:
    # sends the header defined above and bounds the wait so the request cannot hang indefinitely
    page = requests.get(url, headers=hdr, timeout=10.0).text
except requests.RequestException as err:
    print('Unable to load {}: {}'.format(url, err))
else:
    soup = BeautifulSoup(page, 'lxml')  # creates a BS4 object

# try:
# r = requests.get(url, timeout=10.0)
# except requests.Timeout as err:
# print(err)
# except requests.RequestException as err:
# print(err)

#%%
import requests
MAX_RETRIES = 20

# mounts an adapter configured with max_retries=MAX_RETRIES on both schemes
session = requests.Session()
adapter = requests.adapters.HTTPAdapter(max_retries=MAX_RETRIES)
session.mount('https://', adapter)
session.mount('http://', adapter)

r = session.get(url)
# BeautifulSoup parses markup text, so pass the body of the response rather than the Response object
soup = BeautifulSoup(r.text, 'lxml')


#%%
a = 'b\'123'
str.encode(a)
Expand Down
85 changes: 54 additions & 31 deletions tkinter-skeleton.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,18 @@
'''
TO DO LIST

- - Record whether the user chose to automatically move all emails in raiser CSV function

- Fix Pathing Error in Windows (control f 'logs/')
- Debug functionalities on Windows machines, such as key binds, etc
- Add a waiting animation when Classifying Emails
- Fix the bug that prevents classifying just 1 email
- Record whether the user chose to automatically move all emails in raiser CSV function: have two booleans
move and move email: Set the column to move and only move emails if both are true
- Add arrow keys as shortcut in table
- When moving emails, concatenate emails based on their id and process each situation accordingly

IDEAS FOR FUTURE

- Fix Anti-scrape policy
- idea 1: create a separate bot that identifies whether an article is
- idea 2: rotate IP addresses, delay scrape time
- Optimize retrieving data (maybe line by line) so a crash won't lose all the data

- Add a waiting animation when Classifying Emails


TO SWITCH FROM WINDOWS AND MAC
Expand Down Expand Up @@ -316,6 +314,7 @@ def setBindings(self):
# binds control-e to switch the label of an element in the bottom table
self.bottomFrame.bind('<Control-e>', self.switchLabel)
self.bottomFrame.bind('<Control-w>', self.switchMovedState)
self.bottomFrame.bind('<Control-a>', self.moveAll)


def handleQuit(self, event=None):
Expand Down Expand Up @@ -393,11 +392,16 @@ def process_text(string, length=50, total_string_size=100):
self.tree.insert('', i, text=i, values=tuple(row), tag=shaded)

self.tree.tag_configure('even_row', background='lightgrey')
self.tree.pack(fill=tk.X, padx=5, pady=5)
self.tree.pack(fill=tk.X, side='left', expand=1)

# binds Button1 to the tree
self.tree.bind('<<TreeviewSelect>>', self.onselect)

# adds scrollbar to the tree
vsb = ttk.Scrollbar(self.bottomFrame, orient="vertical", command=self.tree.yview)
vsb.pack(side='right', fill=tk.Y)
self.tree.configure(yscrollcommand=vsb.set)

def buildScoresTable(self, curItem):
'''
passes in the current selected treeview row to display the score table
Expand Down Expand Up @@ -653,7 +657,7 @@ def handleDisplayCSV(self):
messagebox.showwarning('Warning', 'No data available to export to Raiser\'s Edge')
return

d = RaiserDialog(self.root, df=merged_df, merged_df=True, title='Raiser Edge CSV')
d = RaiserDialog(self.root, df=merged_df, merged_df=True, main_df=self.df, title='Raiser Edge CSV')

def handleConfidence(self):
if self.df is None:
Expand Down Expand Up @@ -783,7 +787,11 @@ def switchMovedState(self, event=None):

self.df.ix[row_num, 'moved'] = new_label # updates the new value in the locally stored dataframe

pass
# marks every email as moved
def moveAll(self, event=None):
    '''
    Sets the 'moved' column of the locally stored dataframe to 'True' for every row,
    then rebuilds the bottom table.

    :param event: the event passed in by the key binding, optional (unused)
    '''
    # nothing to mark when no dataframe is stored yet
    if self.df is None:
        return

    # the column is written as the string 'True', not a boolean
    self.df['moved'] = 'True'
    self.buildBottomTable()


def exportEmails(self, event=None):
'''
Expand Down Expand Up @@ -842,11 +850,12 @@ def cancel(self, event=None):
# dialog box for displaying CSVs
class RaiserDialog(simpledialog.Dialog):

def __init__(self, parent, csv_path=None, df=None, merged_df=False, title='Title'):
def __init__(self, parent, csv_path=None, df=None, main_df=None, merged_df=False, title='Title'):
'''

:param parent: parent widget
:param csv_path: the csv path of the raiser_df, optional
:param main_df: the entire dataframe, containing both received and completed data
:param df: the actual raiser df object
:param merged_df: whether or not the passed raiser_df is the merged version, or has already been processed
:param title: title of this widget
Expand All @@ -857,55 +866,69 @@ def __init__(self, parent, csv_path=None, df=None, merged_df=False, title='Title
value=preset_path,
name='pathVar') # path of raiser file to be exported

self.main_df = main_df # df containing both received and completed data

if df is None:
self.df = pd.read_csv(csv_path)
else:
self.df = df

if merged_df:
self.merged_df = df

raiser_headers = ['constituent_id', 'date', 'move/status change (or n/a)', 'type',
'author', 'description', 'text']
self.df = self.df.loc[:, raiser_headers].reset_index(drop=True)


simpledialog.Dialog.__init__(self, parent, title)

def body(self, master):
## Builds the Table to display the CSV

# builds two different frames
top_frame = tk.Frame(master=master)
top_frame = tk.Frame(master=master) # used to contain the table
top_frame.pack(side=tk.TOP, fill=tk.X)
bottom_frame = tk.Frame(master=master)
bottom_frame = tk.Frame(master=master) # used to contain the path
bottom_frame.pack(side=tk.TOP)
error_frame = tk.Frame(master=master)
error_frame.pack(side=tk.BOTTOM)

values = self.df.values

# builds column and row names
for c, col_name in enumerate(self.df.columns, start=1):
tk.Label(top_frame, text=col_name).grid(row=0, column=c)
# preprocesses the dataframe for displaying
df = self.df.rename(index=int, columns={'constituent_id': 'id'}) # make the columns shorter
df['text'] = self.df['text'].apply(lambda x: x[:min(len(x), 150)]) # limits the characters in the text column

for r, index_name in enumerate(self.df.index, start=1):
tk.Label(top_frame, text=index_name).grid(row=r, column=0)
# # builds column and row names
tree = ttk.Treeview(top_frame)
num_col = len(df.columns)
tree['columns'] = tuple(range(num_col))
for i, col in enumerate(self.df.columns):
# give extra width to text
width = 300 if col == 'text' else 110
stretch = col == 'text'

# adds the values
tree.column(i, width=width, minwidth=0, stretch=stretch)
tree.heading(i, text=col)

for i, val_row in enumerate(values, start=1):
for j, val in enumerate(val_row, start=1):
# builds the contents of the table
for i, row in df.iterrows():
shaded = 'even_row' if i % 2 == 0 else 'odd_row'
tree.insert('', i, text=i, values=tuple(row), tag=shaded)

tree.tag_configure('even_row', background='lightgrey')
tree.pack(fill=tk.X, side='left', expand=1)

# reformat the string if it is the text column
if isinstance(val, str):
val = val[:40] + '...'
tk.Label(top_frame, text=val).grid(row=i, column=j)
# adds scrollbar to the tree
vsb = ttk.Scrollbar(top_frame, orient="vertical", command=tree.yview)
vsb.pack(side='right', fill=tk.Y)
tree.configure(yscrollcommand=vsb.set)

# makes the index column smaller
tree.column("#0", width=50)

label = tk.Label(bottom_frame, text='Path: ')
label.pack(side=tk.LEFT)


e = tk.Entry(bottom_frame, textvariable=self.path, exportselection=0, width=35)
e.pack(side=tk.LEFT)

Expand Down Expand Up @@ -954,9 +977,9 @@ def apply(self):
print('downloaded at ', self.path.get())

# moves the emails as well
classifier.move_emails(mail=mail, df=self.merged_df)
print('moved emails')

classifier.move_emails(mail=mail, df=self.main_df)
# print(self.main_df)
print('finished moving emails')

class ClassifyDialog(simpledialog.Dialog):

Expand Down