This repository has been archived by the owner on Jun 13, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.py
113 lines (88 loc) · 3.74 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
Simple script to crawl through user's MS Exchange inbox to download attachments from emails
"""
import os
import errno
from datetime import datetime, timedelta
from getpass import getpass
import pytz
from exchangelib import (
DELEGATE, Account, Configuration, EWSDateTime, FileAttachment,
ItemAttachment, Message, ServiceAccount
)
from config import (
ALLOWED_EXTENSIONS, DOWNLOAD_ATTACHED_EMAILS,
DOWNLOAD_ROOT_PATH, OUTGOING_SERVER, RANGE_IN_SECONDS,
TIMEZONE
)
# Allowed attachment extensions: the comma-separated ALLOWED_EXTENSIONS config
# string split into entries, each with surrounding whitespace removed.
PARSED_EXTENSIONS = [entry.strip() for entry in ALLOWED_EXTENSIONS.split(",")]
def get_user_login():
    """Prompt on the console for the Exchange login details.

    The username and password prompts repeat until a non-empty value is
    entered; the email prompt is shown once and may be left blank.

    Returns:
        A ``(username, email, password)`` tuple. ``username`` and ``email``
        have surrounding whitespace removed; ``password`` is returned exactly
        as typed.
    """
    username = ""
    # Strip inside the loop so a whitespace-only answer is asked again instead
    # of being accepted and then collapsing to an empty username.
    while not username:
        username = input("Enter username: ").strip()
    email = input("Enter email (optional if username is email): ").strip()
    password = ""
    while not password:
        password = getpass("Enter password: ")
    return (username, email, password)
def login(username, email, password):
    """Build the MS Exchange account object for the given credentials.

    Args:
        username: Login name used for the credentials.
        email: Primary SMTP address; when empty, ``username`` is used instead.
        password: Password used for the credentials.

    Returns:
        The ``Account`` created against the configured ``OUTGOING_SERVER``
        with autodiscover disabled and delegate access.
    """
    print("Logging in...")
    # Fall back to the username when no separate email address was supplied.
    smtp_address = email if email else username
    # Credentials use the fault tolerant ServiceAccount type.
    server_config = Configuration(
        server=OUTGOING_SERVER,
        credentials=ServiceAccount(username=username, password=password),
    )
    account = Account(
        primary_smtp_address=smtp_address,
        autodiscover=False,
        config=server_config,
        access_type=DELEGATE,
    )
    print("Login successful.")
    return account
def check_directories(path):
    """Create the parent directories of *path* if they do not exist yet.

    Args:
        path: Path of a file that is about to be written. Only its directory
            part is created; the file itself is not touched.

    Raises:
        OSError: If the directories cannot be created for any reason other
            than already existing.
    """
    directory = os.path.dirname(path)
    if not directory:
        # A bare file name has no parent to create; os.makedirs("") would
        # raise FileNotFoundError.
        return
    try:
        # Attempt the creation directly instead of checking first, so a
        # directory created concurrently between check and creation is harmless.
        os.makedirs(directory)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise
def is_valid_extension(filename, allowed_extensions=None):
    """Return whether *filename* ends in one of the allowed extensions.

    The extension is the text after the last ``.`` and is compared without
    regard to case. Names that are empty, ``None``, contain no dot, or end in
    a dot have no extension and are rejected.

    Args:
        filename: File name to check.
        allowed_extensions: Iterable of extensions without the leading dot.
            Defaults to the module-level ``PARSED_EXTENSIONS``.

    Returns:
        ``True`` if the extension is allowed, ``False`` otherwise.
    """
    if not filename:
        return False
    # rpartition reports whether a dot was found at all; split(".")[-1] would
    # return the whole name for "README" and treat it as an extension.
    _, dot, ext = filename.rpartition(".")
    if not dot or not ext:
        return False
    if allowed_extensions is None:
        allowed_extensions = PARSED_EXTENSIONS
    wanted = ext.lower()
    return any(wanted == candidate.lower() for candidate in allowed_extensions)
def _safe_filename(name, fallback):
    """Reduce *name* to a single path component, or return *fallback* if nothing usable remains."""
    cleaned = str(name or "").replace("/", "_").replace("\\", "_").strip()
    if cleaned in ("", ".", ".."):
        return fallback
    return cleaned


def get_attachments(inbox):
    """Download attachments of recently received inbox items to disk.

    Items received on or after the computed start date are scanned. File
    attachments with an allowed extension are written as-is; attached emails
    are written (body only) when ``DOWNLOAD_ATTACHED_EMAILS`` is enabled.
    Each file is stored under
    ``DOWNLOAD_ROOT_PATH/<received timestamp>/<attachment name or subject>``
    and a line is printed for every saved file.

    Args:
        inbox: The account's inbox folder; it must support
            ``filter(datetime_received__gte=...)``.
    """
    tz = pytz.timezone(TIMEZONE)
    # Take "now" in the configured timezone so the calendar day matches the
    # timezone the start date is localized to below.
    window_start = datetime.now(tz) - timedelta(seconds=RANGE_IN_SECONDS)
    # The start is midnight of the day after the window start. Adding a
    # timedelta (rather than `day + 1`) rolls over month and year ends instead
    # of raising ValueError on the last day of a month.
    first_day = window_start + timedelta(days=1)
    ews_start_date = tz.localize(
        EWSDateTime(first_day.year, first_day.month, first_day.day)
    )
    print("Retrieving attachments from {0}...".format(ews_start_date))
    # Iterate the filtered queryset, not inbox.all(), so the date window
    # actually applies.
    recent_items = inbox.filter(datetime_received__gte=ews_start_date)
    for item in recent_items:
        formatted_datetime = item.datetime_received.strftime("%Y-%m-%d-%H-%M-%S")
        target_dir = os.path.join(DOWNLOAD_ROOT_PATH, formatted_datetime)
        for attachment in item.attachments:
            if (
                isinstance(attachment, FileAttachment)
                and attachment.name
                and is_valid_extension(attachment.name)
            ):
                # Names come from the email and are untrusted: keep them to a
                # single path component so they cannot escape target_dir.
                local_path = os.path.join(
                    target_dir, _safe_filename(attachment.name, "attachment")
                )
                check_directories(local_path)
                with open(local_path, 'wb') as f:
                    f.write(attachment.content)
                print("Saved attachment to {0}".format(local_path))
            elif isinstance(attachment, ItemAttachment) and DOWNLOAD_ATTACHED_EMAILS:
                if isinstance(attachment.item, Message):
                    # The subject may be missing or contain path separators.
                    local_path = os.path.join(
                        target_dir, _safe_filename(attachment.item.subject, "no-subject")
                    )
                    check_directories(local_path)
                    body = attachment.item.body
                    # The file is opened in binary mode, so a text body must
                    # be encoded first; a missing body is written as empty.
                    if not isinstance(body, bytes):
                        body = str(body or "").encode("utf-8")
                    with open(local_path, 'wb') as f:
                        f.write(body)
                    print("Saved email to {0}".format(local_path))
                else:
                    print("Skipping..")
def run():
    """Prompt for credentials, log in, and download the inbox attachments."""
    credentials = get_user_login()
    # credentials is the (username, email, password) tuple, in login()'s order.
    account = login(*credentials)
    get_attachments(account.inbox)
# Start the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    run()