-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmbox_to_graph.py
68 lines (54 loc) · 1.89 KB
/
mbox_to_graph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import mailbox
import re
import time
import datetime
import email.utils
import pdb
import math
import logging
def make_graph(mbox, year):
    """Build a weighted, directed "who followed whom" graph for one year.

    Messages are grouped into threads by their subject line after leading
    list tags ("[...]"), whitespace, "Re:" and "Fwd:" prefixes are stripped.
    Every message is paired with *every* message already seen in the same
    thread (not only the one it directly replies to).  Each pair adds
    weight to the edge ``graph[later_sender][earlier_sender]``, where the
    amount is computed by ``new_graph_weight`` from the time gap between
    the two messages.  A sender who posts twice in a thread therefore also
    gets a self-edge.

    Args:
        mbox: Either a path to an mbox file (``str``) or any iterable of
            message objects supporting ``msg["Header"]`` lookup.
        year: The year to keep (anything ``int()`` accepts).  The year is
            read from the Date header as written, i.e. in the sender's own
            UTC offset.

    Returns:
        A ``dict`` mapping the later sender's address to a ``dict`` mapping
        the earlier sender's address to the accumulated edge weight.

    Messages whose subject, date or sender cannot be parsed are logged and
    skipped; they never abort the run.
    """
    if isinstance(mbox, str):
        mbox = mailbox.mbox(mbox)
    year = int(year)

    # Raw string so the backslashes reach the regex engine untouched;
    # compiled once because it is applied to every message.
    prefix_pattern = re.compile(r"^(\[.+\]|\s|Re:|Fwd:)+")

    graph = {}
    mailing_threads = {}

    for msg in mbox:
        subject = None
        try:
            subject = msg["Subject"]
            thread = prefix_pattern.sub("", subject)
        except Exception:
            # Missing or non-string subject: skipping is the intended
            # best-effort behaviour.  ``Exception`` (rather than a bare
            # ``except``) lets KeyboardInterrupt/SystemExit propagate.
            logging.error("Could not parse subject %s", subject)
            continue

        raw_date = raw_from = None
        try:
            raw_date = msg["Date"]
            raw_from = msg["From"]
            # parsedate_to_datetime keeps the UTC offset of the header, so
            # gaps between senders in different time zones are measured
            # correctly (time.mktime on parsedate() drops the offset).
            sent_at = email.utils.parsedate_to_datetime(raw_date)
            sender = email.utils.parseaddr(raw_from)[1]
            if not sender:
                raise ValueError("empty sender address")
        except Exception:
            # Lazy %-formatting copes with headers that are None; string
            # concatenation here used to raise inside the handler.
            logging.error(
                "Could not parse date or email: %s, %s", raw_date, raw_from
            )
            continue

        # A "-0000" offset yields a naive datetime; treat it as UTC so it
        # can be compared with the timezone-aware ones.
        if sent_at.tzinfo is None:
            sent_at = sent_at.replace(tzinfo=datetime.timezone.utc)

        # Drop messages from any other year.
        if sent_at.year != year:
            continue

        thread_posts = mailing_threads.setdefault(thread, [])
        for other_sender, other_time in thread_posts:
            if sent_at > other_time:
                later_email, earlier_email = sender, other_sender
                gap = sent_at - other_time
            else:
                later_email, earlier_email = other_sender, sender
                gap = other_time - sent_at

            edges = graph.setdefault(later_email, {})
            if earlier_email in edges:
                edges[earlier_email] = update_graph_weight(
                    edges[earlier_email], gap
                )
            else:
                edges[earlier_email] = new_graph_weight(gap)

        # Record this message only after pairing it with the earlier ones,
        # so it is never paired with itself.
        thread_posts.append((sender, sent_at))

    return graph
def new_graph_weight(timedelta):
    """Return the edge weight contributed by one pair of messages.

    The weight is the reciprocal of the square root of the gap in seconds,
    so shorter gaps give larger weights.

    Args:
        timedelta: The gap between the two messages; only its
            ``total_seconds()`` method is used.

    Returns:
        ``1 / sqrt(seconds)`` as a float.  Gaps shorter than one second
        (including zero and negative gaps) are logged as an error and
        treated as exactly one second, giving a weight of 1.0.
    """
    elapsed = timedelta.total_seconds()
    if elapsed < 1:
        # Clamping to one second keeps the weight finite and capped at 1.0.
        logging.error("Time difference is " + str(elapsed))
        return 1.0
    return 1 / math.sqrt(elapsed)
def update_graph_weight(old_weight, new_timedelta):
    """Add the weight of one more message pair to an existing edge weight.

    Args:
        old_weight: The weight accumulated on the edge so far.
        new_timedelta: The gap between the newly paired messages, passed
            straight to ``new_graph_weight``.

    Returns:
        ``old_weight`` plus the weight ``new_graph_weight`` computes for
        ``new_timedelta``.
    """
    increment = new_graph_weight(new_timedelta)
    return old_weight + increment