-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathemail_parser.py
More file actions
54 lines (49 loc) · 2.05 KB
/
email_parser.py
File metadata and controls
54 lines (49 loc) · 2.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import email
import os
import html2text
# Work relative to the corpus folder, so that relative paths used later in
# this script resolve inside it.
CORPUS_DIR = 'C:/Users/Ernest/Desktop/corpus'
os.chdir(CORPUS_DIR)

# Shared HTML-to-text converter.  The very large body_width is presumably
# meant to keep html2text from hard-wrapping long lines -- confirm against
# the html2text documentation.
h2t = html2text.HTML2Text()
h2t.body_width = 20000
def _decode_payload(part):
    """Return the transfer-decoded body of *part* as text.

    UTF-8 is tried first; GBK is the fallback.  A UnicodeDecodeError from the
    GBK attempt propagates to the caller.
    """
    raw = part.get_payload(decode=True)
    try:
        return str(raw, encoding='utf-8')
    except UnicodeDecodeError:
        return str(raw, encoding='gbk')


def text_parser(path):
    """Extract the body text of the e-mail file at *path*.

    The message parts are visited in ``walk()`` order and the first part whose
    content subtype is ``plain`` or ``html`` decides the result:

    * ``plain`` -- the "sent from my NetEase mail tablet edition" signature
      is removed, everything from the first ``----------------`` separator
      onwards is dropped, and the remainder is stripped.
    * ``html`` -- the markup is converted with the module-level ``h2t``
      converter, the result is stripped and every space character is removed.

    Args:
        path: Path of the ``.eml`` file to read.

    Returns:
        The extracted text, or ``None`` when the message contains neither a
        ``plain`` nor an ``html`` part.

    Raises:
        UnicodeDecodeError: If the selected part is neither valid UTF-8 nor
            valid GBK.
    """
    # Parse from bytes: reading the file in text mode would decode it with the
    # platform's default encoding and corrupt 8bit non-ASCII bodies before the
    # charset handling below ever sees them.
    with open(path, 'rb') as eml:
        message = email.message_from_binary_file(eml)

    # NOTE: no content-type guard here.  Comparing the bound method
    # ``get_content_type`` with 'mixed' is always unequal, so such a guard
    # never filters anything; every message is processed.
    for part in message.walk():
        subtype = part.get_content_subtype()
        if subtype == 'plain':
            text = _decode_payload(part)
            text = text.replace('--\n发自我的网易邮箱平板适配版', '')
            # Keep only what precedes the first dashed separator line.
            text = text.split('----------------')[0]
            return text.strip()
        if subtype == 'html':
            text = h2t.handle(_decode_payload(part))
            text = text.strip()
            return text.replace(' ', '')
    return None
# Convert every .eml file below the current directory into a .txt file that
# sits next to it, printing a per-directory progress percentage as it goes.
for root, dirs, files in os.walk("."):
    total = len(files)
    for position, file_name in enumerate(files, start=1):
        if file_name.endswith('.eml'):
            path = os.path.join(root, file_name)
            try:
                text = text_parser(path)
                text = text.replace(u'\u202f', '')
                text = text.replace('\n ', '')
                # Drop empty lines.
                lines = [line for line in text.splitlines() if line]
                # Keep everything up to and including the first editor
                # credit line; stop at the first match so later matches
                # cannot trigger a failed lookup on the shortened list.
                for cut, line in enumerate(lines, start=1):
                    if 'Editor:' in line or 'Editors:' in line:
                        lines = lines[:cut]
                        break
                # Strip only the trailing '.eml'; a plain str.replace would
                # also remove '.eml' occurring elsewhere in the path.
                # NOTE(review): the output uses the platform default
                # encoding, as before -- confirm whether UTF-8 is wanted.
                with open(path[:-len('.eml')] + '.txt', 'w') as txt:
                    txt.write('\n'.join(lines))
            except Exception as exc:
                # Report the failing file together with the cause, then stop
                # processing the rest of this directory (existing behavior).
                print('fuck:', path, repr(exc))
                break
        # No round() here: rounding to an integer first would make the two
        # decimals requested by the format always read ".00".
        print('\r', '已完成:{:.2f}%'.format(position * 100 / total), end='', flush=True)