This repository has been archived by the owner on Mar 16, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathmail_ingestor.py
executable file
·284 lines (248 loc) · 11.4 KB
/
mail_ingestor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
This script browses a folder and its subfolders, looking for plain-text files which look like mails.
It then ingests the files found and stores them in a single mbox file.
The non-mail files found are considered attachments and are included in the ingested mails.
All ingested files are removed, leaving only the files whose fate the script was not able to decide.
In case of indecision, the script opens a PDB prompt.
This script is full of cases hard-coded for my special needs. These are easy to spot; feel free to remove them to fit your needs.
"""
import os
import time
import shutil
import stat
import re
import mailbox
import subprocess
import mimetypes
from datetime import datetime
import email.utils
from email.parser import Parser
from email.MIMEMultipart import MIMEMultipart
from email.MIMEBase import MIMEBase
from email import Encoders
from email.MIMEText import MIMEText
## Configuration
# Root folder walked by the script when looking for mails and attachments.
ARCHIVE_FOLDER = './Mail'
# Path of the mbox file every ingested mail is added to.
MBOX_FILE = './export.mbox'
# Files that are always treated as attachments and never parsed as mails.
# Entries are compared verbatim with os.path.join(<walked folder>, <file name>),
# so they must be spelled with the same prefix as ARCHIVE_FOLDER.
FORCED_ATTACHMENT = [
    './Mail/friend1/in/content.fla',
    './Mail/friend2/in/Makefile',
]
## End of configuration
# Destination mailbox, opened once and shared by the rest of the script.
mbox = mailbox.mbox(MBOX_FILE)
def get_date(s):
    """
    Transform a string to a mail-ready date.

    The string is matched against the day-first formats ``dd/mm/yy HH:MM:SS``,
    ``dd/mm/yyyy HH:MM:SS`` and ``dd/mm/yyyy HH:MM``, in that order. Leading
    and trailing whitespace (for instance a stray carriage return left over
    from a CRLF file) is ignored.

    The parsed value is interpreted as local time and rendered with
    ``email.utils.formatdate(..., localtime=True)``.

    Returns the formatted date string, or None when the string matches none
    of the supported formats.
    """
    candidate = s.strip()
    # Order matters: the two-digit-year format must be tried before the
    # four-digit ones, exactly as the formats were originally attempted.
    for date_format in ('%d/%m/%y %H:%M:%S',
                        '%d/%m/%Y %H:%M:%S',
                        '%d/%m/%Y %H:%M'):
        try:
            parsed = datetime.strptime(candidate, date_format)
        except ValueError:
            continue
        return email.utils.formatdate(time.mktime(parsed.timetuple()), True)
    return None
# Ingestion works in passes. A pass only ingests "leaf" folders (folders
# without sub-folders); the folders it empties are removed at the end of the
# pass, which turns their parents into leaves for the next pass.
# The loop stops as soon as a pass neither ingests a mail nor removes a
# folder (a further pass would see the exact same tree), and never runs more
# than MAX_PASSES passes.
#
# print() is always called with a single parenthesised argument so that the
# output is identical under the Python 2 print statement.
import pdb

MAX_PASSES = 10

# Replacements applied, in order, to files whose headers were saved in French.
french_header_fixes = [
    ("De:\r\n", "From: "),
    ("Sujet: ", "Subject: "),
    ("A: ", "To: "),
    ("\r\n", "\n"),
]

# Default sender, keyed by the folder prefix of the mail file.
folder_from_map = {
    './Mail/friend1/out/': ['[email protected]'],
    './Mail/friend2/out/': ['[email protected]'],
    './Mail/friend1/in/': ['[email protected]'],
    './Mail/friend2/in/': ['[email protected]'],
}

# Default recipients, keyed by the folder prefix of the mail file.
folder_to_map = {
    './Mail/friend1/out/': ['[email protected]'],
    './Mail/friend2/out/': ['[email protected]'],
    './Mail/friend1/in/': ['[email protected]'],
    './Mail/friend2/in/': ['[email protected]'],
}

for pass_number in range(MAX_PASSES):
    progress = False

    for path, dirs, files in os.walk(ARCHIVE_FOLDER):
        print("Processing folder %s" % path)
        if dirs:
            print("This folder has sub folders, skip it...")
            continue

        # Sort the files of this folder into mails and non-mail files
        mail_files = []
        non_mail_files = []
        for file_name in files:
            filepath = os.path.join(path, file_name)
            if filepath in FORCED_ATTACHMENT:
                non_mail_files.append(filepath)
                continue

            with open(filepath, 'r') as source_file:
                file_content = source_file.read()
            mail = Parser().parsestr(file_content)

            # Detect french-translated headers and translate them
            if len(mail.keys()) == 1 and mail.get('De', None) is not None:
                translated = file_content
                for (s, r) in french_header_fixes:
                    translated = translated.replace(s, r)
                fixed_lines = []
                for l in translated.split('\n'):
                    if l.startswith("Date: "):
                        date = get_date(l.split('Date: ')[1])
                        # Keep the line untouched when the date cannot be
                        # converted, rather than writing "Date: None".
                        if date:
                            l = 'Date: %s' % date
                    fixed_lines.append(l)
                mail = Parser().parsestr('\n'.join(fixed_lines) + '\n')

            # General purpose fixes
            else:
                for d in mail.defects:
                    if isinstance(d, email.errors.StartBoundaryNotFoundDefect):
                        mail.del_param('boundary')
                        original_body = mail.get_payload()
                        # Strip redundant HTML part
                        mail_body = re.sub('(?is)<!DOCTYPE (.*)>', '', original_body)
                        mail_body = re.sub('(?is)<html>(.*)</html>', '', mail_body)
                        # Yahoo! mail footer is not enclosed by proper <html>
                        # tags, but always starts with a <p>
                        mail_body = re.sub('(?is)<p>(.*)>', '', mail_body)
                        if mail_body != original_body:
                            mail.set_payload(mail_body.strip())
                    elif isinstance(d, email.errors.MultipartInvariantViolationDefect):
                        mail.set_type('text/plain', requote=False)
                    else:
                        # Unknown mail parsing defect
                        pdb.set_trace()

            # Messages from voila seems to have messy encoding
            if '@voila.fr' in mail.get('Message-Id', ''):
                del mail['Content-Transfer-Encoding']

            # If we found headers then it means the file was parsed well as
            # an email.
            if mail.keys():
                mail_files.append((filepath, mail))
            else:
                mime_type = mimetypes.guess_type(filepath)[0]
                if mime_type == 'text/plain':
                    print("Consider %s as a mail." % filepath)
                    # Reuse the content read above instead of re-opening the file
                    mail = MIMEText(file_content)
                    mail_files.append((filepath, mail))
                elif not mime_type:
                    # Unknown kind of file: let the operator decide
                    pdb.set_trace()
                else:
                    non_mail_files.append(filepath)

        # Bundle attachments to their mails
        if non_mail_files:
            if len(mail_files) == 1:
                # Attach all non-mail files to the only mail found here
                print("Add to %s these attachements: %r" % (mail_files[0][0], non_mail_files))
                mail = mail_files[0][1]

                # Transform our simple mail to a multipart one
                if not mail.is_multipart():
                    new_mail = MIMEMultipart()
                    new_mail.attach(MIMEText(mail.get_payload()))
                    # Transfer all headers the new container does not define
                    # itself. Names are lower-cased and visited once each, in
                    # the order they appear in the source mail.
                    seen_headers = set(h.lower() for h in new_mail.keys())
                    for h in [k.lower() for k in mail.keys()]:
                        if h in seen_headers:
                            continue
                        seen_headers.add(h)
                        h_value = mail.get_all(h)
                        print(repr(h))
                        print(repr(h_value))
                        if isinstance(h_value, list):
                            for v in h_value:
                                new_mail.add_header(h, v)
                        else:
                            new_mail.add_header(h, h_value)
                    mail = new_mail

                for f in non_mail_files:
                    part = MIMEBase('application', "octet-stream")
                    with open(f, "rb") as attachment_file:
                        part.set_payload(attachment_file.read())
                    Encoders.encode_base64(part)
                    part.add_header('Content-Disposition', 'attachment; filename="%s"' % os.path.basename(f))
                    mail.attach(part)
                    os.remove(f)

                mail_files[0] = (mail_files[0][0], mail)
            else:
                print("Can't decide how to bundle the %s non-mail files among the %s mails. Skip the folder..." % (len(non_mail_files), len(mail_files)))
                continue

        for (filepath, mail) in mail_files:

            # The mail was saved from an IMP account
            # NOTE(review): the date extraction below is assumed to belong to
            # this branch, since the branch is only entered for mails lacking
            # both the From and the Date header it fills in - confirm.
            if (mail.get('X-Mailer-Version', None) == 'v3.57 (r)'
                    and not mail.get('From', None)
                    and not mail.get('Date', None)):
                mail.add_header('From', '[email protected]')

                # Get the part holding the main body of the mail: the mail
                # itself, or the first MIMEText part of a multipart mail.
                body_part = None
                if mail.is_multipart():
                    for payload in mail.get_payload():
                        if isinstance(payload, MIMEText):
                            body_part = payload
                            break
                else:
                    body_part = mail
                mail_body = body_part.get_payload() if body_part is not None else None
                if not isinstance(mail_body, str):
                    raise TypeError("No plain-text body found in %s" % filepath)

                # Find the hard coded date: it is expected on the first
                # non-empty line of the body. Everything before it is dropped.
                date = None
                cleaned_lines = []
                # Normalize line endings
                for l in mail_body.replace('\r\n', '\n').split('\n'):
                    if date:
                        cleaned_lines.append(l)
                    elif l:
                        date = get_date(l)
                        if not date:
                            # We have failed to convert the first non-empty
                            # line of the payload, which means the mail
                            # doesn't look like what we expect: we probably
                            # found a new species that we must analyze.
                            pdb.set_trace()
                            break

                # The body is only rewritten when the date was found
                if date:
                    mail.add_header('Date', date)
                    body_part.set_payload('\n'.join(cleaned_lines).strip())

            # If there is no subject, then use the file name
            if not mail.get('Subject', None):
                subject = os.path.splitext(os.path.basename(filepath))[0]
                print("Transform filepath %s to subject %r" % (filepath, subject))
                mail.add_header('Subject', subject)

            # If no date found: use the date of the file
            if not mail.get('Date', None):
                mail.add_header('Date', email.utils.formatdate(os.path.getmtime(filepath)))

            # Fill in a missing sender from the folder the mail lives in
            if not mail.get('From', None):
                # Drop a possibly empty header first, as done for To below
                del mail['From']
                for p in folder_from_map:
                    if filepath.startswith(p):
                        for m in folder_from_map[p]:
                            mail.add_header('From', m)
                        break

            # Fill in missing recipients from the folder the mail lives in
            if not mail.get('To', None):
                del mail["To"]
                for p in folder_to_map:
                    if filepath.startswith(p):
                        for m in folder_to_map[p]:
                            mail.add_header('To', m)
                        break

            # A mail missing one of these headers needs a human decision.
            # Explicit checks are used because asserts vanish under -O.
            if (mail.get('From', None) is None
                    or not mail.get('To', None)
                    or mail.get('Subject', None) is None
                    or mail.get('Date', None) is None):
                pdb.set_trace()

            mbox.add(mailbox.mboxMessage(mail))
            # Make sure the message reached the mbox file before its source
            # is deleted.
            mbox.flush()

            # The mail was successfully migrated, remove its source
            os.remove(filepath)
            progress = True

    # Remove empty folders
    for root, dirs, files in os.walk(ARCHIVE_FOLDER, topdown=False):
        for name in dirs:
            fname = os.path.join(root, name)
            if os.path.exists(fname) and not os.listdir(fname):
                print('Remove %s ...' % fname)
                os.removedirs(fname)
                progress = True

    # Nothing changed during this pass: another one would be identical
    if not progress:
        break