-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_html_part.py
73 lines (58 loc) · 2.28 KB
/
extract_html_part.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/python
# FileName: extract_html_part.py
# Version 1.0 by Tao Ban, 2010.5.26
# This script extracts the HTML part of every .eml file in the source
# directory and stores it in a new file with the same name in the
# destination directory.
import email.parser
import os
import sys
import stat
import shutil
import pyzmail
def ExtractSubPayload(filename):
    """Return the HTML part of an .eml file as a string.

    Args:
        filename: Path of the .eml file to parse.

    Returns:
        ``str()`` of the payload of the message's HTML part, or an empty
        string when the message has no HTML part.

    Raises:
        SystemExit: With status 1, after printing an error message, when
            *filename* does not exist.
    """
    if not os.path.exists(filename):
        print("ERROR: input file does not exist:" + filename)
        # The os module has no exit(); sys.exit() is the call that
        # actually terminates with the requested status.
        sys.exit(1)

    # Use a context manager so the handle is closed even if parsing raises.
    with open(filename) as fp:
        msg = pyzmail.message_from_file(fp)

    payload = ""
    if msg.html_part is not None:
        # NOTE(review): if get_payload() returns bytes here, str() yields
        # the "b'...'" repr rather than decoded text -- confirm against
        # the pyzmail version in use.
        payload = str(msg.html_part.get_payload())
    return payload
def ExtractBodyFromDir(srcdir, dstdir):
    """Extract the HTML body of every file under srcdir into dstdir.

    Each entry of *srcdir* is processed in turn: subdirectories are
    recursed into (mirroring the directory layout under *dstdir*), and
    every other entry is passed to ExtractSubPayload, whose result is
    written to a file of the same name in *dstdir*.

    Args:
        srcdir: Directory containing the .eml files.
        dstdir: Directory receiving the extracted bodies; created when it
            does not exist yet.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dstdir, exist_ok=True)

    for name in os.listdir(srcdir):
        srcpath = os.path.join(srcdir, name)
        dstpath = os.path.join(dstdir, name)

        if stat.S_ISDIR(os.stat(srcpath).st_mode):
            # Subfolder: recurse, mirroring it under the destination.
            ExtractBodyFromDir(srcpath, dstpath)
            continue

        # Extract before opening the output so a failed extraction does
        # not leave an empty destination file behind.
        body = ExtractSubPayload(srcpath)
        # The context manager closes the output even if write() raises.
        with open(dstpath, 'w') as dstfile:
            dstfile.write(body)
###################################################################
# Script entry point: ask for the source and destination directories,
# then extract the HTML body of every file found under the source.
def main():
    """Prompt for source and destination directories and run the extraction.

    Exits with status 1 when the source directory does not exist, so that
    callers of the script can detect the failure.
    """
    # srcdir is the directory where the .eml files are stored
    print('Input source directory: ')
    srcdir = input()
    if not os.path.exists(srcdir):
        print('The source directory %s does not exist, exit...' % (srcdir))
        # A bare sys.exit() would report success (status 0) for an error.
        sys.exit(1)

    # dstdir is the directory where the extracted contents are stored
    print('Input destination directory: ')
    dstdir = input()
    if not os.path.exists(dstdir):
        print('The destination directory is newly created.')
        os.makedirs(dstdir)

    ExtractBodyFromDir(srcdir, dstdir)


# Guard the entry point so importing this module does not block on input().
if __name__ == '__main__':
    main()