-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathhtml_parser.py
101 lines (86 loc) · 3.17 KB
/
html_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
from bs4 import BeautifulSoup
import copy
import re
import json
import os
# Patterns that capture the speaker label at the start of a <p> element.
# Three markups occur: the colon outside <strong>, inside <b>, or inside
# <strong>.  The capture is non-greedy so that a paragraph containing several
# bold labels yields only the first one instead of everything up to the last
# terminator (which would drag HTML markup into the speaker name).
speaker1 = r'<p><strong>(.+?)</strong>:'
speaker2 = r'<p><b>(.+?):</b>'
speaker3 = r'<p><strong>(.+?):</strong>'
speaker_regex_1 = re.compile(speaker1)
speaker_regex_2 = re.compile(speaker2)
speaker_regex_3 = re.compile(speaker3)
def episode_id(s, e):
    """Return the identifier of an episode, e.g. ``s09_e11``.

    Args:
        s: Season number.
        e: Episode number.

    Returns:
        A string of the form ``s<season>_e<episode>`` in which both numbers
        are zero-padded to at least two digits.
    """
    return 's%02d_e%02d' % (s, e)
def scene_id(s, e, c):
    """Return the identifier of a scene, e.g. ``s09_e11_c03``.

    Args:
        s: Season number.
        e: Episode number.
        c: Scene number.

    Returns:
        The episode identifier followed by ``_c<scene>``, the scene number
        being zero-padded to at least two digits.
    """
    return '%s_c%02d' % (episode_id(s, e), c)
def utt_id(s, e, c, u):
    """Return the identifier of an utterance, e.g. ``s09_e11_c03_u014``.

    Args:
        s: Season number.
        e: Episode number.
        c: Scene number.
        u: Utterance number.

    Returns:
        The scene identifier followed by ``_u<utterance>``, the utterance
        number being zero-padded to at least three digits.
    """
    return '%s_u%03d' % (scene_id(s, e, c), u)
# Characters folded to plain ASCII in utterance text.
# NOTE(review): except for the ellipsis, the characters the original
# replace() chain searched for were non-printable and did not survive in this
# file: the literals had become empty strings (which would insert text between
# every character) and one contained a raw line break (a syntax error).  The
# escapes below ASSUME they were Windows-1252 punctuation left behind as C1
# control characters - confirm against the transcript HTML.  One further
# "<character> -> ''" removal could not be recovered and is omitted.
_UTTERANCE_CHAR_MAP = str.maketrans({
    '\x92': "'",       # presumably the curly apostrophe
    '\u2026': '...',   # horizontal ellipsis
    '\x96': '-',       # presumably the en dash
    '\x97': '-',       # presumably the em dash
})


def parse_line(tag):
    """Split one dialogue paragraph into its speaker and utterance text.

    Args:
        tag: A parsed ``<p>`` element.  Only ``str(tag)`` (its HTML) and
            ``tag.text`` (its plain text) are used.

    Returns:
        A ``(speaker, utterance)`` tuple.  ``speaker`` is the text captured by
        the first matching speaker pattern; ``utterance`` is ``tag.text`` with
        every ``"<speaker>:"`` occurrence removed, typographic characters
        folded to ASCII and all whitespace runs collapsed to single spaces.
        ``(None, None)`` is returned when no speaker label is found.
    """
    line = str(tag)
    if '</strong>:' in line:
        matches = speaker_regex_1.findall(line)
    elif ':</b>' in line:
        matches = speaker_regex_2.findall(line)
    elif ':</strong>' in line:
        matches = speaker_regex_3.findall(line)
    else:
        # No speaker marker at all: report "not a dialogue line" instead of
        # failing on an unbound local variable.
        matches = []
    if not matches:
        return None, None

    speaker = matches[0]
    # Strip before translating so the order of operations matches the
    # original chain (leading/trailing whitespace is dropped first).
    text = tag.text.replace(speaker + ':', '').strip()
    text = text.translate(_UTTERANCE_CHAR_MAP)
    # split() without arguments never yields empty strings and treats line
    # breaks as separators, so this alone normalises all whitespace.
    return speaker, ' '.join(text.split())
def parse_p(p_list, s, e):
    """Group the paragraphs of one episode into scenes and utterances.

    A paragraph whose HTML contains ``[Scene`` opens a new scene.  Paragraphs
    seen before the first scene marker are ignored.  Inside a scene, a
    paragraph containing one of the speaker markers is handed to
    ``parse_line``; it is recorded as an utterance when a speaker is found.

    Args:
        p_list: Iterable of parsed ``<p>`` elements, in document order.
        s: Season number, used to build identifiers.
        e: Episode number, used to build identifiers.

    Returns:
        A list with one dict per scene, each holding ``scene_id``,
        ``scenes_name`` (the text of the scene-marker paragraph) and
        ``utterances`` (a list of dicts with ``utterance_id``, ``speakers``
        and ``transcript``).
    """
    speaker_markers = (':</b>', '</strong>:', ':</strong>')
    episode_data = []
    current_scene = None  # None until the first '[Scene' paragraph is seen
    c = 0
    # NOTE(review): the utterance counter is never reset, so numbering
    # continues across scenes within the episode - confirm this is intended.
    u = 0
    for tag in p_list:
        line = str(tag)

        if '[Scene' in line:
            # Close the scene in progress (if any) and open the next one.
            # The finished dict is never touched again, so no copy is needed.
            if current_scene is not None:
                episode_data.append(current_scene)
            c += 1
            current_scene = {'scene_id': scene_id(s, e, c), 'scenes_name': tag.text, 'utterances': []}
            continue

        if current_scene is None:
            continue
        if not any(marker in line for marker in speaker_markers):
            continue

        speakers, utt = parse_line(tag)
        if speakers is None:
            continue
        u += 1
        current_scene['utterances'].append(
            {'utterance_id': utt_id(s, e, c, u), 'speakers': [speakers], 'transcript': utt}
        )

    # The last scene has no following marker to close it.
    if current_scene is not None:
        episode_data.append(current_scene)
    return episode_data
# Parse a single episode (season 9, episode 11) from its saved HTML page.
# The context manager closes the file as soon as it has been read.
with open('season/0911.html', 'r', encoding='utf-8') as htmlfile:
    htmlhandle = htmlfile.read()
soup = BeautifulSoup(htmlhandle, "lxml")
res1 = soup.find_all("p")
episode_data = parse_p(res1, 9, 11)
#
# folder = 'season'
# data = {}
# for filename in os.listdir(folder):
# htmlfile = open(os.path.join(folder, filename), 'r', encoding='utf-8')
# htmlhandle = htmlfile.read()
# soup = BeautifulSoup(htmlhandle, "lxml")
# res1 = soup.find_all("p")
# s = int(filename[:2])
# e = int(filename[2:4])
# episode_data = parse_p(res1, s, e)
# data[episode_id(s, e)] = episode_data
# with open('friends_transcript.json', 'w', encoding='utf-8') as w:
# w.write(json.dumps(data))
# print(len(data.keys()))