-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathspage_reader.py
118 lines (100 loc) · 3.19 KB
/
spage_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from os_rotatefile import open_file
from .common import COLON, DEFAULT_ENCODING
from .default_schema import InnerHeaderKeys as I_KEYS
from .validator import simple_check_url
def read(fp):
    """Lazily yield every record that ``Reader`` parses from *fp*.

    This is a generator: nothing is read from *fp* until the result is
    iterated.

    Args:
        fp: File-like object handed unchanged to ``Reader``.

    Yields:
        Each record produced by ``Reader(fp).read()``, in order.
    """
    records = Reader(fp).read()
    for item in records:
        yield item
class Reader(object):
    """Incremental parser for records stored in a binary file-like object.

    As parsed by this class, a record consists of:

    1. a URL line (anything accepted by ``simple_check_url``);
    2. "key<COLON>value" inner-header lines, ended by a blank line;
    3. "key<COLON>value" HTTP-header lines, ended by a blank line;
    4. a payload whose byte length is taken from the inner-header entry
       ``I_KEYS.STORE_SIZE``.

    Parsing is a small state machine: ``self._read`` always points at the
    handler for the current state.  Each handler consumes one unit of input
    and returns ``None`` (record not finished yet), ``Reader._EOF`` (input
    exhausted) or the finished record dict.  The handlers are driven by a
    loop in :meth:`read` instead of calling each other recursively, so the
    number of lines in (or between) records is not bounded by the
    interpreter's recursion limit.

    Args:
        fp: Object providing ``readline()`` and ``read(size)`` that return
            bytes.
    """

    # Returned by a state handler when the underlying stream is exhausted.
    _EOF = object()

    def __init__(self, fp):
        self._fp = fp
        self._url_latest = None
        self._reset()

    def _reset(self):
        """Start a fresh record and go back to the inner-header state."""
        # A URL met while reading the previous record's HTTP header starts
        # the next record, so it is carried over as the new record's URL.
        self._url = self._url_latest
        self._inner_header = {}
        self._http_header = {}
        self._data = None
        self._read = self._read_inner_header
        self._url_latest = None

    def _generate(self):
        """Return the current parser state as a record dict."""
        return {
            u"url": self._url,
            u"inner_header": self._inner_header,
            u"http_header": self._http_header,
            u"data": self._data,
        }

    def _next_line(self):
        """Read one line and return it decoded and stripped.

        Returns:
            ``Reader._EOF`` when the stream is exhausted, ``None`` when the
            line cannot be decoded with ``DEFAULT_ENCODING`` (the caller
            skips it), otherwise the stripped text (possibly empty).
        """
        raw = self._fp.readline()
        if not raw:
            return self._EOF
        try:
            return raw.decode(DEFAULT_ENCODING).strip()
        except UnicodeError:
            # Undecodable bytes are skipped; any other error is a real bug
            # and is allowed to propagate.
            return None

    @staticmethod
    def _store_field(line, target):
        """Parse ``key<COLON>value`` from *line* into the dict *target*.

        Lines without a separator, or with an empty key, are ignored.
        """
        pos = line.find(COLON)
        if pos > 0:
            target[line[:pos].strip()] = line[pos + 1:].strip()

    def _read_inner_header(self):
        """Consume one line while in the inner-header state."""
        line = self._next_line()
        if line is self._EOF:
            return self._EOF
        if line is None:
            return None
        if not line and self._inner_header and self._url:
            # Blank line after a URL and at least one field: the inner
            # header is complete.
            self._read = self._read_http_header
        elif len(line) > 1024:
            # Overlong lines are ignored rather than parsed.
            pass
        elif simple_check_url(line):
            # A URL always starts a new record, discarding partial state.
            self._reset()
            self._url = line
        else:
            self._store_field(line, self._inner_header)
        return None

    def _read_http_header(self):
        """Consume one line while in the HTTP-header state."""
        line = self._next_line()
        if line is self._EOF:
            return self._EOF
        if line is None:
            return None
        if not line:
            self._read = self._read_data
        elif simple_check_url(line):
            # The next record has started early: remember its URL and
            # finish the current record without a payload.
            self._url_latest = line
            self._read = self._read_data
        else:
            self._store_field(line, self._http_header)
        return None

    def _read_data(self):
        """Read the payload (if any) and return the finished record."""
        try:
            size = int(self._inner_header.get(I_KEYS.STORE_SIZE, -1))
        except (TypeError, ValueError):
            # A malformed size is handled like a missing one instead of
            # aborting the whole stream.
            size = -1
        if size < 0 or self._url_latest is not None:
            return self._generate()
        data = self._fp.read(size)
        if size > 0 and not data:
            return self._EOF
        # compat invalid format: no http headers but write two '\r\n'
        if not self._http_header and data[0:2] == b"\r\n":
            data = data[2:] + self._fp.read(2)
        self._data = data
        return self._generate()

    def read(self):
        """Yield record dicts until the underlying stream is exhausted.

        Yields:
            dict: Keys ``url``, ``inner_header``, ``http_header`` and
            ``data``.  ``data`` is ``None`` when no payload was read.
        """
        while True:
            outcome = self._read()
            if outcome is self._EOF:
                return
            if outcome is None:
                continue
            yield outcome
            self._reset()
class SpageReader(object):
    """Record reader that opens its own input through ``open_file``.

    The instance owns the opened handle.  Call :meth:`close` when finished,
    or use the instance as a context manager so the handle is closed even
    if iteration raises::

        with SpageReader(base_filename) as reader:
            for record in reader.read():
                ...

    Args:
        base_filename: Passed to ``open_file`` together with mode ``"r"``.
    """

    def __init__(self, base_filename):
        self._fp = open_file(base_filename, "r")

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never suppress an exception raised inside the ``with`` body.
        return False

    def close(self):
        """Close the underlying file handle."""
        self._fp.close()

    def read(self):
        """Yield every record parsed from the underlying file handle."""
        for record in read(self._fp):
            yield record