diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 0000000..b04ae3e --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,4 @@ +[isort] +known_first_party = os_spage +known_third_party = pytest, jsonschema +skip_glob = *.pyi diff --git a/.travis.yml b/.travis.yml index cfb10b1..a9ea398 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,7 +5,7 @@ matrix: - python: 2.7 env: TOXENV=py27,codecov - python: 3.6 - env: TOXENV=py36,codecov + env: TOXENV=py36,codecov,lint - python: pypy env: TOXENV=pypy - python: pypy3 diff --git a/requirements/requirements-lint.txt b/requirements/requirements-lint.txt new file mode 100644 index 0000000..ddec3be --- /dev/null +++ b/requirements/requirements-lint.txt @@ -0,0 +1,3 @@ +autoflake +black +isort diff --git a/script/lint.sh b/script/lint.sh new file mode 100644 index 0000000..a543e10 --- /dev/null +++ b/script/lint.sh @@ -0,0 +1,15 @@ +#!/bin/sh -e + + +export PREFIX="" +if [ -d 'venv' ] ; then + export PREFIX="venv/bin/" +fi + +set -x + +pip install -r requirements/requirements-lint.txt + +${PREFIX}autoflake --in-place --recursive --remove-all-unused-imports --remove-unused-variables src tests +${PREFIX}black --exclude=".pyi$" src tests +${PREFIX}isort --multi-line=3 --trailing-comma --force-grid-wrap=0 --combine-as --line-width 88 --recursive --apply src tests diff --git a/src/os_spage/__init__.py b/src/os_spage/__init__.py index b3dbd56..ff9a0a5 100644 --- a/src/os_spage/__init__.py +++ b/src/os_spage/__init__.py @@ -1,8 +1,11 @@ import pkgutil import sys -from .spage_reader import SpageReader, read as read_spage -from .spage_writer import SpageWriter, write + from .offpage_reader import OffpageReader, read as read_offpage +from .spage_reader import SpageReader, read as read_spage +from .spage_writer import SpageWriter, write as _write + +write = _write def __not_supported_mode(name, **kwargs): @@ -13,27 +16,27 @@ def __not_supported_page_type(name, **kwargs): raise ValueError("page_type must be 'spage' or 'offpage'") -def read(s, page_type='spage'): - r = {'spage': read_spage, 'offpage': read_offpage}.get( - page_type, __not_supported_page_type) +def read(s, page_type="spage"): + r = {"spage": read_spage, "offpage": read_offpage}.get( + page_type, __not_supported_page_type + ) return r(s) def open_file(name, mode, **kwargs): - r = {'w': SpageWriter, - 'r': {'spage': SpageReader, - 'offpage': OffpageReader}}.get(mode, __not_supported_mode) - if mode == 'r': - r = r.get(kwargs.pop('page_type', 'spage'), __not_supported_page_type) + r = {"w": SpageWriter, "r": {"spage": SpageReader, "offpage": OffpageReader}}.get( + mode, __not_supported_mode + ) + if mode == "r": + r = r.get(kwargs.pop("page_type", "spage"), __not_supported_page_type) return r(name, **kwargs) -__all__ = ['__version__', 'version_info', 'open_file'] +__all__ = ["__version__", "version_info", "open_file"] -__version__ = pkgutil.get_data(__package__, 'VERSION').decode('ascii').strip() -version_info = tuple(int(v) if v.isdigit() else v - for v in __version__.split('.')) +__version__ = pkgutil.get_data(__package__, "VERSION").decode("ascii").strip() +version_info = tuple(int(v) if v.isdigit() else v for v in __version__.split(".")) if sys.version_info < (2, 7): sys.exit("os-spage %s requires Python 2.7+" % __version__) diff --git a/src/os_spage/common.py b/src/os_spage/common.py index 8eb239a..05122b0 100644 --- a/src/os_spage/common.py +++ b/src/os_spage/common.py @@ -1,5 +1,4 @@ - -DEFAULT_ENCODING = 'utf-8' +DEFAULT_ENCODING = "utf-8" COLON = ":" diff --git a/src/os_spage/compat.py 
b/src/os_spage/compat.py index 0f5b9c6..87ea67d 100644 --- a/src/os_spage/compat.py +++ b/src/os_spage/compat.py @@ -1,12 +1,16 @@ -import sys import operator +import sys PY3 = sys.version_info[0] >= 3 if PY3: - from io import StringIO + from io import StringIO as _StringIO + iteritems = operator.methodcaller("items") else: - from StringIO import StringIO + from StringIO import StringIO as _StringIO + iteritems = operator.methodcaller("iteritems") + +StringIO = _StringIO diff --git a/src/os_spage/default_schema.py b/src/os_spage/default_schema.py index 43b60dc..601c1b4 100644 --- a/src/os_spage/default_schema.py +++ b/src/os_spage/default_schema.py @@ -3,127 +3,132 @@ class RecordTypes(object): - FLAT = 'flat' - DELETED = 'deleted' - COMPRESSED = 'compressed' + FLAT = "flat" + DELETED = "deleted" + COMPRESSED = "compressed" class InnerHeaderKeys(object): - VERSION = 'Version' - TYPE = 'Type' - FETCH_TIME = 'Fetch-Time' - ORIGINAL_SIZE = 'Original-Size' - STORE_SIZE = 'Store-Size' - BATCH_ID = 'batchID' - ATTACH = 'attach' - IP_ADDRESS = 'IP-Address' - SPIDER_ADDRESS = 'Spider-Address' - DIGEST = 'Digest' - USER_AGENT = 'User-Agent' - FETCH_IP = 'Fetch-IP' - NODE_FETCH_TIME = 'Node-Fetch-Time' - ERROR_REASON = 'Error-Reason' + VERSION = "Version" + TYPE = "Type" + FETCH_TIME = "Fetch-Time" + ORIGINAL_SIZE = "Original-Size" + STORE_SIZE = "Store-Size" + BATCH_ID = "batchID" + ATTACH = "attach" + IP_ADDRESS = "IP-Address" + SPIDER_ADDRESS = "Spider-Address" + DIGEST = "Digest" + USER_AGENT = "User-Agent" + FETCH_IP = "Fetch-IP" + NODE_FETCH_TIME = "Node-Fetch-Time" + ERROR_REASON = "Error-Reason" INNER_HEADER_SCHEMA = { "type": "object", - "properties": OrderedDict([ - (InnerHeaderKeys.VERSION, { - "type": "string", - "default": "1.2", - }), - (InnerHeaderKeys.TYPE, { # autofill or specify - "type": "string", - "enum": set([getattr(RecordTypes, i) for i in dir(RecordTypes) if not i.startswith('_')]) , - }), - (InnerHeaderKeys.FETCH_TIME, { # record store time - "anyOf": [ - { - "type": "datetime", + "properties": OrderedDict( + [ + (InnerHeaderKeys.VERSION, {"type": "string", "default": "1.2"}), + ( + InnerHeaderKeys.TYPE, + { # autofill or specify + "type": "string", + "enum": set( + [ + getattr(RecordTypes, i) + for i in dir(RecordTypes) + if not i.startswith("_") + ] + ), + }, + ), + ( + InnerHeaderKeys.FETCH_TIME, + { # record store time + "anyOf": [ + {"type": "datetime"}, + {"type": "string", "format": "readable_time"}, + ], + "default": datetime.now, }, - { + ), + ( + InnerHeaderKeys.ORIGINAL_SIZE, + {"type": "number"}, # data(html) size, autofill + ), + (InnerHeaderKeys.STORE_SIZE, {"type": "number"}), # store size, autofill + ( + InnerHeaderKeys.BATCH_ID, + { # batch identity "type": "string", - "format": "readable_time" + "minLength": 3, + "default": "__CHANGE_ME__", }, - ], - "default": datetime.now, - }), - (InnerHeaderKeys.ORIGINAL_SIZE, { # data(html) size, autofill - "type": "number", - }), - (InnerHeaderKeys.STORE_SIZE, { # store size, autofill - "type": "number", - }), - (InnerHeaderKeys.BATCH_ID, { # batch identity - "type": "string", - "minLength": 3, - "default": '__CHANGE_ME__', - }), - (InnerHeaderKeys.ATTACH, { - "type": "string", - }), - (InnerHeaderKeys.IP_ADDRESS, { # remote host ip - "type": "string", - "format": "ipv4", - "default": "0.0.0.0", - }), - (InnerHeaderKeys.SPIDER_ADDRESS, { # spider node identity - "type": "string", - "default": "0.0.0.0", - }), - (InnerHeaderKeys.DIGEST, { # can be html md5 - "type": "string", - "default": "0" * 32, - "maxLength": 32, 
- "minLength": 32, - }), - (InnerHeaderKeys.USER_AGENT, { - "type": "string", - }), - (InnerHeaderKeys.FETCH_IP, { # generate page machine ip - "type": "string", - "format": "ipv4", - "default": "0.0.0.0", - }), - (InnerHeaderKeys.NODE_FETCH_TIME, { # real fetch time - "anyOf": [ - { - "type": "datetime", + ), + (InnerHeaderKeys.ATTACH, {"type": "string"}), + ( + InnerHeaderKeys.IP_ADDRESS, + { # remote host ip + "type": "string", + "format": "ipv4", + "default": "0.0.0.0", }, - { + ), + ( + InnerHeaderKeys.SPIDER_ADDRESS, + {"type": "string", "default": "0.0.0.0"}, # spider node identity + ), + ( + InnerHeaderKeys.DIGEST, + { # can be html md5 "type": "string", - "format": "readable_time" + "default": "0" * 32, + "maxLength": 32, + "minLength": 32, }, - ] - }), - (InnerHeaderKeys.ERROR_REASON, { - "type": "string", - "format": "error_reaseon", - }), - ]), + ), + (InnerHeaderKeys.USER_AGENT, {"type": "string"}), + ( + InnerHeaderKeys.FETCH_IP, + { # generate page machine ip + "type": "string", + "format": "ipv4", + "default": "0.0.0.0", + }, + ), + ( + InnerHeaderKeys.NODE_FETCH_TIME, + { # real fetch time + "anyOf": [ + {"type": "datetime"}, + {"type": "string", "format": "readable_time"}, + ] + }, + ), + ( + InnerHeaderKeys.ERROR_REASON, + {"type": "string", "format": "error_reaseon"}, + ), + ] + ), } + class SpageKeys(object): - URL = 'url' - INNER_HEADER = 'inner_header' - HTTP_HEADER = 'http_header' - DATA = 'data' + URL = "url" + INNER_HEADER = "inner_header" + HTTP_HEADER = "http_header" + DATA = "data" META_SCHEMA = { "type": "object", "properties": { - SpageKeys.URL: { - "type": "string", - "format": "url", - }, + SpageKeys.URL: {"type": "string", "format": "url"}, SpageKeys.INNER_HEADER: INNER_HEADER_SCHEMA, - SpageKeys.HTTP_HEADER: { - "type": "object" - }, - SpageKeys.DATA: { - "type": "bytes", - }, + SpageKeys.HTTP_HEADER: {"type": "object"}, + SpageKeys.DATA: {"type": "bytes"}, }, "required": [SpageKeys.URL, SpageKeys.INNER_HEADER], } diff --git a/src/os_spage/offpage_reader.py b/src/os_spage/offpage_reader.py index b87cacc..3f03459 100644 --- a/src/os_spage/offpage_reader.py +++ b/src/os_spage/offpage_reader.py @@ -3,7 +3,7 @@ from .common import COLON, DEFAULT_ENCODING from .validator import simple_check_url -CONTENT_TYPE = 'Content-Type' +CONTENT_TYPE = "Content-Type" def read(fp): @@ -27,9 +27,9 @@ def _reset(self): def _generate(self): d = {} - d[u'url'] = self._url - d[u'header'] = self._header - d[u'data'] = self._data + d[u"url"] = self._url + d[u"header"] = self._header + d[u"data"] = self._data return d def _read_header(self): @@ -38,7 +38,7 @@ def _read_header(self): raise StopIteration try: line = line.decode(DEFAULT_ENCODING).strip() - except Exception as e: + except Exception: return self._read() line_length = len(line) if line_length <= 0 and self._header: @@ -52,12 +52,12 @@ def _read_header(self): d = line.find(COLON) if d > 0: key = line[0:d].strip() - value = line[d + 1:].strip() + value = line[d + 1 :].strip() self._header[key] = value return self._read() def _split_series(self, series): - s = [tuple(i.split(',')) for i in series.split(';') if ',' in i] + s = [tuple(i.split(",")) for i in series.split(";") if "," in i] return s def _read_data(self): @@ -82,7 +82,7 @@ def read(self): class OffpageReader(object): def __init__(self, base_filename): - self._fp = open_file(base_filename, 'r') + self._fp = open_file(base_filename, "r") def close(self): self._fp.close() diff --git a/src/os_spage/spage_reader.py b/src/os_spage/spage_reader.py index 
449c279..7b8235f 100644 --- a/src/os_spage/spage_reader.py +++ b/src/os_spage/spage_reader.py @@ -27,10 +27,10 @@ def _reset(self): def _generate(self): d = {} - d[u'url'] = self._url - d[u'inner_header'] = self._inner_header - d[u'http_header'] = self._http_header - d[u'data'] = self._data + d[u"url"] = self._url + d[u"inner_header"] = self._inner_header + d[u"http_header"] = self._http_header + d[u"data"] = self._data return d def _read_inner_header(self): @@ -39,7 +39,7 @@ def _read_inner_header(self): raise StopIteration try: line = line.decode(DEFAULT_ENCODING).strip() - except Exception as e: + except Exception: return self._read() line_length = len(line) if line_length <= 0 and self._inner_header: @@ -53,7 +53,7 @@ def _read_inner_header(self): d = line.find(COLON) if d > 0: key = line[0:d].strip() - value = line[d + 1:].strip() + value = line[d + 1 :].strip() self._inner_header[key] = value return self._read() @@ -63,7 +63,7 @@ def _read_http_header(self): raise StopIteration try: line = line.decode(DEFAULT_ENCODING).strip() - except Exception as e: + except Exception: return self._read() if not line: self._read = self._read_data @@ -74,7 +74,7 @@ def _read_http_header(self): d = line.find(COLON) if d > 0: key = line[0:d].strip() - value = line[d + 1:].strip() + value = line[d + 1 :].strip() self._http_header[key] = value return self._read() @@ -98,7 +98,7 @@ def read(self): class SpageReader(object): def __init__(self, base_filename): - self._fp = open_file(base_filename, 'r') + self._fp = open_file(base_filename, "r") def close(self): self._fp.close() diff --git a/src/os_spage/spage_writer.py b/src/os_spage/spage_writer.py index c65968b..20177aa 100644 --- a/src/os_spage/spage_writer.py +++ b/src/os_spage/spage_writer.py @@ -1,8 +1,6 @@ import abc import copy -import time import zlib -from collections import OrderedDict from datetime import datetime from io import BytesIO @@ -10,10 +8,12 @@ from .common import DEFAULT_ENCODING, TIME_FORMAT from .compat import StringIO, iteritems -from .default_schema import InnerHeaderKeys as I_KEYS -from .default_schema import RecordTypes as R_TYPES -from .default_schema import SpageKeys as S_KEYS -from .default_schema import META_SCHEMA +from .default_schema import ( + META_SCHEMA, + InnerHeaderKeys as I_KEYS, + RecordTypes as R_TYPES, + SpageKeys as S_KEYS, +) from .validator import create_validator @@ -37,8 +37,9 @@ def process(self, record, **kwargs): if not record[S_KEYS.HTTP_HEADER]: record.pop(S_KEYS.HTTP_HEADER) inner_header = record[S_KEYS.INNER_HEADER] - store_size = original_size = len( - record[S_KEYS.DATA]) if record[S_KEYS.DATA] is not None else -1 + store_size = original_size = ( + len(record[S_KEYS.DATA]) if record[S_KEYS.DATA] is not None else -1 + ) if original_size >= 0: data = record[S_KEYS.DATA] @@ -46,7 +47,8 @@ def process(self, record, **kwargs): if r_type is None: if I_KEYS.ORIGINAL_SIZE in inner_header: raise ValueError( - 'do not specify %s without Type' % I_KEYS.ORIGINAL_SIZE) + "do not specify %s without Type" % I_KEYS.ORIGINAL_SIZE + ) inner_header[I_KEYS.ORIGINAL_SIZE] = original_size if self._compress: inner_header[I_KEYS.TYPE] = R_TYPES.COMPRESSED @@ -56,8 +58,7 @@ def process(self, record, **kwargs): inner_header[I_KEYS.TYPE] = R_TYPES.FLAT elif r_type == R_TYPES.COMPRESSED or r_type == R_TYPES.DELETED: if I_KEYS.ORIGINAL_SIZE not in inner_header: - raise ValueError('inner_header require %s' % - I_KEYS.ORIGINAL_SIZE) + raise ValueError("inner_header require %s" % I_KEYS.ORIGINAL_SIZE) elif r_type == 
R_TYPES.FLAT: inner_header[I_KEYS.ORIGINAL_SIZE] = original_size @@ -84,20 +85,21 @@ def dumps(self, record, **kwargs): class SpageRecordEncoder(RecordEncoder): - def __init__(self, allowed_inner_header_keys=None): self._inner_header_keys = allowed_inner_header_keys def _http_header_str(self, http_header): if not http_header: return None - return '\r\n'.join([': '.join((k.strip(), v.strip())) - for k, v in iteritems(http_header)]) + return "\r\n".join( + [": ".join((k.strip(), v.strip())) for k, v in iteritems(http_header)] + ) def _inner_header_str(self, inner_header): o = StringIO() - keys = self._inner_header_keys if self._inner_header_keys \ - else inner_header.keys() + keys = ( + self._inner_header_keys if self._inner_header_keys else inner_header.keys() + ) for k in keys: if k in inner_header: v = inner_header[k] @@ -105,8 +107,8 @@ def _inner_header_str(self, inner_header): continue if isinstance(v, datetime): v = v.strftime(TIME_FORMAT) - o.write(': '.join((str(k).strip(), str(v).strip()))) - o.write('\n') + o.write(": ".join((str(k).strip(), str(v).strip()))) + o.write("\n") data_length = o.tell() - 1 o.seek(0) @@ -115,24 +117,23 @@ def _inner_header_str(self, inner_header): def dumps(self, record, **kwargs): o = BytesIO() o.write(record[S_KEYS.URL].encode(DEFAULT_ENCODING)) - o.write(b'\n') + o.write(b"\n") inner_header_str = self._inner_header_str(record[S_KEYS.INNER_HEADER]) o.write(inner_header_str.encode(DEFAULT_ENCODING)) - o.write(b'\n\n') + o.write(b"\n\n") - http_header_str = self._http_header_str( - record.get(S_KEYS.HTTP_HEADER, None)) + http_header_str = self._http_header_str(record.get(S_KEYS.HTTP_HEADER, None)) if not http_header_str: - o.write(b'\r\n') + o.write(b"\r\n") else: o.write(http_header_str.encode(DEFAULT_ENCODING)) - o.write(b'\r\n\r\n') + o.write(b"\r\n\r\n") data = record.get(S_KEYS.DATA, None) if data is not None: o.write(data) - o.write(b'\r\n') + o.write(b"\r\n") o.seek(0) return o.read() @@ -154,13 +155,16 @@ def __init__(self, processor, encoder): def write(self, f, url, inner_header=None, http_header=None, data=None): if not isinstance(data, (bytes, type(None))): raise ValueError( - "bytes-like data is required, not {}".format(type(data).__name__)) + "bytes-like data is required, not {}".format(type(data).__name__) + ) record = {} record[S_KEYS.URL] = url - record[S_KEYS.INNER_HEADER] = {} if inner_header is None else copy.deepcopy( - inner_header) - record[S_KEYS.HTTP_HEADER] = {} if http_header is None else copy.deepcopy( - http_header) + record[S_KEYS.INNER_HEADER] = ( + {} if inner_header is None else copy.deepcopy(inner_header) + ) + record[S_KEYS.HTTP_HEADER] = ( + {} if http_header is None else copy.deepcopy(http_header) + ) record[S_KEYS.DATA] = data record = self._processor.process(record) @@ -168,31 +172,28 @@ def write(self, f, url, inner_header=None, http_header=None, data=None): def create_writer(**kwargs): - validator = kwargs.get('validator', None) - validator = create_validator( - META_SCHEMA) if validator is None else validator - processor = SpageRecordProcessor(validator, kwargs.get('compress', True)) - allowed_keys = validator.schema['properties'][S_KEYS.INNER_HEADER]['properties'].keys( - ) + validator = kwargs.get("validator", None) + validator = create_validator(META_SCHEMA) if validator is None else validator + processor = SpageRecordProcessor(validator, kwargs.get("compress", True)) + allowed_keys = validator.schema["properties"][S_KEYS.INNER_HEADER][ + "properties" + ].keys() encoder = SpageRecordEncoder(allowed_keys) 
return SpageRecordWriter(processor, encoder) class SpageWriter(object): - def __init__(self, base_filename, roll_size='1G', compress=True, validator=None): - self._fp = open_file(base_filename, 'w', roll_size=roll_size) - self._record_writer = create_writer( - validator=validator, compress=compress) + def __init__(self, base_filename, roll_size="1G", compress=True, validator=None): + self._fp = open_file(base_filename, "w", roll_size=roll_size) + self._record_writer = create_writer(validator=validator, compress=compress) def close(self): self._fp.close() def write(self, url, inner_header=None, http_header=None, data=None, flush=False): self._record_writer.write( - self._fp, url, - inner_header=inner_header, - http_header=http_header, - data=data) + self._fp, url, inner_header=inner_header, http_header=http_header, data=data + ) if flush: self._fp.flush() diff --git a/src/os_spage/validator.py b/src/os_spage/validator.py index 4cc9e9f..8bae235 100644 --- a/src/os_spage/validator.py +++ b/src/os_spage/validator.py @@ -1,8 +1,7 @@ import copy from datetime import datetime -from jsonschema import (Draft4Validator, FormatChecker, ValidationError, - validators) +from jsonschema import Draft4Validator, FormatChecker, validators from jsonschema.compat import str_types from .common import TIME_FORMAT @@ -26,11 +25,11 @@ def simple_check_url(url): @FormatChecker.cls_checks("error_reason") def check_error_reason(err_string): - c = err_string.split(' ') + c = err_string.split(" ") if len(c) != 2: return False - return c[0] in ERROR_TYPES and c[1].lstrip('-').isdigit() + return c[0] in ERROR_TYPES and c[1].lstrip("-").isdigit() def extend_with_default(validator_class): @@ -44,19 +43,15 @@ def set_defaults(validator, properties, instance, schema): o = o() instance.setdefault(property, o) - for error in validate_properties( - validator, properties, instance, schema - ): + for error in validate_properties(validator, properties, instance, schema): yield error - return validators.extend( - validator_class, {"properties": set_defaults}, - ) + return validators.extend(validator_class, {"properties": set_defaults}) DefaultPropertyDraft4Validator = extend_with_default(Draft4Validator) -EXTRA_TYPES = {'datetime': datetime, 'bytes': bytes} +EXTRA_TYPES = {"datetime": datetime, "bytes": bytes} def create_validator(schema, extra_types=None, format_checker=None): @@ -67,7 +62,5 @@ def create_validator(schema, extra_types=None, format_checker=None): format_checker = FormatChecker() return DefaultPropertyDraft4Validator( - schema, - types=types, - format_checker=format_checker, + schema, types=types, format_checker=format_checker ) diff --git a/tests/test_offpage_reader.py b/tests/test_offpage_reader.py index ff9ed1b..22201b9 100644 --- a/tests/test_offpage_reader.py +++ b/tests/test_offpage_reader.py @@ -1,35 +1,36 @@ -import zlib from io import BytesIO from os_spage import open_file, read def test_read_offpage(tmpdir): - url = 'http://www.google.com/' - data1 = '1'*10 - data2 = '2'*11 - data3 = '3'*12 - raw = ''' + url = "http://www.google.com/" + data1 = "1" * 10 + data2 = "2" * 11 + data3 = "3" * 12 + raw = """ {url} Key1: Value1 Content-Type: A, 10;B, 11;C, 12; {data1}{data2}{data3} - '''.format(url=url, data1=data1, data2=data2, data3=data3) - f = tmpdir.join('testfile.dat') - f.write(raw.encode('utf8')) - reader = open_file(f.strpath, 'r', page_type='offpage') + """.format( + url=url, data1=data1, data2=data2, data3=data3 + ) + f = tmpdir.join("testfile.dat") + f.write(raw.encode("utf8")) + reader = 
open_file(f.strpath, "r", page_type="offpage") for page in reader.read(): - assert page['url'] == 'http://www.google.com/' - assert page['header']['Key1'] == 'Value1' - assert page['data']['A'] == data1.encode() - assert page['data']['B'] == data2.encode() - assert page['data']['C'] == data3.encode() + assert page["url"] == "http://www.google.com/" + assert page["header"]["Key1"] == "Value1" + assert page["data"]["A"] == data1.encode() + assert page["data"]["B"] == data2.encode() + assert page["data"]["C"] == data3.encode() - s = BytesIO(raw.encode('utf8')) - for page in read(s, page_type='offpage'): - assert page['url'] == 'http://www.google.com/' - assert page['header']['Key1'] == 'Value1' - assert page['data']['A'] == data1.encode() - assert page['data']['B'] == data2.encode() - assert page['data']['C'] == data3.encode() + s = BytesIO(raw.encode("utf8")) + for page in read(s, page_type="offpage"): + assert page["url"] == "http://www.google.com/" + assert page["header"]["Key1"] == "Value1" + assert page["data"]["A"] == data1.encode() + assert page["data"]["B"] == data2.encode() + assert page["data"]["C"] == data3.encode() diff --git a/tests/test_reader_and_writer.py b/tests/test_reader_and_writer.py index c6b903d..e369abe 100644 --- a/tests/test_reader_and_writer.py +++ b/tests/test_reader_and_writer.py @@ -2,23 +2,26 @@ import pytest from jsonschema import ValidationError + from os_spage import open_file, read, write from os_spage.compat import iteritems -from os_spage.default_schema import InnerHeaderKeys as I_KEYS -from os_spage.default_schema import RecordTypes as R_TYPES -from os_spage.default_schema import SpageKeys as S_KEYS +from os_spage.default_schema import ( + InnerHeaderKeys as I_KEYS, + RecordTypes as R_TYPES, + SpageKeys as S_KEYS, +) def test_write_invalid_data(tmpdir): with tmpdir.as_cwd(): - f = open_file('test', 'w') + f = open_file("test", "w") with pytest.raises(ValidationError): - f.write(url='abc', inner_header={I_KEYS.BATCH_ID: 'test'}) + f.write(url="abc", inner_header={I_KEYS.BATCH_ID: "test"}) def check_inner_header(w_inner_header, r_inner_header): if not w_inner_header: - assert r_inner_header[I_KEYS.BATCH_ID] == '__CHANGE_ME__' + assert r_inner_header[I_KEYS.BATCH_ID] == "__CHANGE_ME__" return for k, v in iteritems(w_inner_header): @@ -48,27 +51,32 @@ def check_data(w_data, r_data, r_type): RECORDS = [ # inner_header, http_header, data (None, None, None), - ({I_KEYS.BATCH_ID: 'test'}, {'k1': 'v1'}, b'hello'), - ({I_KEYS.BATCH_ID: 'test'}, {}, b'hello'), - ({I_KEYS.BATCH_ID: 'test'}, {'k1': 'v1'}, None), - ({}, {'k1': 'v1'}, b'hello'), - ({}, {'k1': 'v1'}, None), + ({I_KEYS.BATCH_ID: "test"}, {"k1": "v1"}, b"hello"), + ({I_KEYS.BATCH_ID: "test"}, {}, b"hello"), + ({I_KEYS.BATCH_ID: "test"}, {"k1": "v1"}, None), + ({}, {"k1": "v1"}, b"hello"), + ({}, {"k1": "v1"}, None), ] def test_spage_reader_and_writer(tmpdir): base_url = "http://www.test.com/" - filename_prefix = 'test_file_' + filename_prefix = "test_file_" with tmpdir.as_cwd(): - f = open_file(filename_prefix, 'w', roll_size=100) + f = open_file(filename_prefix, "w", roll_size=100) idx = 0 for inner_header, http_header, data in RECORDS: url = base_url + str(idx) idx += 1 - f.write(url, inner_header=inner_header, - http_header=http_header, data=data, flush=True) + f.write( + url, + inner_header=inner_header, + http_header=http_header, + data=data, + flush=True, + ) f.close() - f = open_file(filename_prefix, 'r') + f = open_file(filename_prefix, "r") idx = 0 for record in f.read(): inner_header, 
http_header, data = RECORDS[idx] @@ -77,23 +85,23 @@ def test_spage_reader_and_writer(tmpdir): assert url == record[S_KEYS.URL] check_inner_header(inner_header, record[S_KEYS.INNER_HEADER]) check_http_header(http_header, record[S_KEYS.HTTP_HEADER]) - check_data(data, record[S_KEYS.DATA], - record[S_KEYS.INNER_HEADER][I_KEYS.TYPE]) + check_data( + data, record[S_KEYS.DATA], record[S_KEYS.INNER_HEADER][I_KEYS.TYPE] + ) def test_genergal_read_and_write(tmpdir): base_url = "http://www.test.com/" with tmpdir.as_cwd(): - filename = 'test_file' - f = open(filename, 'wb') + filename = "test_file" + f = open(filename, "wb") idx = 0 for inner_header, http_header, data in RECORDS: url = base_url + str(idx) idx += 1 - write(f, url, inner_header=inner_header, - http_header=http_header, data=data) + write(f, url, inner_header=inner_header, http_header=http_header, data=data) f.close() - f = open(filename, 'rb') + f = open(filename, "rb") idx = 0 for record in read(f): inner_header, http_header, data = RECORDS[idx] @@ -102,18 +110,36 @@ def test_genergal_read_and_write(tmpdir): assert url == record[S_KEYS.URL] check_inner_header(inner_header, record[S_KEYS.INNER_HEADER]) check_http_header(http_header, record[S_KEYS.HTTP_HEADER]) - check_data(data, record[S_KEYS.DATA], - record[S_KEYS.INNER_HEADER][I_KEYS.TYPE]) + check_data( + data, record[S_KEYS.DATA], record[S_KEYS.INNER_HEADER][I_KEYS.TYPE] + ) CUSTOM_RECORDS = [ # inner_header, http_header, data - ({I_KEYS.BATCH_ID: 'test', I_KEYS.TYPE: R_TYPES.COMPRESSED, - I_KEYS.ORIGINAL_SIZE: 5}, {'k1': 'v1'}, zlib.compress(b'hello')), - ({I_KEYS.BATCH_ID: 'test', I_KEYS.TYPE: R_TYPES.FLAT, - I_KEYS.ORIGINAL_SIZE: 5}, {'k1': 'v1'}, b'hello'), - ({I_KEYS.BATCH_ID: 'test', I_KEYS.TYPE: R_TYPES.DELETED, - I_KEYS.ORIGINAL_SIZE: 5}, {'k1': 'v1'}, b'hello'), + ( + { + I_KEYS.BATCH_ID: "test", + I_KEYS.TYPE: R_TYPES.COMPRESSED, + I_KEYS.ORIGINAL_SIZE: 5, + }, + {"k1": "v1"}, + zlib.compress(b"hello"), + ), + ( + {I_KEYS.BATCH_ID: "test", I_KEYS.TYPE: R_TYPES.FLAT, I_KEYS.ORIGINAL_SIZE: 5}, + {"k1": "v1"}, + b"hello", + ), + ( + { + I_KEYS.BATCH_ID: "test", + I_KEYS.TYPE: R_TYPES.DELETED, + I_KEYS.ORIGINAL_SIZE: 5, + }, + {"k1": "v1"}, + b"hello", + ), ] @@ -127,16 +153,15 @@ def check_custom_data(w_data, r_data): def test_write_custom_data_and_read(tmpdir): base_url = "http://www.test.com/" with tmpdir.as_cwd(): - filename_prefix = 'test_file_' - f = open_file(filename_prefix, 'w', roll_size=100) + filename_prefix = "test_file_" + f = open_file(filename_prefix, "w", roll_size=100) idx = 0 for inner_header, http_header, data in CUSTOM_RECORDS: url = base_url + str(idx) idx += 1 - f.write(url, inner_header=inner_header, - http_header=http_header, data=data) + f.write(url, inner_header=inner_header, http_header=http_header, data=data) f.close() - f = open_file(filename_prefix, 'r') + f = open_file(filename_prefix, "r") idx = 0 for record in f.read(): inner_header, http_header, data = CUSTOM_RECORDS[idx] @@ -150,23 +175,30 @@ def test_write_custom_data_and_read(tmpdir): INVALID_CUSTOM_RECORDS = [ # inner_header, http_header, data - ({I_KEYS.BATCH_ID: 'test', I_KEYS.TYPE: R_TYPES.COMPRESSED}, - {'k1': 'v1'}, zlib.compress(b'hello')), - ({I_KEYS.BATCH_ID: 'test', I_KEYS.ORIGINAL_SIZE: 5}, - {'k1': 'v1'}, zlib.compress(b'hello')), + ( + {I_KEYS.BATCH_ID: "test", I_KEYS.TYPE: R_TYPES.COMPRESSED}, + {"k1": "v1"}, + zlib.compress(b"hello"), + ), + ( + {I_KEYS.BATCH_ID: "test", I_KEYS.ORIGINAL_SIZE: 5}, + {"k1": "v1"}, + zlib.compress(b"hello"), + ), ] def 
test_write_invalid_custom_data(tmpdir): base_url = "http://www.test.com/" with tmpdir.as_cwd(): - filename_prefix = 'test_file_' - f = open_file(filename_prefix, 'w', roll_size=100) + filename_prefix = "test_file_" + f = open_file(filename_prefix, "w", roll_size=100) idx = 0 for inner_header, http_header, data in INVALID_CUSTOM_RECORDS: url = base_url + str(idx) idx += 1 with pytest.raises(ValueError): - f.write(url, inner_header=inner_header, - http_header=http_header, data=data) + f.write( + url, inner_header=inner_header, http_header=http_header, data=data + ) f.close() diff --git a/tests/test_validator.py b/tests/test_validator.py index 2e0ef97..b28c8ed 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -1,28 +1,21 @@ - from os_spage.validator import check_error_reason, simple_check_url def test_check_error_reason(): - valid_data = ['HTTP 404', 'RULE 16', 'DNS -2', 'SERVER 110', 'SSL -2'] + valid_data = ["HTTP 404", "RULE 16", "DNS -2", "SERVER 110", "SSL -2"] for data in valid_data: assert check_error_reason(data) == True - invalid_data = ['HTTP404', 'RULE 16', 'RULE TEST'] + invalid_data = ["HTTP404", "RULE 16", "RULE TEST"] for data in invalid_data: assert check_error_reason(data) == False def test_simple_check_url(): - valid_data = [ - "http://www.google.com/", - "https://www.google.com/", - ] + valid_data = ["http://www.google.com/", "https://www.google.com/"] for data in valid_data: assert simple_check_url(data) == True - invalid_data = [ - "htp://www.google.com/", - "https/www.google.com/", - ] + invalid_data = ["htp://www.google.com/", "https/www.google.com/"] for data in invalid_data: - assert simple_check_url(data) == False \ No newline at end of file + assert simple_check_url(data) == False diff --git a/tox.ini b/tox.ini index 963889e..8e66d75 100644 --- a/tox.ini +++ b/tox.ini @@ -29,3 +29,10 @@ commands = coverage combine coverage report codecov + +[testenv:lint] +deps = + -rrequirements/requirements-lint.txt +commands = + black --exclude=".pyi$" --check src tests + isort --diff --multi-line=3 --trailing-comma --force-grid-wrap=0 --combine-as --line-width 88 --recursive --check-only src tests
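For reference, a minimal round-trip sketch of the os_spage API that the reformatted tests above exercise (and that the new `lint` tox environment now checks). File names, URLs, and header values below are illustrative only, not part of the patch.

from os_spage import open_file

# Write one record to a rolling spage file (SpageWriter defaults: roll_size="1G", compress=True).
writer = open_file("test_file_", "w", roll_size="1G", compress=True)
writer.write(
    "http://www.example.com/0",               # url
    inner_header={"batchID": "test"},         # validated against INNER_HEADER_SCHEMA; missing keys get schema defaults
    http_header={"Content-Type": "text/html"},
    data=b"hello",                            # must be bytes or None
    flush=True,
)
writer.close()

# Read the records back; each record is a dict keyed by SpageKeys (url, inner_header, http_header, data).
reader = open_file("test_file_", "r", page_type="spage")
for record in reader.read():
    print(record["url"])
reader.close()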