Commit 7ae163c

add: lint
cfhamlet committed Sep 24, 2019
1 parent ca8c4ee commit 7ae163c
Showing 16 changed files with 332 additions and 272 deletions.
4 changes: 4 additions & 0 deletions .isort.cfg
@@ -0,0 +1,4 @@
[isort]
known_first_party = os_spage
known_third_party = pytest, jsonschema
skip_glob = *.pyi
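
For reference, with this configuration isort groups imports as standard library first, then third party (pytest, jsonschema), then first party (os_spage). A minimal sketch of the resulting ordering (the module shown is hypothetical, not part of this commit):

# example_module.py -- illustrates the section order the .isort.cfg above produces
import sys                      # standard library

import jsonschema               # known_third_party
import pytest                   # known_third_party

from os_spage import open_file  # known_first_party
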
2 changes: 1 addition & 1 deletion .travis.yml
@@ -5,7 +5,7 @@ matrix:
- python: 2.7
env: TOXENV=py27,codecov
- python: 3.6
env: TOXENV=py36,codecov
env: TOXENV=py36,codecov,lint
- python: pypy
env: TOXENV=pypy
- python: pypy3
3 changes: 3 additions & 0 deletions requirements/requirements-lint.txt
@@ -0,0 +1,3 @@
autoflake
black
isort
15 changes: 15 additions & 0 deletions script/lint.sh
@@ -0,0 +1,15 @@
#!/bin/sh -e


export PREFIX=""
if [ -d 'venv' ] ; then
export PREFIX="venv/bin/"
fi

set -x

pip install -r requirements/requirements-lint.txt

${PREFIX}autoflake --in-place --recursive --remove-all-unused-imports --remove-unused-variables src tests
${PREFIX}black --exclude=".pyi$" src tests
${PREFIX}isort --multi-line=3 --trailing-comma --force-grid-wrap=0 --combine-as --line-width 88 --recursive --apply src tests
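
script/lint.sh is meant to be run from the repository root (for example, sh script/lint.sh). A rough Python equivalent is sketched below; it is hypothetical, not part of this commit, and simply replays the same tool invocations for environments without a POSIX shell:

# lint.py (hypothetical) -- mirrors script/lint.sh using the same flags
import subprocess
import sys

TARGETS = ["src", "tests"]
COMMANDS = [
    ["autoflake", "--in-place", "--recursive",
     "--remove-all-unused-imports", "--remove-unused-variables"] + TARGETS,
    ["black", "--exclude", ".pyi$"] + TARGETS,
    ["isort", "--multi-line=3", "--trailing-comma", "--force-grid-wrap=0",
     "--combine-as", "--line-width", "88", "--recursive", "--apply"] + TARGETS,
]

for cmd in COMMANDS:
    rc = subprocess.call(cmd)   # works on Python 2.7 and 3.x
    if rc != 0:
        sys.exit(rc)
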
31 changes: 17 additions & 14 deletions src/os_spage/__init__.py
@@ -1,8 +1,11 @@
import pkgutil
import sys
from .spage_reader import SpageReader, read as read_spage
from .spage_writer import SpageWriter, write

from .offpage_reader import OffpageReader, read as read_offpage
from .spage_reader import SpageReader, read as read_spage
from .spage_writer import SpageWriter, write as _write

write = _write


def __not_supported_mode(name, **kwargs):
@@ -13,27 +16,27 @@ def __not_supported_page_type(name, **kwargs):
raise ValueError("page_type must be 'spage' or 'offpage'")


def read(s, page_type='spage'):
r = {'spage': read_spage, 'offpage': read_offpage}.get(
page_type, __not_supported_page_type)
def read(s, page_type="spage"):
r = {"spage": read_spage, "offpage": read_offpage}.get(
page_type, __not_supported_page_type
)
return r(s)


def open_file(name, mode, **kwargs):
r = {'w': SpageWriter,
'r': {'spage': SpageReader,
'offpage': OffpageReader}}.get(mode, __not_supported_mode)
if mode == 'r':
r = r.get(kwargs.pop('page_type', 'spage'), __not_supported_page_type)
r = {"w": SpageWriter, "r": {"spage": SpageReader, "offpage": OffpageReader}}.get(
mode, __not_supported_mode
)
if mode == "r":
r = r.get(kwargs.pop("page_type", "spage"), __not_supported_page_type)

return r(name, **kwargs)


__all__ = ['__version__', 'version_info', 'open_file']
__all__ = ["__version__", "version_info", "open_file"]

__version__ = pkgutil.get_data(__package__, 'VERSION').decode('ascii').strip()
version_info = tuple(int(v) if v.isdigit() else v
for v in __version__.split('.'))
__version__ = pkgutil.get_data(__package__, "VERSION").decode("ascii").strip()
version_info = tuple(int(v) if v.isdigit() else v for v in __version__.split("."))

if sys.version_info < (2, 7):
sys.exit("os-spage %s requires Python 2.7+" % __version__)
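
The reformatting and the explicit write = _write alias do not change the public API: open_file(name, mode, **kwargs) still returns a SpageWriter for mode "w" and a SpageReader or OffpageReader for mode "r", depending on page_type. A minimal usage sketch (the file name is an illustrative assumption, not taken from this diff):

from os_spage import open_file

writer = open_file("example.page", "w")                         # SpageWriter
reader = open_file("example.page", "r", page_type="spage")      # SpageReader (the default)
offpage = open_file("example.page", "r", page_type="offpage")   # OffpageReader
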
3 changes: 1 addition & 2 deletions src/os_spage/common.py
@@ -1,5 +1,4 @@

DEFAULT_ENCODING = 'utf-8'
DEFAULT_ENCODING = "utf-8"

COLON = ":"

10 changes: 7 additions & 3 deletions src/os_spage/compat.py
@@ -1,12 +1,16 @@
import sys
import operator
import sys

PY3 = sys.version_info[0] >= 3

if PY3:
from io import StringIO
from io import StringIO as _StringIO

iteritems = operator.methodcaller("items")

else:
from StringIO import StringIO
from StringIO import StringIO as _StringIO

iteritems = operator.methodcaller("iteritems")

StringIO = _StringIO
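
compat keeps the Python 2/3 differences in one place: StringIO is re-exported through a single name and iteritems is an operator.methodcaller, so callers use the same spelling on both interpreters. A small usage sketch:

from os_spage.compat import StringIO, iteritems

buf = StringIO()                              # io.StringIO on Py3, StringIO.StringIO on Py2
headers = {"Content-Type": "text/html"}
for key, value in iteritems(headers):         # dict.items() on Py3, dict.iteritems() on Py2
    buf.write(u"%s: %s\n" % (key, value))
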
203 changes: 104 additions & 99 deletions src/os_spage/default_schema.py
@@ -3,127 +3,132 @@


class RecordTypes(object):
FLAT = 'flat'
DELETED = 'deleted'
COMPRESSED = 'compressed'
FLAT = "flat"
DELETED = "deleted"
COMPRESSED = "compressed"


class InnerHeaderKeys(object):
VERSION = 'Version'
TYPE = 'Type'
FETCH_TIME = 'Fetch-Time'
ORIGINAL_SIZE = 'Original-Size'
STORE_SIZE = 'Store-Size'
BATCH_ID = 'batchID'
ATTACH = 'attach'
IP_ADDRESS = 'IP-Address'
SPIDER_ADDRESS = 'Spider-Address'
DIGEST = 'Digest'
USER_AGENT = 'User-Agent'
FETCH_IP = 'Fetch-IP'
NODE_FETCH_TIME = 'Node-Fetch-Time'
ERROR_REASON = 'Error-Reason'
VERSION = "Version"
TYPE = "Type"
FETCH_TIME = "Fetch-Time"
ORIGINAL_SIZE = "Original-Size"
STORE_SIZE = "Store-Size"
BATCH_ID = "batchID"
ATTACH = "attach"
IP_ADDRESS = "IP-Address"
SPIDER_ADDRESS = "Spider-Address"
DIGEST = "Digest"
USER_AGENT = "User-Agent"
FETCH_IP = "Fetch-IP"
NODE_FETCH_TIME = "Node-Fetch-Time"
ERROR_REASON = "Error-Reason"


INNER_HEADER_SCHEMA = {
"type": "object",
"properties": OrderedDict([
(InnerHeaderKeys.VERSION, {
"type": "string",
"default": "1.2",
}),
(InnerHeaderKeys.TYPE, { # autofill or specify
"type": "string",
"enum": set([getattr(RecordTypes, i) for i in dir(RecordTypes) if not i.startswith('_')]) ,
}),
(InnerHeaderKeys.FETCH_TIME, { # record store time
"anyOf": [
{
"type": "datetime",
"properties": OrderedDict(
[
(InnerHeaderKeys.VERSION, {"type": "string", "default": "1.2"}),
(
InnerHeaderKeys.TYPE,
{ # autofill or specify
"type": "string",
"enum": set(
[
getattr(RecordTypes, i)
for i in dir(RecordTypes)
if not i.startswith("_")
]
),
},
),
(
InnerHeaderKeys.FETCH_TIME,
{ # record store time
"anyOf": [
{"type": "datetime"},
{"type": "string", "format": "readable_time"},
],
"default": datetime.now,
},
{
),
(
InnerHeaderKeys.ORIGINAL_SIZE,
{"type": "number"}, # data(html) size, autofill
),
(InnerHeaderKeys.STORE_SIZE, {"type": "number"}), # store size, autofill
(
InnerHeaderKeys.BATCH_ID,
{ # batch identity
"type": "string",
"format": "readable_time"
"minLength": 3,
"default": "__CHANGE_ME__",
},
],
"default": datetime.now,
}),
(InnerHeaderKeys.ORIGINAL_SIZE, { # data(html) size, autofill
"type": "number",
}),
(InnerHeaderKeys.STORE_SIZE, { # store size, autofill
"type": "number",
}),
(InnerHeaderKeys.BATCH_ID, { # batch identity
"type": "string",
"minLength": 3,
"default": '__CHANGE_ME__',
}),
(InnerHeaderKeys.ATTACH, {
"type": "string",
}),
(InnerHeaderKeys.IP_ADDRESS, { # remote host ip
"type": "string",
"format": "ipv4",
"default": "0.0.0.0",
}),
(InnerHeaderKeys.SPIDER_ADDRESS, { # spider node identity
"type": "string",
"default": "0.0.0.0",
}),
(InnerHeaderKeys.DIGEST, { # can be html md5
"type": "string",
"default": "0" * 32,
"maxLength": 32,
"minLength": 32,
}),
(InnerHeaderKeys.USER_AGENT, {
"type": "string",
}),
(InnerHeaderKeys.FETCH_IP, { # generate page machine ip
"type": "string",
"format": "ipv4",
"default": "0.0.0.0",
}),
(InnerHeaderKeys.NODE_FETCH_TIME, { # real fetch time
"anyOf": [
{
"type": "datetime",
),
(InnerHeaderKeys.ATTACH, {"type": "string"}),
(
InnerHeaderKeys.IP_ADDRESS,
{ # remote host ip
"type": "string",
"format": "ipv4",
"default": "0.0.0.0",
},
{
),
(
InnerHeaderKeys.SPIDER_ADDRESS,
{"type": "string", "default": "0.0.0.0"}, # spider node identity
),
(
InnerHeaderKeys.DIGEST,
{ # can be html md5
"type": "string",
"format": "readable_time"
"default": "0" * 32,
"maxLength": 32,
"minLength": 32,
},
]
}),
(InnerHeaderKeys.ERROR_REASON, {
"type": "string",
"format": "error_reaseon",
}),
]),
),
(InnerHeaderKeys.USER_AGENT, {"type": "string"}),
(
InnerHeaderKeys.FETCH_IP,
{ # generate page machine ip
"type": "string",
"format": "ipv4",
"default": "0.0.0.0",
},
),
(
InnerHeaderKeys.NODE_FETCH_TIME,
{ # real fetch time
"anyOf": [
{"type": "datetime"},
{"type": "string", "format": "readable_time"},
]
},
),
(
InnerHeaderKeys.ERROR_REASON,
{"type": "string", "format": "error_reaseon"},
),
]
),
}


class SpageKeys(object):
URL = 'url'
INNER_HEADER = 'inner_header'
HTTP_HEADER = 'http_header'
DATA = 'data'
URL = "url"
INNER_HEADER = "inner_header"
HTTP_HEADER = "http_header"
DATA = "data"


META_SCHEMA = {
"type": "object",
"properties": {
SpageKeys.URL: {
"type": "string",
"format": "url",
},
SpageKeys.URL: {"type": "string", "format": "url"},
SpageKeys.INNER_HEADER: INNER_HEADER_SCHEMA,
SpageKeys.HTTP_HEADER: {
"type": "object"
},
SpageKeys.DATA: {
"type": "bytes",
},
SpageKeys.HTTP_HEADER: {"type": "object"},
SpageKeys.DATA: {"type": "bytes"},
},
"required": [SpageKeys.URL, SpageKeys.INNER_HEADER],
}
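
The reformatting above does not change the schema itself: META_SCHEMA still describes an object whose required members are url and inner_header, with http_header and data optional. A sketch of a record shaped to match it (values are illustrative assumptions; the non-standard "datetime" and "bytes" types and the "readable_time"/"url" formats presumably rely on the package's own validator rather than plain jsonschema):

from datetime import datetime

record = {
    "url": "http://www.example.com/",      # required, format "url"
    "inner_header": {                      # required, see INNER_HEADER_SCHEMA
        "Version": "1.2",
        "Type": "flat",                    # one of RecordTypes: flat / deleted / compressed
        "Fetch-Time": datetime.now(),
        "batchID": "example-batch",        # minLength 3
        "IP-Address": "0.0.0.0",
        "Digest": "0" * 32,                # exactly 32 characters
    },
    "http_header": {"Content-Type": "text/html"},
    "data": b"<html></html>",
}
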
