Skip to content

Commit

Permalink
chore(refactor): Move utility functions into their callers, for bette…
Browse files Browse the repository at this point in the history
…r logical organization
  • Loading branch information
jpmckinney committed Jul 19, 2024
1 parent 276fab5 commit b1854fd
Show file tree
Hide file tree
Showing 12 changed files with 291 additions and 293 deletions.
11 changes: 11 additions & 0 deletions docs/news.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,17 @@ CLI
- Remove all ``twistd`` subcommands (FTP servers, etc.). Run ``twistd``, if needed.
- Run the ``scrapyd.__main__`` module, instead of the ``scrapyd.scripts.scrapyd_run`` module.

Utils
^^^^^

Move functions from ``scrapyd.utils`` into their callers:

- ``sorted_versions`` to ``scrapyd.eggstorage``
- ``get_crawl_args`` to ``scrapyd.launcher``
- ``JsonResource``, ``get_spider_list`` and ``UtilsCache`` to ``scrapyd.webservice``

Move ``activate_egg`` from ``scrapyd.eggutils`` to ``scrapyd.runner``

Fixed
~~~~~

Expand Down
9 changes: 8 additions & 1 deletion scrapyd/eggstorage.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,18 @@
import shutil
from glob import escape, glob

from packaging.version import InvalidVersion, Version
from zope.interface import implementer

from scrapyd.exceptions import DirectoryTraversalError, EggNotFoundError, ProjectNotFoundError
from scrapyd.interfaces import IEggStorage
from scrapyd.utils import sorted_versions


def sorted_versions(versions):
    """Return a new list with ``versions`` in ascending order.

    The items are ordered by their ``packaging.version.Version`` value. If ``Version`` rejects any item with
    ``InvalidVersion``, the items are sorted by their own natural ordering instead.
    """
    try:
        ordered = sorted(versions, key=Version)
    except InvalidVersion:
        ordered = sorted(versions)
    return ordered


@implementer(IEggStorage)
Expand Down
22 changes: 0 additions & 22 deletions scrapyd/eggutils.py

This file was deleted.

19 changes: 18 additions & 1 deletion scrapyd/launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,24 @@

from scrapyd import __version__
from scrapyd.interfaces import IEnvironment, IJobStorage, IPoller
from scrapyd.utils import get_crawl_args, native_stringify_dict
from scrapyd.utils import native_stringify_dict, to_native_str


def get_crawl_args(message):
    """Build the argument list for the ``scrapy crawl`` process started for ``message``.

    The result starts with the spider name (``message["_spider"]``). Every remaining key except ``_project``
    and ``settings`` follows as a ``-a key=value`` pair, and every entry of ``message["settings"]`` (if any)
    follows as a ``-s key=value`` pair. ``message`` itself is not modified.
    """
    remaining = message.copy()
    args = [to_native_str(remaining["_spider"])]
    del remaining["_project"]
    del remaining["_spider"]
    settings = remaining.pop("settings", {})

    # Spider arguments come first, then settings overrides.
    for flag, options in (("-a", remaining), ("-s", settings)):
        for name, value in native_stringify_dict(options, keys_only=False).items():
            args.extend((flag, f"{name}={value}"))
    return args


class Launcher(Service):
Expand Down
20 changes: 19 additions & 1 deletion scrapyd/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,28 @@
import tempfile
from contextlib import contextmanager

import pkg_resources
from scrapy.utils.misc import load_object

from scrapyd import Config
from scrapyd.eggutils import activate_egg
from scrapyd.exceptions import BadEggError


def activate_egg(eggpath):
    """Activate the Scrapy egg file at ``eggpath``.

    This is meant to be called from egg runners only. Don't use it from other code, as it may leave unwanted
    side effects: it activates the egg's distribution and may set ``SCRAPY_SETTINGS_MODULE`` in the environment.

    :raises BadEggError: if no distribution can be obtained from ``eggpath``
    """
    found = pkg_resources.find_distributions(eggpath)
    # A tuple result (instead of an iterator) is treated as "no usable egg".
    if isinstance(found, tuple):
        raise BadEggError

    try:
        distribution = next(found)
    except StopIteration:
        raise BadEggError from None

    distribution.activate()
    entry_point = distribution.get_entry_info("scrapy", "settings")
    # setdefault() keeps any value that is already present in the environment.
    os.environ.setdefault("SCRAPY_SETTINGS_MODULE", entry_point.module_name)


@contextmanager
Expand Down
129 changes: 4 additions & 125 deletions scrapyd/utils.py
Original file line number Diff line number Diff line change
@@ -1,60 +1,7 @@
import json
import os
import sys
from subprocess import PIPE, Popen
from typing import ClassVar
from urllib.parse import urlsplit

from packaging.version import InvalidVersion, Version
from scrapy.utils.misc import load_object
from twisted.web import resource

from scrapyd.config import Config
from scrapyd.exceptions import RunnerError
from scrapyd.sqlite import JsonSqliteDict


class JsonResource(resource.Resource):
    """A Twisted web resource whose rendered result is serialized as JSON."""

    # Encoder instance shared by all instances; used by encode_object().
    json_encoder = json.JSONEncoder()

    def render(self, txrequest):
        """Render the request with the base ``Resource`` and return the result passed through ``encode_object``."""
        rendered = resource.Resource.render(self, txrequest)
        return self.encode_object(rendered, txrequest)

    def encode_object(self, obj, txrequest):
        """Return ``obj`` as a newline-terminated JSON string, or an empty string if ``obj`` is ``None``.

        Also sets the content type, CORS and content length headers on ``txrequest``.
        """
        if obj is None:
            body = ""
        else:
            body = self.json_encoder.encode(obj) + "\n"

        response_headers = (
            ("Content-Type", "application/json"),
            ("Access-Control-Allow-Origin", "*"),
            ("Access-Control-Allow-Methods", "GET, POST, PATCH, PUT, DELETE"),
            ("Access-Control-Allow-Headers", " X-Requested-With"),
            ("Content-Length", str(len(body))),
        )
        for header, value in response_headers:
            txrequest.setHeader(header, value)
        return body


class UtilsCache:
    """Dict-like cache stored in a ``JsonSqliteDict``, with deferred per-project invalidation."""

    # Names of the projects whose cached entries must be dropped. The list is shared by all instances and is
    # processed (then emptied) the next time any instance is read.
    invalid_cached_projects: ClassVar = []

    def __init__(self):
        self.cache_manager = JsonSqliteDict(table="utils_cache_manager")

    @staticmethod
    def invalid_cache(project):
        """Mark the cached entry of the given project (by name) as invalid."""
        UtilsCache.invalid_cached_projects.append(project)

    def __getitem__(self, key):
        pending = UtilsCache.invalid_cached_projects
        # Drop the entries of all invalidated projects before serving the read.
        for project in pending:
            if project in self.cache_manager:
                del self.cache_manager[project]
        pending.clear()
        return self.cache_manager[key]

    def __setitem__(self, key, value):
        self.cache_manager[key] = value

    def __repr__(self):
        return f"UtilsCache(cache_manager={self.cache_manager!r})"


def get_spider_queues(config):
Expand Down Expand Up @@ -96,88 +43,20 @@ def native_stringify_dict(dct_or_tuples, encoding="utf-8", *, keys_only=True):
"""
d = {}
for k, v in dct_or_tuples.items():
key = _to_native_str(k, encoding)
key = to_native_str(k, encoding)
if keys_only:
value = v
elif isinstance(v, dict):
value = native_stringify_dict(v, encoding=encoding, keys_only=keys_only)
elif isinstance(v, list):
value = [_to_native_str(e, encoding) for e in v]
value = [to_native_str(e, encoding) for e in v]
else:
value = _to_native_str(v, encoding)
value = to_native_str(v, encoding)
d[key] = value
return d


def get_crawl_args(message):
    """Build the argument list for the ``scrapy crawl`` process started for ``message``.

    The result starts with the spider name (``message["_spider"]``). Every remaining key except ``_project``
    and ``settings`` follows as a ``-a key=value`` pair, and every entry of ``message["settings"]`` (if any)
    follows as a ``-s key=value`` pair. ``message`` itself is not modified.
    """
    remaining = message.copy()
    args = [_to_native_str(remaining["_spider"])]
    del remaining["_project"]
    del remaining["_spider"]
    settings = remaining.pop("settings", {})

    # Spider arguments come first, then settings overrides.
    for flag, options in (("-a", remaining), ("-s", settings)):
        for name, value in native_stringify_dict(options, keys_only=False).items():
            args.extend((flag, f"{name}={value}"))
    return args


def get_spider_list(project, runner=None, pythonpath=None, version=None):
    """Return the spider list from the given project, using the given runner.

    Results are cached per project and version on the function's ``cache`` attribute (a ``UtilsCache``, created
    on first use). On a cache miss, ``<python> -m <runner> list -s LOG_STDOUT=0`` is run in a subprocess, and
    each line of its standard output is returned as one spider name.

    :param project: the project name, passed to the subprocess as ``SCRAPY_PROJECT``
    :param runner: the module to run; if ``None``, the ``runner`` configuration option is used
    :param pythonpath: if truthy, passed to the subprocess as ``PYTHONPATH``
    :param version: if truthy, passed to the subprocess as ``SCRAPYD_EGG_VERSION``
    :raises RunnerError: if the subprocess exits with a non-zero return code
    """

    # UtilsCache uses JsonSqliteDict, which encodes the project's value as JSON, but JSON allows only string keys,
    # so the stored dict will have a "null" key, instead of a None key.
    if version is None:
        version = ""

    if "cache" not in get_spider_list.__dict__:
        get_spider_list.cache = UtilsCache()
    try:
        return get_spider_list.cache[project][version]
    except KeyError:
        pass

    if runner is None:
        runner = Config().get("runner")

    env = os.environ.copy()
    env["PYTHONIOENCODING"] = "UTF-8"
    env["SCRAPY_PROJECT"] = project
    if pythonpath:
        env["PYTHONPATH"] = pythonpath
    if version:
        env["SCRAPYD_EGG_VERSION"] = version
    pargs = [sys.executable, "-m", runner, "list", "-s", "LOG_STDOUT=0"]
    proc = Popen(pargs, stdout=PIPE, stderr=PIPE, env=env)
    out, err = proc.communicate()
    if proc.returncode:
        # communicate() returns bytes here, so the fallback must be bytes as well: with a str fallback, a runner
        # that failed without writing any output raised AttributeError on decode() instead of RunnerError.
        msg = (err or out or b"").decode("utf8")
        raise RunnerError(msg)

    spiders = out.decode("utf-8").splitlines()
    try:
        project_cache = get_spider_list.cache[project]
        project_cache[version] = spiders
    except KeyError:
        project_cache = {version: spiders}
    # Write the dict back, so that the change is stored and not only made on the copy that was read.
    get_spider_list.cache[project] = project_cache

    return spiders


def _to_native_str(text, encoding="utf-8", errors="strict"):
def to_native_str(text, encoding="utf-8", errors="strict"):
    """Return ``text`` unchanged if it is a ``str``; otherwise, return ``text.decode(encoding, errors)``."""
    return text if isinstance(text, str) else text.decode(encoding, errors)


def sorted_versions(versions):
    """Return a new list with ``versions`` in ascending order.

    The items are ordered by their ``packaging.version.Version`` value. If ``Version`` rejects any item with
    ``InvalidVersion``, the items are sorted by their own natural ordering instead.
    """
    try:
        ordered = sorted(versions, key=Version)
    except InvalidVersion:
        ordered = sorted(versions)
    return ordered
99 changes: 96 additions & 3 deletions scrapyd/webservice.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,25 @@
from __future__ import annotations

import functools
import json
import os
import sys
import traceback
import uuid
import zipfile
from copy import copy
from io import BytesIO
from subprocess import PIPE, Popen
from typing import ClassVar

from twisted.python import log
from twisted.web import error, http
from twisted.web import error, http, resource

from scrapyd.exceptions import EggNotFoundError, ProjectNotFoundError
from scrapyd.config import Config
from scrapyd.exceptions import EggNotFoundError, ProjectNotFoundError, RunnerError
from scrapyd.jobstorage import job_items_url, job_log_url
from scrapyd.utils import JsonResource, UtilsCache, get_spider_list, native_stringify_dict
from scrapyd.sqlite import JsonSqliteDict
from scrapyd.utils import native_stringify_dict


def param(
Expand Down Expand Up @@ -52,6 +58,93 @@ def wrapper(self, txrequest, *args, **kwargs):
return decorator


def get_spider_list(project, runner=None, pythonpath=None, version=None):
    """Return the spider list from the given project, using the given runner.

    Results are cached per project and version on the function's ``cache`` attribute (a ``UtilsCache``, created
    on first use). On a cache miss, ``<python> -m <runner> list -s LOG_STDOUT=0`` is run in a subprocess, and
    each line of its standard output is returned as one spider name.

    :param project: the project name, passed to the subprocess as ``SCRAPY_PROJECT``
    :param runner: the module to run; if ``None``, the ``runner`` configuration option is used
    :param pythonpath: if truthy, passed to the subprocess as ``PYTHONPATH``
    :param version: if truthy, passed to the subprocess as ``SCRAPYD_EGG_VERSION``
    :raises RunnerError: if the subprocess exits with a non-zero return code
    """

    # UtilsCache uses JsonSqliteDict, which encodes the project's value as JSON, but JSON allows only string keys,
    # so the stored dict will have a "null" key, instead of a None key.
    if version is None:
        version = ""

    if "cache" not in get_spider_list.__dict__:
        get_spider_list.cache = UtilsCache()
    try:
        return get_spider_list.cache[project][version]
    except KeyError:
        pass

    if runner is None:
        runner = Config().get("runner")

    env = os.environ.copy()
    env["PYTHONIOENCODING"] = "UTF-8"
    env["SCRAPY_PROJECT"] = project
    if pythonpath:
        env["PYTHONPATH"] = pythonpath
    if version:
        env["SCRAPYD_EGG_VERSION"] = version
    pargs = [sys.executable, "-m", runner, "list", "-s", "LOG_STDOUT=0"]
    proc = Popen(pargs, stdout=PIPE, stderr=PIPE, env=env)
    out, err = proc.communicate()
    if proc.returncode:
        # communicate() returns bytes here, so the fallback must be bytes as well: with a str fallback, a runner
        # that failed without writing any output raised AttributeError on decode() instead of RunnerError.
        msg = (err or out or b"").decode("utf8")
        raise RunnerError(msg)

    spiders = out.decode("utf-8").splitlines()
    try:
        project_cache = get_spider_list.cache[project]
        project_cache[version] = spiders
    except KeyError:
        project_cache = {version: spiders}
    # Write the dict back, so that the change is stored and not only made on the copy that was read.
    get_spider_list.cache[project] = project_cache

    return spiders


class UtilsCache:
    """Dict-like cache stored in a ``JsonSqliteDict``, with deferred per-project invalidation."""

    # Names of the projects whose cached entries must be dropped. The list is shared by all instances and is
    # processed (then emptied) the next time any instance is read.
    invalid_cached_projects: ClassVar = []

    def __init__(self):
        self.cache_manager = JsonSqliteDict(table="utils_cache_manager")

    @staticmethod
    def invalid_cache(project):
        """Mark the cached entry of the given project (by name) as invalid."""
        UtilsCache.invalid_cached_projects.append(project)

    def __getitem__(self, key):
        pending = UtilsCache.invalid_cached_projects
        # Drop the entries of all invalidated projects before serving the read.
        for project in pending:
            if project in self.cache_manager:
                del self.cache_manager[project]
        pending.clear()
        return self.cache_manager[key]

    def __setitem__(self, key, value):
        self.cache_manager[key] = value

    def __repr__(self):
        return f"UtilsCache(cache_manager={self.cache_manager!r})"


class JsonResource(resource.Resource):
    """A Twisted web resource whose rendered result is serialized as JSON."""

    # Encoder instance shared by all instances; used by encode_object().
    json_encoder = json.JSONEncoder()

    def render(self, txrequest):
        """Render the request with the base ``Resource`` and return the result passed through ``encode_object``."""
        rendered = resource.Resource.render(self, txrequest)
        return self.encode_object(rendered, txrequest)

    def encode_object(self, obj, txrequest):
        """Return ``obj`` as a newline-terminated JSON string, or an empty string if ``obj`` is ``None``.

        Also sets the content type, CORS and content length headers on ``txrequest``.
        """
        if obj is None:
            body = ""
        else:
            body = self.json_encoder.encode(obj) + "\n"

        response_headers = (
            ("Content-Type", "application/json"),
            ("Access-Control-Allow-Origin", "*"),
            ("Access-Control-Allow-Methods", "GET, POST, PATCH, PUT, DELETE"),
            ("Access-Control-Allow-Headers", " X-Requested-With"),
            ("Content-Length", str(len(body))),
        )
        for header, value in response_headers:
            txrequest.setHeader(header, value)
        return body


class WsResource(JsonResource):
def __init__(self, root):
JsonResource.__init__(self)
Expand Down
Loading

0 comments on commit b1854fd

Please sign in to comment.