Skip to content

Commit

Permalink
feat: get_spider_list() (used by Schedule and ListSpiders) supports t…
Browse files Browse the repository at this point in the history
…he [settings] section. Add Root._config (hack) #526. chore: Root has the correct default for runner, if it were somehow not configured.
  • Loading branch information
jpmckinney committed Jul 21, 2024
1 parent 41fba51 commit 94e87da
Show file tree
Hide file tree
Showing 15 changed files with 119 additions and 48 deletions.
2 changes: 2 additions & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,8 @@ listspiders.json

Get the spiders in a version of a project.

.. note:: If :ref:`the project is in a Python module rather than a Python egg<config-settings>`, don't set the ``version`` parameter.

Supported request methods
``GET``
Parameters
Expand Down
1 change: 1 addition & 0 deletions docs/news.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ Web UI
API
^^^

- The :ref:`schedule.json` and :ref:`listspiders.json` webservices support Scrapy projects stored as Python modules, using the previously undocumented :ref:`[settings]<config-settings>` section.
- Clarify error messages, for example:

- ``'project' parameter is required``, instead of ``'project'`` (KeyError)
Expand Down
3 changes: 1 addition & 2 deletions integration_tests/test_webservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def test_options(webservice, method):
assert response.headers["Allow"] == f"OPTIONS, HEAD, {method}"


# cancel.json, status.json and listjobs.json will error with "project '%b' not found" on directory traversal attempts.
# Cancel, Status, ListJobs and ListSpiders will error with "project '%b' not found" on directory traversal attempts.
# The egg storage (in get_project_list, called by get_spider_queues, called by QueuePoller, used by these webservices)
# would need to find a project like "../project" (which is impossible with the default eggstorage) to not error.
@pytest.mark.parametrize(
Expand Down Expand Up @@ -76,7 +76,6 @@ def test_project_directory_traversal(webservice, method, params):
("webservice", "method", "params"),
[
("schedule", "post", {"spider": "s"}),
("listspiders", "get", {}),
],
)
def test_project_directory_traversal_runner(webservice, method, params):
Expand Down
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ ignore = [
"DTZ005", # `datetime.datetime.now()` called without a `tz` argument
"DTZ006", # `datetime.datetime.fromtimestamp()` called without a `tz` argument
"DTZ007", # Naive datetime constructed using `datetime.datetime.strptime()` without %z

# https://github.com/scrapy/scrapyd/issues/526
"FIX002",
"SLF001",
]

[tool.ruff.lint.flake8-builtins]
Expand Down
27 changes: 19 additions & 8 deletions scrapyd/webservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def wrapper(self, txrequest, *args, **kwargs):
return decorator


def get_spider_list(project, runner=None, pythonpath=None, version=None):
def get_spider_list(project, runner=None, pythonpath=None, version=None, config=None):
"""Return the spider list from the given project, using the given runner"""

# UtilsCache uses JsonSqliteDict, which encodes the project's value as JSON, but JSON allows only string keys,
Expand All @@ -72,16 +72,24 @@ def get_spider_list(project, runner=None, pythonpath=None, version=None):
except KeyError:
pass

settings = {} if config is None else dict(config.items("settings", default=[]))

# Callers should always pass a runner; fall back to the configured (or default) runner otherwise.
if runner is None:
runner = Config().get("runner")
runner = Config().get("runner", "scrapyd.runner")

env = os.environ.copy()
env["PYTHONIOENCODING"] = "UTF-8"
env["SCRAPY_PROJECT"] = project
# TODO(jpmckinney): Remove
# https://github.com/scrapy/scrapyd/commit/17520a32d19726dc4b09611ff732a9ff3fa8b6ea
if pythonpath:
env["PYTHONPATH"] = pythonpath
if version:
env["SCRAPYD_EGG_VERSION"] = version
if project in dict(settings):
env["SCRAPY_SETTINGS_MODULE"] = settings[project]

pargs = [sys.executable, "-m", runner, "list", "-s", "LOG_STDOUT=0"]
proc = Popen(pargs, stdout=PIPE, stderr=PIPE, env=env)
out, err = proc.communicate()
Expand Down Expand Up @@ -196,7 +204,7 @@ def render_POST(self, txrequest, project, spider, version, jobid, priority, sett
raise error.Error(code=http.OK, message=b"version '%b' not found" % version.encode())
raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode())

spiders = get_spider_list(project, version=version, runner=self.root.runner)
spiders = get_spider_list(project, version=version, runner=self.root.runner, config=self.root._config)
if spider not in spiders:
raise error.Error(code=http.OK, message=b"spider '%b' not found" % spider.encode())

Expand Down Expand Up @@ -251,9 +259,11 @@ def render_POST(self, txrequest, project, version, egg):
)

self.root.eggstorage.put(BytesIO(egg), project, version)
spiders = get_spider_list(project, version=version, runner=self.root.runner)
self.root.update_projects()

spiders = get_spider_list(project, version=version, runner=self.root.runner, config=self.root._config)
UtilsCache.invalid_cache(project)

return {
"node_name": self.root.nodename,
"status": "ok",
Expand All @@ -280,12 +290,13 @@ class ListSpiders(WsResource):
@param("project")
@param("_version", dest="version", required=False, default=None)
def render_GET(self, txrequest, project, version):
if self.root.eggstorage.get(project, version) == (None, None):
if version:
raise error.Error(code=http.OK, message=b"version '%b' not found" % version.encode())
if project not in self.root.scheduler.queues:
raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode())

spiders = get_spider_list(project, version=version, runner=self.root.runner)
if version and self.root.eggstorage.get(project, version) == (None, None):
raise error.Error(code=http.OK, message=b"version '%b' not found" % version.encode())

spiders = get_spider_list(project, version=version, runner=self.root.runner, config=self.root._config)

return {"node_name": self.root.nodename, "status": "ok", "spiders": spiders}

Expand Down
25 changes: 15 additions & 10 deletions scrapyd/website.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,22 +128,27 @@ def _getFilesAndDirectories(self, directory):
class Root(resource.Resource):
def __init__(self, config, app):
resource.Resource.__init__(self)

logs_dir = config.get("logs_dir")
items_dir = config.get("items_dir")

self.app = app
# TODO(jpmckinney): Make Config a Component
# https://github.com/scrapy/scrapyd/issues/526
self._config = config
self.debug = config.getboolean("debug", False)
self.runner = config.get("runner")
self.runner = config.get("runner", "scrapyd.runner")
self.prefix_header = config.get("prefix_header")
logsdir = config.get("logs_dir")
itemsdir = config.get("items_dir")
self.local_items = itemsdir and (urlparse(itemsdir).scheme.lower() in ["", "file"])
self.app = app
self.local_items = items_dir and (urlparse(items_dir).scheme.lower() in ["", "file"])
self.nodename = config.get("node_name", socket.gethostname())

self.putChild(b"", Home(self, self.local_items))
if logsdir:
self.putChild(b"logs", File(logsdir.encode("ascii", "ignore"), "text/plain"))
if logs_dir:
self.putChild(b"logs", File(logs_dir.encode("ascii", "ignore"), "text/plain"))
if self.local_items:
self.putChild(b"items", File(itemsdir, "text/plain"))
self.putChild(b"items", File(items_dir, "text/plain"))
self.putChild(b"jobs", Jobs(self, self.local_items))
services = config.items("services", ())
for service_name, service_path in services:
for service_name, service_path in config.items("services", default=[]):
service_cls = load_object(service_path)
self.putChild(service_name.encode(), service_cls(self))

Expand Down
5 changes: 5 additions & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,10 @@ def get_egg_data(basename):
return pkgutil.get_data("tests", f"fixtures/{basename}.egg")


def has_settings(root):
# https://github.com/scrapy/scrapyd/issues/526
return root._config.cp.has_section("settings")


def root_add_version(root, project, version, basename):
root.eggstorage.put(io.BytesIO(get_egg_data(basename)), project, version)
14 changes: 11 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import os.path
import shutil

import pytest
from twisted.web import http
from twisted.web.http import Request
Expand All @@ -8,6 +11,8 @@
from scrapyd.website import Root
from tests import root_add_version

BASEDIR = os.path.abspath(os.path.dirname(__file__))


@pytest.fixture()
def txrequest():
Expand All @@ -19,22 +24,25 @@ def txrequest():
# Use this fixture when testing the Scrapyd web UI or API or writing configuration files.
@pytest.fixture()
def chdir(monkeypatch, tmpdir):
return monkeypatch.chdir(tmpdir)
monkeypatch.chdir(tmpdir)
return tmpdir


@pytest.fixture(
params=[
None,
(Config.SECTION, "items_dir", "items"),
("settings", "localproject", "tests.fixtures.localbot.settings"),
("settings", "localproject", "localproject.settings"),
],
ids=["default", "items_dir", "settings"],
)
def root(request, chdir):
config = Config()
if request.param:
if request.param[0] != Config.SECTION:
if request.param[0] == "settings":
config.cp.add_section(request.param[0])
# Copy the fixture project files into the working directory, so that they are on the Python path.
shutil.copytree(os.path.join(BASEDIR, "fixtures", "filesystem"), os.path.join(chdir), dirs_exist_ok=True)
config.cp.set(*request.param)

return Root(config, application(config))
Expand Down
File renamed without changes.
Loading

0 comments on commit 94e87da

Please sign in to comment.