Skip to content

Commit

Permalink
feat: Clarify error messages if project or version of egg not found, c…
Browse files Browse the repository at this point in the history
…loses #517
  • Loading branch information
jpmckinney committed Jul 18, 2024
1 parent 54e8b25 commit 88a4e45
Show file tree
Hide file tree
Showing 7 changed files with 209 additions and 61 deletions.
8 changes: 5 additions & 3 deletions docs/news.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,10 @@ API

- Clarify error messages, for example:

- ``'project' parameter is required``, instead of ``'project'``
- ``project 'myproject' not found``, instead of ``'myproject'``
- ``'project' parameter is required``, instead of ``'project'`` (KeyError)
- ``project 'myproject' not found``, instead of ``'myproject'`` (KeyError)
- ``project 'myproject' not found``, instead of ``Scrapy VERSION - no active project``
- ``version 'myversion' not found``, instead of a traceback
- ``exception class: message``, instead of ``message``
- ``ValueError: Unknown or corrupt egg``, instead of ``TypeError: 'tuple' object is not an iterator``
- Unsupported method error messages no longer list ``object`` as an allowed HTTP method
Expand All @@ -65,7 +67,7 @@ Security
^^^^^^^^

- The ``FilesystemEggStorage`` class used by the :ref:`listversions.json` webservice escapes project names (used in glob patterns) before globbing, to disallow listing arbitrary directories.
- The ``FilesystemEggStorage`` class used by the :ref:`runner` and the :ref:`addversion.json`, :ref:`listversions.json`, :ref:`delversion.json` and :ref:`delproject.json` webservices raises a ``DirectoryTraversalError`` error if the project (used in file paths) would traverse directories.
- The ``FilesystemEggStorage`` class used by the :ref:`runner` and the :ref:`addversion.json`, :ref:`listversions.json`, :ref:`delversion.json` and :ref:`delproject.json` webservices raises a ``DirectoryTraversalError`` error if the project parameter (used in file paths) would traverse directories.
- The ``Environment`` class used by the :ref:`launcher` raises a ``DirectoryTraversalError`` error if the project, spider or job parameters (used in file paths) would traverse directories.
- The :ref:`webui` escapes user input (project names, spider names, and job IDs) to prevent cross-site scripting (XSS).

Expand Down
37 changes: 6 additions & 31 deletions integration_tests/test_webservice.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import os.path
import sys
from pathlib import Path

import pytest
import requests
import scrapy

from integration_tests import req

Expand All @@ -17,8 +15,6 @@ def assert_webservice(method, path, expected, **kwargs):
response = req(method, path, **kwargs)
data = response.json()
data.pop("node_name")
if "message" in expected:
expected["message"] = expected["message"].replace("\n", os.linesep)

assert data == expected

Expand Down Expand Up @@ -92,12 +88,9 @@ def test_project_directory_traversal_runner(webservice, method, params):

data = response.json()
data.pop("node_name")
message = data.pop("message")

assert response.status_code == 200, f"200 != {response.status_code}"
assert data == {"status": "error"}
assert message.startswith("RunnerError: Traceback (most recent call last):"), message
assert message.endswith(f"scrapyd.exceptions.DirectoryTraversalError: ../p{os.linesep}"), message
assert data == {"status": "error", "message": "DirectoryTraversalError: ../p"}


def test_daemonstatus():
Expand All @@ -114,11 +107,7 @@ def test_schedule():
"/schedule.json",
{
"status": "error",
"message": (
f'RunnerError: Scrapy {scrapy.__version__} - no active project\n\n'
'Unknown command: list\n\n'
'Use "scrapy" to see available commands\n'
),
"message": "project 'nonexistent' not found",
},
data={"project": "nonexistent", "spider": "nospider"},
)
Expand Down Expand Up @@ -174,11 +163,7 @@ def test_listspiders_nonexistent_project():
"/listspiders.json",
{
"status": "error",
"message": (
f'RunnerError: Scrapy {scrapy.__version__} - no active project\n\n'
'Unknown command: list\n\n'
'Use "scrapy" to see available commands\n'
),
"message": "project 'nonexistent' not found",
},
params={"project": "nonexistent"},
)
Expand Down Expand Up @@ -207,14 +192,9 @@ def test_delversion_nonexistent_project():
"/delversion.json",
{
"status": "error",
"message": "FileNotFoundError: " + (
f"[Errno 2] No such file or directory: '{BASEDIR}/eggs/nonexistent/noegg.egg'"
if sys.platform != "win32" else
"[WinError 3] The system cannot find the path specified: "
f"'{BASEDIR}\\\\eggs\\\\nonexistent\\\\noegg.egg'"
),
"message": "version 'nonexistent' not found",
},
data={"project": "nonexistent", "version": "noegg"},
data={"project": "sample", "version": "nonexistent"},
)


Expand All @@ -224,12 +204,7 @@ def test_delproject_nonexistent_project():
"/delproject.json",
{
"status": "error",
"message": "FileNotFoundError: " + (
f"[Errno 2] No such file or directory: '{BASEDIR}/eggs/nonexistent'"
if sys.platform != "win32" else
"[WinError 3] The system cannot find the path specified: "
f"'{BASEDIR}\\\\eggs\\\\nonexistent'"
),
"message": "project 'nonexistent' not found",
},
data={"project": "nonexistent"},
)
21 changes: 15 additions & 6 deletions scrapyd/eggstorage.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from zope.interface import implementer

from scrapyd.exceptions import DirectoryTraversalError
from scrapyd.exceptions import DirectoryTraversalError, EggNotFoundError, ProjectNotFoundError
from scrapyd.interfaces import IEggStorage
from scrapyd.utils import sorted_versions

Expand All @@ -32,7 +32,10 @@ def get(self, project, version=None):
version = self.list(project)[-1]
except IndexError:
return None, None
return version, open(self._egg_path(project, version), 'rb')
try:
return version, open(self._egg_path(project, version), 'rb')
except FileNotFoundError:
return None, None

def list(self, project):
return sorted_versions(
Expand All @@ -46,11 +49,17 @@ def list_projects(self):

def delete(self, project, version=None):
if version is None:
shutil.rmtree(self._get_path(project))
try:
shutil.rmtree(self._get_path(project))
except FileNotFoundError as e:
raise ProjectNotFoundError from e
else:
os.remove(self._egg_path(project, version))
if not self.list(project): # remove project if no versions left
self.delete(project)
try:
os.remove(self._egg_path(project, version))
if not self.list(project): # remove project if no versions left
self.delete(project)
except FileNotFoundError as e:
raise EggNotFoundError from e

def _egg_path(self, project, version):
sanitized_version = re.sub(r'[^A-Za-z0-9_-]', '_', version)
Expand Down
8 changes: 8 additions & 0 deletions scrapyd/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,13 @@ class DirectoryTraversalError(ScrapydError):
"""Raised if the resolved path is outside the expected directory"""


class ProjectNotFoundError(ScrapydError):
"""Raised if a project isn't found in an IEggStorage implementation"""


class EggNotFoundError(ScrapydError):
"""Raised if an egg isn't found in an IEggStorage implementation"""


class RunnerError(ScrapydError):
"""Raised if the runner returns an error code"""
2 changes: 1 addition & 1 deletion scrapyd/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def get_crawl_args(message):
return args


def get_spider_list(project, runner=None, pythonpath=None, version=''):
def get_spider_list(project, runner=None, pythonpath=None, version=None):
"""Return the spider list from the given project, using the given runner"""
if "cache" not in get_spider_list.__dict__:
get_spider_list.cache = UtilsCache()
Expand Down
37 changes: 27 additions & 10 deletions scrapyd/webservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from twisted.python import log
from twisted.web import error, http

from scrapyd.exceptions import EggNotFoundError, ProjectNotFoundError
from scrapyd.jobstorage import job_items_url, job_log_url
from scrapyd.utils import JsonResource, UtilsCache, get_spider_list, native_stringify_dict

Expand Down Expand Up @@ -101,16 +102,23 @@ def render_GET(self, txrequest):
class Schedule(WsResource):
@param('project')
@param('spider')
@param('_version', dest='version', required=False, default='')
@param('_version', dest='version', required=False, default=None)
# See https://github.com/scrapy/scrapyd/pull/215
@param('jobid', required=False, default=lambda: uuid.uuid1().hex)
@param('priority', required=False, default=0, type=float)
@param('setting', required=False, default=list, multiple=True)
def render_POST(self, txrequest, project, spider, version, jobid, priority, setting):
spider_arguments = {k: v[0] for k, v in native_stringify_dict(copy(txrequest.args), keys_only=False).items()}
spiders = get_spider_list(project, version=version)
if self.root.eggstorage.get(project, version) == (None, None):
if version:
raise error.Error(code=http.OK, message=b"version '%b' not found" % version.encode())
raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode())

spiders = get_spider_list(project, version=version, runner=self.root.runner)
if spider not in spiders:
raise error.Error(code=http.OK, message=b"spider '%b' not found" % spider.encode())

spider_arguments = {k: v[0] for k, v in native_stringify_dict(copy(txrequest.args), keys_only=False).items()}

self.root.scheduler.schedule(
project,
spider,
Expand Down Expand Up @@ -160,7 +168,7 @@ def render_POST(self, txrequest, project, version, egg):
)

self.root.eggstorage.put(BytesIO(egg), project, version)
spiders = get_spider_list(project, version=version)
spiders = get_spider_list(project, version=version, runner=self.root.runner)
self.root.update_projects()
UtilsCache.invalid_cache(project)
return {"node_name": self.root.nodename, "status": "ok", "project": project, "version": version,
Expand All @@ -182,9 +190,15 @@ def render_GET(self, txrequest, project):

class ListSpiders(WsResource):
@param('project')
@param('_version', dest='version', required=False, default='')
@param('_version', dest='version', required=False, default=None)
def render_GET(self, txrequest, project, version):
spiders = get_spider_list(project, runner=self.root.runner, version=version)
if self.root.eggstorage.get(project, version) == (None, None):
if version:
raise error.Error(code=http.OK, message=b"version '%b' not found" % version.encode())
raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode())

spiders = get_spider_list(project, version=version, runner=self.root.runner)

return {"node_name": self.root.nodename, "status": "ok", "spiders": spiders}


Expand Down Expand Up @@ -269,11 +283,14 @@ def render_POST(self, txrequest, project):
UtilsCache.invalid_cache(project)
return {"node_name": self.root.nodename, "status": "ok"}

# Minor security note: Submitting a non-existent project or version when using the default eggstorage can produce
# an error message that discloses the resolved path to the eggs_dir.
def _delete_version(self, project, version=None):
self.root.eggstorage.delete(project, version)
self.root.update_projects()
try:
self.root.eggstorage.delete(project, version)
self.root.update_projects()
except ProjectNotFoundError:
raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode())
except EggNotFoundError:
raise error.Error(code=http.OK, message=b"version '%b' not found" % version.encode())


class DeleteVersion(DeleteProject):
Expand Down
Loading

0 comments on commit 88a4e45

Please sign in to comment.