From 88a4e45b3b5a0b8ab8e210d797a1be810ea2efbb Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Thu, 18 Jul 2024 14:01:06 -0400 Subject: [PATCH] feat: Clarify error messages if project or version of egg not found, closes #517 --- docs/news.rst | 8 +- integration_tests/test_webservice.py | 37 +------ scrapyd/eggstorage.py | 21 +++- scrapyd/exceptions.py | 8 ++ scrapyd/utils.py | 2 +- scrapyd/webservice.py | 37 +++++-- tests/test_webservice.py | 157 +++++++++++++++++++++++++-- 7 files changed, 209 insertions(+), 61 deletions(-) diff --git a/docs/news.rst b/docs/news.rst index d1d26125..17d4a810 100644 --- a/docs/news.rst +++ b/docs/news.rst @@ -38,8 +38,10 @@ API - Clarify error messages, for example: - - ``'project' parameter is required``, instead of ``'project'`` - - ``project 'myproject' not found``, instead of ``'myproject'`` + - ``'project' parameter is required``, instead of ``'project'`` (KeyError) + - ``project 'myproject' not found``, instead of ``'myproject'`` (KeyError) + - ``project 'myproject' not found``, instead of ``Scrapy VERSION - no active project`` + - ``version 'myversion' not found``, instead of a traceback - ``exception class: message``, instead of ``message`` - ``ValueError: Unknown or corrupt egg``, instead of ``TypeError: 'tuple' object is not an iterator`` - Unsupported method error messages no longer list ``object`` as an allowed HTTP method @@ -65,7 +67,7 @@ Security ^^^^^^^^ - The ``FilesystemEggStorage`` class used by the :ref:`listversions.json` webservice escapes project names (used in glob patterns) before globbing, to disallow listing arbitrary directories. -- The ``FilesystemEggStorage`` class used by the :ref:`runner` and the :ref:`addversion.json`, :ref:`listversions.json`, :ref:`delversion.json` and :ref:`delproject.json` webservices raises a ``DirectoryTraversalError`` error if the project (used in file paths) would traverse directories. +- The ``FilesystemEggStorage`` class used by the :ref:`runner` and the :ref:`addversion.json`, :ref:`listversions.json`, :ref:`delversion.json` and :ref:`delproject.json` webservices raises a ``DirectoryTraversalError`` error if the project parameter (used in file paths) would traverse directories. - The ``Environment`` class used by the :ref:`launcher` raises a ``DirectoryTraversalError`` error if the project, spider or job parameters (used in file paths) would traverse directories. - The :ref:`webui` escapes user input (project names, spider names, and job IDs) to prevent cross-site scripting (XSS). diff --git a/integration_tests/test_webservice.py b/integration_tests/test_webservice.py index e3e45d58..2114cba0 100644 --- a/integration_tests/test_webservice.py +++ b/integration_tests/test_webservice.py @@ -1,10 +1,8 @@ import os.path -import sys from pathlib import Path import pytest import requests -import scrapy from integration_tests import req @@ -17,8 +15,6 @@ def assert_webservice(method, path, expected, **kwargs): response = req(method, path, **kwargs) data = response.json() data.pop("node_name") - if "message" in expected: - expected["message"] = expected["message"].replace("\n", os.linesep) assert data == expected @@ -92,12 +88,9 @@ def test_project_directory_traversal_runner(webservice, method, params): data = response.json() data.pop("node_name") - message = data.pop("message") assert response.status_code == 200, f"200 != {response.status_code}" - assert data == {"status": "error"} - assert message.startswith("RunnerError: Traceback (most recent call last):"), message - assert message.endswith(f"scrapyd.exceptions.DirectoryTraversalError: ../p{os.linesep}"), message + assert data == {"status": "error", "message": "DirectoryTraversalError: ../p"} def test_daemonstatus(): @@ -114,11 +107,7 @@ def test_schedule(): "/schedule.json", { "status": "error", - "message": ( - f'RunnerError: Scrapy {scrapy.__version__} - no active project\n\n' - 'Unknown command: list\n\n' - 'Use "scrapy" to see available commands\n' - ), + "message": "project 'nonexistent' not found", }, data={"project": "nonexistent", "spider": "nospider"}, ) @@ -174,11 +163,7 @@ def test_listspiders_nonexistent_project(): "/listspiders.json", { "status": "error", - "message": ( - f'RunnerError: Scrapy {scrapy.__version__} - no active project\n\n' - 'Unknown command: list\n\n' - 'Use "scrapy" to see available commands\n' - ), + "message": "project 'nonexistent' not found", }, params={"project": "nonexistent"}, ) @@ -207,14 +192,9 @@ def test_delversion_nonexistent_project(): "/delversion.json", { "status": "error", - "message": "FileNotFoundError: " + ( - f"[Errno 2] No such file or directory: '{BASEDIR}/eggs/nonexistent/noegg.egg'" - if sys.platform != "win32" else - "[WinError 3] The system cannot find the path specified: " - f"'{BASEDIR}\\\\eggs\\\\nonexistent\\\\noegg.egg'" - ), + "message": "version 'nonexistent' not found", }, - data={"project": "nonexistent", "version": "noegg"}, + data={"project": "sample", "version": "nonexistent"}, ) @@ -224,12 +204,7 @@ def test_delproject_nonexistent_project(): "/delproject.json", { "status": "error", - "message": "FileNotFoundError: " + ( - f"[Errno 2] No such file or directory: '{BASEDIR}/eggs/nonexistent'" - if sys.platform != "win32" else - "[WinError 3] The system cannot find the path specified: " - f"'{BASEDIR}\\\\eggs\\\\nonexistent'" - ), + "message": "project 'nonexistent' not found", }, data={"project": "nonexistent"}, ) diff --git a/scrapyd/eggstorage.py b/scrapyd/eggstorage.py index ed3695c7..db1ddd51 100644 --- a/scrapyd/eggstorage.py +++ b/scrapyd/eggstorage.py @@ -5,7 +5,7 @@ from zope.interface import implementer -from scrapyd.exceptions import DirectoryTraversalError +from scrapyd.exceptions import DirectoryTraversalError, EggNotFoundError, ProjectNotFoundError from scrapyd.interfaces import IEggStorage from scrapyd.utils import sorted_versions @@ -32,7 +32,10 @@ def get(self, project, version=None): version = self.list(project)[-1] except IndexError: return None, None - return version, open(self._egg_path(project, version), 'rb') + try: + return version, open(self._egg_path(project, version), 'rb') + except FileNotFoundError: + return None, None def list(self, project): return sorted_versions( @@ -46,11 +49,17 @@ def list_projects(self): def delete(self, project, version=None): if version is None: - shutil.rmtree(self._get_path(project)) + try: + shutil.rmtree(self._get_path(project)) + except FileNotFoundError as e: + raise ProjectNotFoundError from e else: - os.remove(self._egg_path(project, version)) - if not self.list(project): # remove project if no versions left - self.delete(project) + try: + os.remove(self._egg_path(project, version)) + if not self.list(project): # remove project if no versions left + self.delete(project) + except FileNotFoundError as e: + raise EggNotFoundError from e def _egg_path(self, project, version): sanitized_version = re.sub(r'[^A-Za-z0-9_-]', '_', version) diff --git a/scrapyd/exceptions.py b/scrapyd/exceptions.py index 6f67feef..4e286797 100644 --- a/scrapyd/exceptions.py +++ b/scrapyd/exceptions.py @@ -6,5 +6,13 @@ class DirectoryTraversalError(ScrapydError): """Raised if the resolved path is outside the expected directory""" +class ProjectNotFoundError(ScrapydError): + """Raised if a project isn't found in an IEggStorage implementation""" + + +class EggNotFoundError(ScrapydError): + """Raised if an egg isn't found in an IEggStorage implementation""" + + class RunnerError(ScrapydError): """Raised if the runner returns an error code""" diff --git a/scrapyd/utils.py b/scrapyd/utils.py index 450041fe..de26b550 100644 --- a/scrapyd/utils.py +++ b/scrapyd/utils.py @@ -125,7 +125,7 @@ def get_crawl_args(message): return args -def get_spider_list(project, runner=None, pythonpath=None, version=''): +def get_spider_list(project, runner=None, pythonpath=None, version=None): """Return the spider list from the given project, using the given runner""" if "cache" not in get_spider_list.__dict__: get_spider_list.cache = UtilsCache() diff --git a/scrapyd/webservice.py b/scrapyd/webservice.py index 3c449d05..9ce82f66 100644 --- a/scrapyd/webservice.py +++ b/scrapyd/webservice.py @@ -10,6 +10,7 @@ from twisted.python import log from twisted.web import error, http +from scrapyd.exceptions import EggNotFoundError, ProjectNotFoundError from scrapyd.jobstorage import job_items_url, job_log_url from scrapyd.utils import JsonResource, UtilsCache, get_spider_list, native_stringify_dict @@ -101,16 +102,23 @@ def render_GET(self, txrequest): class Schedule(WsResource): @param('project') @param('spider') - @param('_version', dest='version', required=False, default='') + @param('_version', dest='version', required=False, default=None) # See https://github.com/scrapy/scrapyd/pull/215 @param('jobid', required=False, default=lambda: uuid.uuid1().hex) @param('priority', required=False, default=0, type=float) @param('setting', required=False, default=list, multiple=True) def render_POST(self, txrequest, project, spider, version, jobid, priority, setting): - spider_arguments = {k: v[0] for k, v in native_stringify_dict(copy(txrequest.args), keys_only=False).items()} - spiders = get_spider_list(project, version=version) + if self.root.eggstorage.get(project, version) == (None, None): + if version: + raise error.Error(code=http.OK, message=b"version '%b' not found" % version.encode()) + raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode()) + + spiders = get_spider_list(project, version=version, runner=self.root.runner) if spider not in spiders: raise error.Error(code=http.OK, message=b"spider '%b' not found" % spider.encode()) + + spider_arguments = {k: v[0] for k, v in native_stringify_dict(copy(txrequest.args), keys_only=False).items()} + self.root.scheduler.schedule( project, spider, @@ -160,7 +168,7 @@ def render_POST(self, txrequest, project, version, egg): ) self.root.eggstorage.put(BytesIO(egg), project, version) - spiders = get_spider_list(project, version=version) + spiders = get_spider_list(project, version=version, runner=self.root.runner) self.root.update_projects() UtilsCache.invalid_cache(project) return {"node_name": self.root.nodename, "status": "ok", "project": project, "version": version, @@ -182,9 +190,15 @@ def render_GET(self, txrequest, project): class ListSpiders(WsResource): @param('project') - @param('_version', dest='version', required=False, default='') + @param('_version', dest='version', required=False, default=None) def render_GET(self, txrequest, project, version): - spiders = get_spider_list(project, runner=self.root.runner, version=version) + if self.root.eggstorage.get(project, version) == (None, None): + if version: + raise error.Error(code=http.OK, message=b"version '%b' not found" % version.encode()) + raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode()) + + spiders = get_spider_list(project, version=version, runner=self.root.runner) + return {"node_name": self.root.nodename, "status": "ok", "spiders": spiders} @@ -269,11 +283,14 @@ def render_POST(self, txrequest, project): UtilsCache.invalid_cache(project) return {"node_name": self.root.nodename, "status": "ok"} - # Minor security note: Submitting a non-existent project or version when using the default eggstorage can produce - # an error message that discloses the resolved path to the eggs_dir. def _delete_version(self, project, version=None): - self.root.eggstorage.delete(project, version) - self.root.update_projects() + try: + self.root.eggstorage.delete(project, version) + self.root.update_projects() + except ProjectNotFoundError: + raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode()) + except EggNotFoundError: + raise error.Error(code=http.OK, message=b"version '%b' not found" % version.encode()) class DeleteVersion(DeleteProject): diff --git a/tests/test_webservice.py b/tests/test_webservice.py index b47fd72f..f2aed432 100644 --- a/tests/test_webservice.py +++ b/tests/test_webservice.py @@ -1,10 +1,10 @@ -import os from pathlib import Path from unittest import mock import pytest +from twisted.web import error -from scrapyd.exceptions import DirectoryTraversalError, RunnerError +from scrapyd.exceptions import DirectoryTraversalError from scrapyd.interfaces import IEggStorage from scrapyd.jobstorage import Job @@ -22,20 +22,70 @@ def fake_list_spiders_other(*args, **kwarsg): class TestWebservice: - def test_list_spiders(self, txrequest, site_with_egg): + def add_test_version(self, root, basename, version): + egg_path = Path(__file__).absolute().parent / f"{basename}.egg" + project, version = 'myproject', version + with open(egg_path, 'rb') as f: + root.eggstorage.put(f, project, version) + + def test_list_spiders(self, txrequest, site_no_egg): + self.add_test_version(site_no_egg, "mybot", "r1") + self.add_test_version(site_no_egg, "mybot2", "r2") + txrequest.args = { - b'project': [b'quotesbot'] + b'project': [b'myproject'] } endpoint = b'listspiders.json' - content = site_with_egg.children[endpoint].render_GET(txrequest) + content = site_no_egg.children[endpoint].render_GET(txrequest) - assert content['spiders'] == ['toscrape-css', 'toscrape-xpath'] + assert content['spiders'] == ['spider1', 'spider2', 'spider3'] assert content['status'] == 'ok' + def test_list_spiders_nonexistent(self, txrequest, site_no_egg): + txrequest.args = { + b'project': [b'nonexistent'], + } + endpoint = b'listspiders.json' + + with pytest.raises(error.Error) as exc: + site_no_egg.children[endpoint].render_GET(txrequest) + + assert exc.value.status == b"200" + assert exc.value.message == b"project 'nonexistent' not found" + + def test_list_spiders_version(self, txrequest, site_no_egg): + self.add_test_version(site_no_egg, "mybot", "r1") + self.add_test_version(site_no_egg, "mybot2", "r2") + + txrequest.args = { + b'project': [b'myproject'], + b'_version': [b'r1'], + } + endpoint = b'listspiders.json' + content = site_no_egg.children[endpoint].render_GET(txrequest) + + assert content['spiders'] == ['spider1', 'spider2'] + assert content['status'] == 'ok' + + def test_list_spiders_version_nonexistent(self, txrequest, site_no_egg): + self.add_test_version(site_no_egg, "mybot", "r1") + self.add_test_version(site_no_egg, "mybot2", "r2") + + txrequest.args = { + b'project': [b'myproject'], + b'_version': [b'nonexistent'], + } + endpoint = b'listspiders.json' + + with pytest.raises(error.Error) as exc: + site_no_egg.children[endpoint].render_GET(txrequest) + + assert exc.value.status == b"200" + assert exc.value.message == b"version 'nonexistent' not found" + def test_list_versions(self, txrequest, site_with_egg): txrequest.args = { b'project': [b'quotesbot'], - b'spider': [b'toscrape-css'] } endpoint = b'listversions.json' content = site_with_egg.children[endpoint].render_GET(txrequest) @@ -43,6 +93,16 @@ def test_list_versions(self, txrequest, site_with_egg): assert content['versions'] == ['0_1'] assert content['status'] == 'ok' + def test_list_versions_nonexistent(self, txrequest, site_no_egg): + txrequest.args = { + b'project': [b'quotesbot'], + } + endpoint = b'listversions.json' + content = site_no_egg.children[endpoint].render_GET(txrequest) + + assert content['versions'] == [] + assert content['status'] == 'ok' + def test_list_projects(self, txrequest, site_with_egg): txrequest.args = { b'project': [b'quotesbot'], @@ -92,6 +152,32 @@ def test_delete_version(self, txrequest, site_with_egg): assert 'node_name' in content assert no_version is None + def test_delete_version_nonexistent_project(self, txrequest, site_with_egg): + endpoint = b'delversion.json' + txrequest.args = { + b'project': [b'quotesbot'], + b'version': [b'nonexistent'] + } + + with pytest.raises(error.Error) as exc: + site_with_egg.children[endpoint].render_POST(txrequest) + + assert exc.value.status == b"200" + assert exc.value.message == b"version 'nonexistent' not found" + + def test_delete_version_nonexistent_version(self, txrequest, site_no_egg): + endpoint = b'delversion.json' + txrequest.args = { + b'project': [b'nonexistent'], + b'version': [b'0.1'] + } + + with pytest.raises(error.Error) as exc: + site_no_egg.children[endpoint].render_POST(txrequest) + + assert exc.value.status == b"200" + assert exc.value.message == b"version '0.1' not found" + def test_delete_project(self, txrequest, site_with_egg): endpoint = b'delproject.json' txrequest.args = { @@ -113,6 +199,18 @@ def test_delete_project(self, txrequest, site_with_egg): assert 'node_name' in content assert no_version is None + def test_delete_project_nonexistent(self, txrequest, site_no_egg): + endpoint = b'delproject.json' + txrequest.args = { + b'project': [b'nonexistent'], + } + + with pytest.raises(error.Error) as exc: + site_no_egg.children[endpoint].render_POST(txrequest) + + assert exc.value.status == b"200" + assert exc.value.message == b"project 'nonexistent' not found" + def test_addversion(self, txrequest, site_no_egg): endpoint = b'addversion.json' txrequest.args = { @@ -151,6 +249,46 @@ def test_schedule(self, txrequest, site_with_egg): assert content['status'] == 'ok' assert 'jobid' in content + def test_schedule_nonexistent_project(self, txrequest, site_no_egg): + endpoint = b'schedule.json' + txrequest.args = { + b'project': [b'nonexistent'], + b'spider': [b'toscrape-css'] + } + + with pytest.raises(error.Error) as exc: + site_no_egg.children[endpoint].render_POST(txrequest) + + assert exc.value.status == b"200" + assert exc.value.message == b"project 'nonexistent' not found" + + def test_schedule_nonexistent_version(self, txrequest, site_with_egg): + endpoint = b'schedule.json' + txrequest.args = { + b'project': [b'quotesbot'], + b'_version': [b'nonexistent'], + b'spider': [b'toscrape-css'] + } + + with pytest.raises(error.Error) as exc: + site_with_egg.children[endpoint].render_POST(txrequest) + + assert exc.value.status == b"200" + assert exc.value.message == b"version 'nonexistent' not found" + + def test_schedule_nonexistent_spider(self, txrequest, site_with_egg): + endpoint = b'schedule.json' + txrequest.args = { + b'project': [b'quotesbot'], + b'spider': [b'nonexistent'] + } + + with pytest.raises(error.Error) as exc: + site_with_egg.children[endpoint].render_POST(txrequest) + + assert exc.value.status == b"200" + assert exc.value.message == b"spider 'nonexistent' not found" + @pytest.mark.parametrize('endpoint,attach_egg,method', [ (b'addversion.json', True, 'render_POST'), (b'listversions.json', False, 'render_GET'), @@ -195,11 +333,10 @@ def test_project_directory_traversal_runner(self, txrequest, site_no_egg, endpoi with open(egg_path, 'rb') as f: txrequest.args[b'egg'] = [f.read()] - with pytest.raises(RunnerError) as exc: + with pytest.raises(DirectoryTraversalError) as exc: getattr(site_no_egg.children[endpoint], method)(txrequest) - assert str(exc.value).startswith("Traceback (most recent call last):"), str(exc.value) - assert str(exc.value).endswith(f"scrapyd.exceptions.DirectoryTraversalError: ../p{os.linesep}"), str(exc.value) + assert str(exc.value) == "../p" storage = site_no_egg.app.getComponent(IEggStorage) version, egg = storage.get('quotesbot')