Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sanitize project name #421

Closed
wants to merge 30 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
546e3dc
[tests] unit tests for webservice
pawelmhm Nov 25, 2021
a13fe9b
[webservice] safe project name
pawelmhm Nov 26, 2021
8e8e410
[webservice] sanitize project name
pawelmhm Nov 26, 2021
bdd1f6a
[webservice] sanitize project name in listproject
pawelmhm Nov 29, 2021
0f05b32
[webservice] validate project name in schedule.json
pawelmhm Nov 29, 2021
767fac5
[test_webservice] mock list spiders
pawelmhm Nov 29, 2021
49ab5a7
webservice: extend allowed character list
pawelmhm Dec 3, 2021
5980853
Merge branch 'master' into sanitize-project
pawelmhm Dec 14, 2021
0f45e31
Merge branch 'master' into sanitize-project
jpmckinney Jul 17, 2024
e86d261
fix(sanitize-project): listprojects.json accepts no parameters. proje…
jpmckinney Jul 17, 2024
8431cd0
chore: Restore previous style
jpmckinney Jul 17, 2024
3720c09
feat: Clarify error message (no "object" method in UnsupportedMethod …
jpmckinney Jul 17, 2024
a21ce02
chore: Refactor project sanitization
jpmckinney Jul 17, 2024
a89fe27
feat: Use 405 Method Not Allowed
jpmckinney Jul 17, 2024
7a5582b
Merge branch 'master' into sanitize-project
jpmckinney Jul 17, 2024
7624524
Merge branch 'master' into sanitize-project
jpmckinney Jul 17, 2024
fd1c8e7
Merge branch 'master' into sanitize-project
jpmckinney Jul 17, 2024
ce76bf9
chore: Refactor webservice.py for easier modification (bis)
jpmckinney Jul 17, 2024
67656fe
Merge branch 'master' into sanitize-project
jpmckinney Jul 17, 2024
b7dafac
Merge branch 'master' into sanitize-project
jpmckinney Jul 17, 2024
3f4c1d2
test: Add webservices to project parameter validation test
jpmckinney Jul 17, 2024
dd2a39c
test: Fix test_endpoints expectations
jpmckinney Jul 17, 2024
8df8b60
Merge branch 'master' into sanitize-project
jpmckinney Jul 18, 2024
04bed89
chore: Change DirectoryTraversalError message to not disclose full paths
jpmckinney Jul 18, 2024
7380428
chore: Use generator
jpmckinney Jul 18, 2024
b7d055a
test: Add tests to tests/test_webservice.py for directory traversal
jpmckinney Jul 18, 2024
d8716c7
Fix "Merge branch 'master' into sanitize-project"
jpmckinney Jul 18, 2024
287742d
Fix "Merge branch 'master' into sanitize-project"
jpmckinney Jul 18, 2024
69dcf53
Merge branch 'master' into sanitize-project
jpmckinney Jul 18, 2024
72cdf5b
Fix "Merge branch 'master' into sanitize-project"
jpmckinney Jul 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/news.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,14 @@ Web UI
API
^^^

- **BACKWARDS-INCOMPATIBLE CHANGE:** The ``project`` parameter must contain one or more characters, when provided.
- **BACKWARDS-INCOMPATIBLE CHANGE:** The ``project`` parameter must contain only the characters: A-Z, a-z, 0-9, ``.``, ``_``, ``-``, ``"``, ``'`` or whitespace.
- **BACKWARDS-INCOMPATIBLE CHANGE:** Use HTTP error status codes instead of 200 OK, if an error occurs.

- Missing parameter: 400 Bad Request
- Nonexistent resource: 404 Not Found
- Unsupported method: 405 Method Not Allowed

- Clarify error messages, for example:

- ``'project' parameter is required``, instead of ``'project'``
Expand Down
4 changes: 2 additions & 2 deletions integration_tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import requests


def req(method, path, auth=None, **kwargs):
def req(method, path, auth=None, status=200, **kwargs):
url = urljoin("http://127.0.0.1:6800", path)

for badauth in (None, ("baduser", "badpass")):
Expand All @@ -14,6 +14,6 @@ def req(method, path, auth=None, **kwargs):

response = getattr(requests, method)(url, auth=("hello12345", "67890world"), **kwargs)

assert response.status_code == 200, f"200 != {response.status_code}"
assert response.status_code == status, f"{status} != {response.status_code}"

return response
1 change: 1 addition & 0 deletions integration_tests/test_webservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ def test_cancel_nonexistent_project():
"/cancel.json",
{"status": "error", "message": "project 'nonexistent' not found"},
data={"project": "nonexistent", "job": "nojob"},
status=404,
)


Expand Down
2 changes: 1 addition & 1 deletion scrapyd/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def create_wrapped_resource(webroot_cls, config, app):
return resource


def application(config):
def application(config) -> Application:
app = Application("Scrapyd")
bind_address = os.getenv('SCRAPYD_BIND_ADDRESS') or config.get('bind_address', '127.0.0.1')
http_port = int(os.getenv('SCRAPYD_HTTP_PORT') or config.getint('http_port', 6800))
Expand Down
30 changes: 22 additions & 8 deletions scrapyd/webservice.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import functools
import re
import sys
import traceback
import uuid
Expand All @@ -13,6 +14,13 @@
from scrapyd.jobstorage import job_items_url, job_log_url
from scrapyd.utils import JsonResource, UtilsCache, get_spider_list, native_stringify_dict

# Allow the same characters as Setuptools. Setuptools uses underscores for replacement in filenames, and hyphens
# for replacement in distributions. Quotes and spaces are allowed in case a config sets `project = "my project"`.
# https://github.com/pypa/setuptools/blob/e304e4d/setuptools/_distutils/command/install_egg_info.py#L74
PATTERNS = {
b'project': re.compile(b"""^[A-Za-z0-9."'\\s_-]+$"""),
}


def param(
decoded: str,
Expand All @@ -33,15 +41,19 @@ def decorator(func):
def wrapper(self, txrequest, *args, **kwargs):
if encoded not in txrequest.args:
if required:
raise error.Error(code=http.OK, message=b"'%b' parameter is required" % encoded)
raise error.Error(code=http.BAD_REQUEST, message=b"'%b' parameter is required" % encoded)

value = default
else:
values = (value.decode() if type is str else type(value) for value in txrequest.args.pop(encoded))
values = []
for value in txrequest.args.pop(encoded):
if encoded in PATTERNS and not PATTERNS[encoded].search(value):
raise error.Error(code=http.BAD_REQUEST, message=b"%b '%b' is invalid" % (encoded, value))
values.append(value.decode() if type is str else type(value))
if multiple:
value = list(values)
value = values
else:
value = next(values)
value = values[0]

kwargs[dest] = value

Expand All @@ -61,7 +73,9 @@ def render(self, txrequest):
try:
return JsonResource.render(self, txrequest).encode('utf-8')
except Exception as e:
if isinstance(e, error.Error):
if isinstance(e, error.UnsupportedMethod):
txrequest.setResponseCode(http.NOT_ALLOWED)
elif isinstance(e, error.Error):
txrequest.setResponseCode(int(e.status))
if self.root.debug:
return traceback.format_exc().encode('utf-8')
Expand Down Expand Up @@ -110,7 +124,7 @@ def render_POST(self, txrequest, project, spider, version, jobid, priority, sett
spider_arguments = {k: v[0] for k, v in native_stringify_dict(copy(txrequest.args), keys_only=False).items()}
spiders = get_spider_list(project, version=version)
if spider not in spiders:
raise error.Error(code=http.OK, message=b"spider '%b' not found" % spider.encode())
raise error.Error(code=http.NOT_FOUND, message=b"spider '%b' not found" % spider.encode())
self.root.scheduler.schedule(
project,
spider,
Expand All @@ -132,7 +146,7 @@ class Cancel(WsResource):
@param('signal', required=False, default='INT' if sys.platform != 'win32' else 'BREAK')
def render_POST(self, txrequest, project, job, signal):
if project not in self.root.poller.queues:
raise error.Error(code=http.OK, message=b"project '%b' not found" % project.encode())
raise error.Error(code=http.NOT_FOUND, message=b"project '%b' not found" % project.encode())

prevstate = None

Expand All @@ -156,7 +170,7 @@ class AddVersion(WsResource):
def render_POST(self, txrequest, project, version, egg):
if not zipfile.is_zipfile(BytesIO(egg)):
raise error.Error(
code=http.OK, message=b"egg is not a ZIP file (if using curl, use egg=@path not egg=path)"
code=http.BAD_REQUEST, message=b"egg is not a ZIP file (if using curl, use egg=@path not egg=path)"
)

self.root.eggstorage.put(BytesIO(egg), project, version)
Expand Down
6 changes: 3 additions & 3 deletions tests/test_endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,16 +62,16 @@ def test_auth(self):
def test_launch_spider_get(self, mock_scrapyd):
resp = requests.get(mock_scrapyd.urljoin("schedule.json"))

assert resp.status_code == 200
# TODO scrapyd should return status 405 Method Not Allowed not 200
assert resp.status_code == 405
assert resp.json()['status'] == 'error'

def test_spider_list_no_project(self, mock_scrapyd):
resp = requests.get(mock_scrapyd.urljoin("listspiders.json"))
data = resp.json()

assert resp.status_code == 200
assert resp.status_code == 400
assert data['status'] == 'error'
assert re.search('is required', data['message'])

def test_spider_list_project_no_egg(self, mock_scrapyd):
resp = requests.get(mock_scrapyd.urljoin('listprojects.json'))
Expand Down
51 changes: 51 additions & 0 deletions tests/test_webservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from unittest import mock

import pytest
from twisted.web.error import Error

from scrapyd.exceptions import DirectoryTraversalError, RunnerError
from scrapyd.interfaces import IEggStorage
Expand Down Expand Up @@ -158,6 +159,56 @@ def test_schedule(self, txrequest, site_with_egg):
assert content['status'] == 'ok'
assert 'jobid' in content

def test_schedule_bad_request(self, txrequest, site_with_egg):
endpoint = b'schedule.json'
txrequest.args = {
b'project': [b'/etc/host/quotesbot'],
b'spider': [b'toscrape-css']
}

with pytest.raises(Error) as e:
site_with_egg.children[endpoint].render_POST(txrequest)

assert e.args[0] == 400

assert site_with_egg.scheduler.calls == []

@pytest.mark.parametrize('endpoint,attach_egg,method', [
(b'schedule.json', False, 'render_POST'),
(b'cancel.json', False, 'render_POST'),
(b'addversion.json', True, 'render_POST'),
(b'listversions.json', False, 'render_GET'),
(b'listspiders.json', False, 'render_GET'),
(b'status.json', False, 'render_GET'),
(b'listjobs.json', False, 'render_GET'),
(b'delproject.json', False, 'render_POST'),
(b'delversion.json', False, 'render_POST'),
])
def test_bad_project_names_invalid(self, txrequest, site_no_egg, endpoint, attach_egg, method):
txrequest.args = {
b'project': [b'/home/pawel/hosts'],
b'version': [b'0.1'],
b'job': [b'abc'],
}

if attach_egg:
egg_path = Path(__file__).absolute().parent / "quotesbot.egg"
with open(egg_path, 'rb') as f:
txrequest.args[b'egg'] = [f.read()]

with pytest.raises(Error) as e:
getattr(site_no_egg.children[endpoint], method)(txrequest)

assert e.value.status == b'400'
assert e.value.message == b"project '/home/pawel/hosts' is invalid"

storage = site_no_egg.app.getComponent(IEggStorage)
version, egg = storage.get('quotesbot')
if egg:
egg.close()

assert version is None

@pytest.mark.parametrize('endpoint,attach_egg,method', [
(b'addversion.json', True, 'render_POST'),
(b'listversions.json', False, 'render_GET'),
Expand Down
Loading