Sanitize project name #421

Closed · wants to merge 30 commits

Commits
546e3dc  [tests] unit tests for webservice (pawelmhm, Nov 25, 2021)
a13fe9b  [webservice] safe project name (pawelmhm, Nov 26, 2021)
8e8e410  [webservice] sanitize project name (pawelmhm, Nov 26, 2021)
bdd1f6a  [webservice] sanitize project name in listproject (pawelmhm, Nov 29, 2021)
0f05b32  [webservice] validate project name in schedule.json (pawelmhm, Nov 29, 2021)
767fac5  [test_webservice] mock list spiders (pawelmhm, Nov 29, 2021)
49ab5a7  webservice: extend allowed character list (pawelmhm, Dec 3, 2021)
5980853  Merge branch 'master' into sanitize-project (pawelmhm, Dec 14, 2021)
0f45e31  Merge branch 'master' into sanitize-project (jpmckinney, Jul 17, 2024)
e86d261  fix(sanitize-project): listprojects.json accepts no parameters. proje… (jpmckinney, Jul 17, 2024)
8431cd0  chore: Restore previous style (jpmckinney, Jul 17, 2024)
3720c09  feat: Clarify error message (no "object" method in UnsupportedMethod … (jpmckinney, Jul 17, 2024)
a21ce02  chore: Refactor project sanitization (jpmckinney, Jul 17, 2024)
a89fe27  feat: Use 405 Method Not Allowed (jpmckinney, Jul 17, 2024)
7a5582b  Merge branch 'master' into sanitize-project (jpmckinney, Jul 17, 2024)
7624524  Merge branch 'master' into sanitize-project (jpmckinney, Jul 17, 2024)
fd1c8e7  Merge branch 'master' into sanitize-project (jpmckinney, Jul 17, 2024)
ce76bf9  chore: Refactor webservice.py for easier modification (bis) (jpmckinney, Jul 17, 2024)
67656fe  Merge branch 'master' into sanitize-project (jpmckinney, Jul 17, 2024)
b7dafac  Merge branch 'master' into sanitize-project (jpmckinney, Jul 17, 2024)
3f4c1d2  test: Add webservices to project parameter validation test (jpmckinney, Jul 17, 2024)
dd2a39c  test: Fix test_endpoints expectations (jpmckinney, Jul 17, 2024)
8df8b60  Merge branch 'master' into sanitize-project (jpmckinney, Jul 18, 2024)
04bed89  chore: Change DirectoryTraversalError message to not disclose full paths (jpmckinney, Jul 18, 2024)
7380428  chore: Use generator (jpmckinney, Jul 18, 2024)
b7d055a  test: Add tests to tests/test_webservice.py for directory traversal (jpmckinney, Jul 18, 2024)
d8716c7  Fix "Merge branch 'master' into sanitize-project" (jpmckinney, Jul 18, 2024)
287742d  Fix "Merge branch 'master' into sanitize-project" (jpmckinney, Jul 18, 2024)
69dcf53  Merge branch 'master' into sanitize-project (jpmckinney, Jul 18, 2024)
72cdf5b  Fix "Merge branch 'master' into sanitize-project" (jpmckinney, Jul 18, 2024)
2 changes: 1 addition & 1 deletion scrapyd/app.py
@@ -35,7 +35,7 @@ def create_wrapped_resource(webcls, config, app):
     return resource


-def application(config):
+def application(config) -> Application:
     app = Application("Scrapyd")
     http_port = config.getint('http_port', 6800)
     bind_address = config.get('bind_address', '127.0.0.1')
82 changes: 82 additions & 0 deletions scrapyd/tests/conftest.py
@@ -0,0 +1,82 @@
import shutil
from pathlib import Path

import pytest
from twisted.web import http
from twisted.web.http import Request
from twisted.web.test.requesthelper import DummyChannel
from zope.interface import implementer

from scrapyd import Config
from scrapyd.app import application
from scrapyd.interfaces import IEggStorage, ISpiderScheduler
from scrapyd.website import Root


@implementer(ISpiderScheduler)
class FakeScheduler:

    def __init__(self, config):
        self.config = config
        self.calls = []

    def schedule(self, project, spider_name, priority=0.0, **spider_args):
        self.calls.append(
            [project, spider_name]
        )

    def list_projects(self):
        return ['quotesbot']

    def update_projects(self):
        pass


def delete_eggs(storage, project, version, config):
    if storage.list(project) != []:
        storage.delete(project, version)
    eggdir = config.get("eggs_dir")
    shutil.rmtree(eggdir)


@pytest.fixture
def txrequest():
    tcp_channel = DummyChannel.TCP()
    http_channel = http.HTTPChannel()
    http_channel.makeConnection(tcp_channel)
    return Request(http_channel)


def common_app_fixture(request):
    config = Config()

    app = application(config)
    project, version = 'quotesbot', '0.1'
    storage = app.getComponent(IEggStorage)
    app.setComponent(ISpiderScheduler, FakeScheduler(config))

    def delete_egg():
        # There is no egg initially, but a test (e.g. addversion)
        # may add one, so clean up afterwards.
        delete_eggs(storage, project, version, config)

    request.addfinalizer(delete_egg)
    return Root(config, app), storage


@pytest.fixture
def site_no_egg(request):
    root, storage = common_app_fixture(request)
    return root


@pytest.fixture
def site_with_egg(request):
    root, storage = common_app_fixture(request)

    egg_path = Path(__file__).absolute().parent / "quotesbot.egg"
    project, version = 'quotesbot', '0.1'
    with open(egg_path, 'rb') as f:
        storage.put(f, project, version)

    return root
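For orientation, a test built on these fixtures looks roughly like this (a sketch only; the endpoint name and the render_GET call mirror the webservice tests added later in this PR):

def test_list_projects_smoke(txrequest, site_with_egg):
    txrequest.args = {b'project': [b'quotesbot']}
    # Root exposes each webservice resource as a child keyed by endpoint name.
    content = site_with_egg.children[b'listprojects.json'].render_GET(txrequest)
    assert content['projects'] == ['quotesbot']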
30 changes: 5 additions & 25 deletions scrapyd/tests/test_endpoints.py
@@ -53,17 +53,16 @@ def test_launch_spider_get(self, mock_scrapyd):
 
     def test_spider_list_no_project(self, mock_scrapyd):
         resp = requests.get(mock_scrapyd.urljoin("listspiders.json"))
-        assert resp.status_code == 200
-        # TODO scrapyd should return status 400 if no project specified
+        assert resp.status_code == 400
         data = resp.json()
         assert data['status'] == 'error'
-        assert data['message'] == "'project'"
+        assert re.search('is required', data['message'])
 
     def test_spider_list_project_no_egg(self, mock_scrapyd):
         resp = requests.get(mock_scrapyd.urljoin('listprojects.json'))
         data = resp.json()
-        assert resp.status_code == 200
-        assert data['projects'] == []
+        assert resp.status_code == 400
+        assert data['status'] == 'error'
 
     def test_addversion_and_delversion(self, mock_scrapyd, quotesbot_egg):
         resp = self._deploy(mock_scrapyd, quotesbot_egg)
@@ -88,23 +87,4 @@ def _deploy(self, mock_scrapyd, quotesbot_egg) -> Response:
             b'egg': quotesbot_egg
         }
         resp = requests.post(url, data=data, files=files)
         return resp
-
-    def test_addversion_and_schedule(self, mock_scrapyd, quotesbot_egg):
-        deploy_response = self._deploy(mock_scrapyd, quotesbot_egg)
-        assert deploy_response.status_code == 200
-        schedule_url = mock_scrapyd.urljoin('schedule.json')
-        data = {
-            'spider': 'toscrape-css',
-            'project': 'quotesbot',
-            # Spider making request to scrapyD root url
-            # TODO add some mock website later, for now we just want
-            # to make sure spider is launched properly, not important
-            # if it collects any items or not
-            'start_url': mock_scrapyd.url
-        }
-        schedule_res = requests.post(schedule_url, data=data)
-        assert schedule_res.status_code == 200
-        data = schedule_res.json()
-        assert 'jobid' in data
-        assert data['status'] == 'ok'
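These integration tests now expect a 400 response rather than a 200 with an error body. Exercised by hand against a running Scrapyd, the new behavior looks like this (a sketch; it assumes the default bind address and port, and the error wording asserted above):

import requests

resp = requests.get('http://127.0.0.1:6800/listspiders.json')  # no ?project= given
assert resp.status_code == 400
data = resp.json()
assert data['status'] == 'error'
assert 'is required' in data['message']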
16 changes: 15 additions & 1 deletion scrapyd/tests/test_utils.py
@@ -20,7 +20,7 @@
 
 from scrapy.utils.test import get_pythonpath
 from scrapyd.interfaces import IEggStorage
-from scrapyd.utils import get_crawl_args, get_spider_list, UtilsCache
+from scrapyd.utils import get_crawl_args, get_spider_list, UtilsCache, check_disallowed_characters
 from scrapyd import get_application
 
 def get_pythonpath_scrapyd():
@@ -128,3 +128,17 @@ def popen_wrapper(*args, **kwargs):
             r'Exception: This should break the `scrapy list` command$'
         )
         self.assertRegexpMatches(tb, tb_regex)
+
+
+@pytest.mark.parametrize('project_name, allowed', [
+    ('/hello/world', False),
+    ('@hello/world', False),
+    ('hello world', True),
+    ('hello_world', True),
+    ('hello-world', True),
+    ('C:\\hello\\world', False),
+    ("chrząścz", False),
+    ('"hello world"', True)
+])
+def test_disallowed_chars(project_name, allowed):
+    assert check_disallowed_characters(project_name) == allowed
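These cases pin down the intended semantics: path separators, '@', backslashes and non-ASCII letters are rejected, while spaces, double quotes, dots, underscores and hyphens pass. The regex can be exercised standalone (this mirrors check_disallowed_characters as added to scrapyd/utils.py at the end of this diff):

import re

DISALLOWED = r'[^\"A-Za-z0-9.\s_-]+'

for name in ('hello-world', '/etc/hosts', 'chrząścz'):
    print(name, not re.search(DISALLOWED, name))
# hello-world True
# /etc/hosts False
# chrząścz False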
161 changes: 161 additions & 0 deletions scrapyd/tests/test_webservice.py
@@ -0,0 +1,161 @@
from pathlib import Path
from unittest import mock

import pytest
from twisted.web.error import Error

from scrapyd.interfaces import IEggStorage


def fake_list_spiders(*args, **kwargs):
    return []


def fake_list_spiders_other(*args, **kwargs):
    return ['quotesbot', 'toscrape-css']


class TestWebservice:
    @mock.patch('scrapyd.webservice.get_spider_list', new=fake_list_spiders)
    def test_list_spiders(self, txrequest, site_no_egg):
        # TODO: testing with an actual egg requires a better mock runner;
        # the scrapyd webservice runs "python -m scrapyd.runner list" in a
        # subprocess, so that command needs to be mocked before the real
        # behavior can be tested here.
        txrequest.args = {
            b'project': [b'quotesbot']
        }
        endpoint = b'listspiders.json'
        content = site_no_egg.children[endpoint].render_GET(txrequest)
        assert content['spiders'] == []
        assert content['status'] == 'ok'

    def test_list_versions(self, txrequest, site_with_egg):
        txrequest.args = {
            b'project': [b'quotesbot'],
            b'spider': [b'toscrape-css']
        }
        endpoint = b'listversions.json'
        content = site_with_egg.children[endpoint].render_GET(txrequest)
        assert content['versions'] == ['0_1']
        assert content['status'] == 'ok'

    def test_list_projects(self, txrequest, site_with_egg):
        txrequest.args = {
            b'project': [b'quotesbot'],
            b'spider': [b'toscrape-css']
        }
        endpoint = b'listprojects.json'
        content = site_with_egg.children[endpoint].render_GET(txrequest)
        assert content['projects'] == ['quotesbot']

    def test_delete_version(self, txrequest, site_with_egg):
        endpoint = b'delversion.json'
        txrequest.args = {
            b'project': [b'quotesbot'],
            b'version': [b'0.1']
        }

        storage = site_with_egg.app.getComponent(IEggStorage)
        egg = storage.get('quotesbot')
        assert egg[0] is not None
        content = site_with_egg.children[endpoint].render_POST(txrequest)
        assert content['status'] == 'ok'
        assert 'node_name' in content
        # The egg should now be gone.
        no_egg = storage.get('quotesbot')
        assert no_egg[0] is None

    def test_delete_project(self, txrequest, site_with_egg):
        endpoint = b'delproject.json'
        txrequest.args = {
            b'project': [b'quotesbot'],
        }

        storage = site_with_egg.app.getComponent(IEggStorage)
        egg = storage.get('quotesbot')
        assert egg[0] is not None

        content = site_with_egg.children[endpoint].render_POST(txrequest)
        assert content['status'] == 'ok'
        assert 'node_name' in content
        # The egg should now be gone.
        no_egg = storage.get('quotesbot')
        assert no_egg[0] is None

    @mock.patch('scrapyd.webservice.get_spider_list', new=fake_list_spiders)
    def test_addversion(self, txrequest, site_no_egg):
        endpoint = b'addversion.json'
        txrequest.args = {
            b'project': [b'quotesbot'],
            b'version': [b'0.1']
        }
        egg_path = Path(__file__).absolute().parent / "quotesbot.egg"
        with open(egg_path, 'rb') as f:
            txrequest.args[b'egg'] = [f.read()]

        storage = site_no_egg.app.getComponent(IEggStorage)
        egg = storage.get('quotesbot')
        assert egg[0] is None

        content = site_no_egg.children[endpoint].render_POST(txrequest)
        assert content['status'] == 'ok'
        assert 'node_name' in content
        # The egg should now be stored under the sanitized version '0_1'.
        stored = storage.get('quotesbot')
        assert stored[0] == '0_1'

    @mock.patch('scrapyd.webservice.get_spider_list',
                new=fake_list_spiders_other)
    def test_schedule(self, txrequest, site_with_egg):
        endpoint = b'schedule.json'
        txrequest.args = {
            b'project': [b'quotesbot'],
            b'spider': [b'toscrape-css']
        }

        content = site_with_egg.children[endpoint].render_POST(txrequest)
        assert site_with_egg.scheduler.calls == [['quotesbot', 'toscrape-css']]
        assert content['status'] == 'ok'
        assert 'jobid' in content

    def test_schedule_bad_request(self, txrequest, site_with_egg):
        endpoint = b'schedule.json'
        txrequest.args = {
            b'project': [b'/etc/host/quotesbot'],
            b'spider': [b'toscrape-css']
        }

        with pytest.raises(Error) as e:
            site_with_egg.children[endpoint].render_POST(txrequest)
        assert e.value.args[0] == 400

        assert site_with_egg.scheduler.calls == []

    @pytest.mark.parametrize('endpoint,attach_egg,method', [
        (b'addversion.json', True, 'render_POST'),
        (b'delproject.json', False, 'render_POST'),
        (b'delversion.json', False, 'render_POST'),
        (b'listspiders.json', False, 'render_GET'),
        (b'listjobs.json', False, 'render_GET'),
        (b'listprojects.json', False, 'render_GET')
    ])
    def test_bad_project_names(self, txrequest, site_no_egg,
                               endpoint, attach_egg, method):
        txrequest.args = {
            b'project': [b'/home/pawel/hosts'],
            b'version': [b'0.1'],
        }
        egg_path = Path(__file__).absolute().parent / "quotesbot.egg"
        if attach_egg:
            with open(egg_path, 'rb') as f:
                txrequest.args[b'egg'] = [f.read()]

        with pytest.raises(Error) as e:
            resource = site_no_egg.children[endpoint]
            getattr(resource, method)(txrequest)
        assert e.value.args[0] == 400

        storage = site_no_egg.app.getComponent(IEggStorage)
        egg = storage.get('quotesbot')
        assert egg[0] is None
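For context on why test_bad_project_names rejects values like /home/pawel/hosts: a project name that reaches a filesystem path join unsanitized can escape the storage root entirely. A minimal illustration (the paths are hypothetical; scrapyd's actual egg storage layout may differ):

import os

eggs_dir = 'eggs'
project = '../../etc/passwd'  # attacker-controlled 'project' parameter

# A naive join lets the request walk out of the storage root.
print(os.path.normpath(os.path.join(eggs_dir, project)))  # ../../etc/passwd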
33 changes: 4 additions & 29 deletions scrapyd/tests/test_website.py
@@ -1,31 +1,6 @@
-import pytest
-from twisted.web import http
-from twisted.web.http import Request
-from twisted.web.test.requesthelper import DummyChannel
-
-from scrapyd import Config
-from scrapyd.app import application
-from scrapyd.website import Root
-
-
-@pytest.fixture
-def txrequest():
-    tcp_channel = DummyChannel.TCP()
-    http_channel = http.HTTPChannel()
-    http_channel.makeConnection(tcp_channel)
-    return Request(http_channel)
-
-
-@pytest.fixture
-def scrapyd_site():
-    config = Config()
-    app = application(config)
-    return Root(config, app)
-
-
 class TestWebsite:
-    def test_render_jobs(self, txrequest, scrapyd_site):
-        content = scrapyd_site.children[b'jobs'].render(txrequest)
+    def test_render_jobs(self, txrequest, site_no_egg):
+        content = site_no_egg.children[b'jobs'].render(txrequest)
         expect_headers = {
             b'Content-Type': [b'text/html; charset=utf-8'],
             b'Content-Length': [b'643']
@@ -35,8 +10,8 @@ def test_render_jobs(self, txrequest, scrapyd_site):
         initial = '<html><head><title>Scrapyd</title><style type="text/css">#jobs>thead td {text-align: center; font-weight'
         assert content.decode().startswith(initial)
 
-    def test_render_home(self, txrequest, scrapyd_site):
-        content = scrapyd_site.children[b''].render_GET(txrequest)
+    def test_render_home(self, txrequest, site_no_egg):
+        content = site_no_egg.children[b''].render_GET(txrequest)
         assert b'Available projects' in content
         headers = dict(txrequest.responseHeaders.getAllRawHeaders())
         assert headers[b'Content-Type'] == [b'text/html; charset=utf-8']
7 changes: 7 additions & 0 deletions scrapyd/utils.py
@@ -1,5 +1,6 @@
 import sys
 import os
+import re
 from .sqlite import JsonSqliteDict
 from subprocess import Popen, PIPE
 import six
@@ -154,3 +155,9 @@ def _to_native_str(text, encoding='utf-8', errors='strict'):
         return text.encode(encoding, errors)
     else:
         return text.decode(encoding, errors)
+
+
+def check_disallowed_characters(text):
+    # Any character outside of " A-Z a-z 0-9 . whitespace _ - is banned.
+    disallowed_characters_regex = r'[^\"A-Za-z0-9.\s_-]+'
+    return not re.search(disallowed_characters_regex, text)
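A sketch of how an endpoint could apply this check before touching storage follows; the helper is hypothetical (the PR's actual wiring in webservice.py is not part of this diff), but the 400 error matches what the tests above expect:

from twisted.web.error import Error

def get_required_project(args):
    # Hypothetical helper: extract the 'project' argument from a request
    # and reject the request with a 400 if it contains banned characters.
    project = args[b'project'][0].decode('utf-8')
    if not check_disallowed_characters(project):
        raise Error(400, b'project name contains disallowed characters')
    return project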