test fixes related to build server setup (#8)
* make tests pass
* decode check_output result
jesterhazy authored Apr 16, 2018
1 parent c537177 commit be6d181
Showing 11 changed files with 774 additions and 103 deletions.
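
The second commit bullet ("decode check_output result") points at a Python 3 compatibility fix: subprocess.check_output returns bytes, so its result has to be decoded before any string handling. A minimal sketch of that kind of fix (illustrative only; the helper actually touched lives in a file not shown in this excerpt):

import subprocess

# On Python 3, check_output returns bytes; decode before splitting or comparing strings.
output = subprocess.check_output(['docker', 'images', '-q'])
image_ids = output.decode('utf-8').strip().splitlines()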
1 change: 0 additions & 1 deletion setup.cfg
@@ -1,7 +1,6 @@
[tool:pytest]
addopts =
    --verbose
    --ignore=container-tests

[bdist_wheel]
universal=1
3 changes: 2 additions & 1 deletion setup.py
@@ -35,6 +35,7 @@ def read(fname):

    install_requires=['sagemaker-container-support', 'torch'],
    extras_require={
-        'test': ['tox', 'flake8', 'pytest', 'mock', 'sagemaker']
+        'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist', 'mock', 'Flask', 'boto3>=1.4.8',
+                 'docker-compose', 'nvidia-docker-compose', 'sagemaker', 'PyYAML']
    },
)
Empty file added test/__init__.py
Empty file.
115 changes: 114 additions & 1 deletion test/conftest.py
@@ -10,14 +10,127 @@
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import os
from os.path import join
import logging
import platform
import pytest
import shutil
import sys
import tempfile


from test.utils import local_mode

logger = logging.getLogger(__name__)
logging.getLogger('boto').setLevel(logging.INFO)
logging.getLogger('botocore').setLevel(logging.INFO)
logging.getLogger('factory.py').setLevel(logging.INFO)
logging.getLogger('auth.py').setLevel(logging.INFO)
logging.getLogger('connectionpool.py').setLevel(logging.INFO)


dir_path = os.path.dirname(os.path.realpath(__file__))


def pytest_addoption(parser):
-    parser.addoption('--build-image', '-B', action="store_true")
+    parser.addoption('--build-image', '-D', action="store_true")
    parser.addoption('--build-base-image', '-B', action="store_true")
    parser.addoption('--install-container-support', '-C', action="store_true")
    parser.addoption('--docker-base-name', default='pytorch')
    parser.addoption('--region', default='us-west-2')
    parser.addoption('--framework-version', default='0.3.1')
    parser.addoption('--py-version', choices=['2', '3'], default='2')
    parser.addoption('--processor', choices=['gpu','cpu'], default='cpu')
    # If not specified, will default to {framework-version}-{processor}-py{py-version}
    parser.addoption('--tag', default=None)


@pytest.fixture(scope='session')
def docker_base_name(request):
    return request.config.getoption('--docker-base-name')


@pytest.fixture(scope='session')
def region(request):
    return request.config.getoption('--region')


@pytest.fixture(scope='session')
def framework_version(request):
    return request.config.getoption('--framework-version')


@pytest.fixture(scope='session')
def py_version(request):
    return 'py{}'.format(int(request.config.getoption('--py-version')))


@pytest.fixture(scope='session')
def processor(request):
    return request.config.getoption('--processor')


@pytest.fixture(scope='session')
def tag(request, framework_version, processor, py_version):
    provided_tag = request.config.getoption('--tag')
    default_tag = '{}-{}-{}'.format(framework_version, processor, py_version)
    return provided_tag if provided_tag else default_tag


@pytest.fixture(scope='session')
def docker_image(docker_base_name, tag):
    return '{}:{}'.format(docker_base_name, tag)


@pytest.fixture
def opt_ml():
    tmp = tempfile.mkdtemp()
    os.mkdir(os.path.join(tmp, 'output'))

    # Docker cannot mount Mac OS /var folder properly see
    # https://forums.docker.com/t/var-folders-isnt-mounted-properly/9600
    opt_ml_dir = '/private{}'.format(tmp) if platform.system() == 'Darwin' else tmp
    yield opt_ml_dir

    shutil.rmtree(tmp, True)


@pytest.fixture(scope='session')
def use_gpu(processor):
    return processor == 'gpu'


@pytest.fixture(scope='session', autouse=True)
def install_container_support(request):
    install = request.config.getoption('--install-container-support')
    if install:
        local_mode.install_container_support()


@pytest.fixture(scope='session', autouse=True)
def build_base_image(request, framework_version, py_version, processor, tag, docker_base_name):
    build_base_image = request.config.getoption('--build-base-image')
    if build_base_image:
        return local_mode.build_base_image(framework_name=docker_base_name,
                                           framework_version=framework_version,
                                           py_version=py_version,
                                           base_image_tag=tag,
                                           processor=processor,
                                           cwd=os.path.join(dir_path, '..'))

    return tag


@pytest.fixture(scope='session', autouse=True)
def build_image(request, framework_version, py_version, processor, tag, docker_base_name):
    build_image = request.config.getoption('--build-image')
    if build_image:
        return local_mode.build_image(framework_name=docker_base_name,
                                      framework_version=framework_version,
                                      py_version=py_version,
                                      processor=processor,
                                      tag=tag,
                                      cwd=os.path.join(dir_path, '..'))

    return tag
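
With these options and fixtures in place, a local run of the integration tests might look like the following (a hedged sketch; the flag values and test path are assumptions, not part of this commit):

# Hypothetical invocation from the repository root; the conftest.py above defines the options.
import pytest

pytest.main([
    'test/integ',
    '--build-base-image',        # -B: build the framework base image first
    '--build-image',             # -D: build the final image used by the tests
    '--py-version', '3',
    '--processor', 'gpu',
    '--docker-base-name', 'pytorch',
])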
54 changes: 0 additions & 54 deletions test/integ/conftest.py

This file was deleted.

53 changes: 27 additions & 26 deletions test/integ/test_distributed_training.py
@@ -13,14 +13,15 @@
import os
import pytest
import torch
-import utils
+import test.utils
+from test.utils import local_mode

-dir_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'resources')
-data_dir = os.path.join(dir_path, 'mnist', 'data')
+resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'resources'))
+mnist_path = os.path.join(resources_path, 'mnist')
+mnist_script = os.path.join(mnist_path, 'mnist.py')
+data_dir = os.path.join(mnist_path, 'data')
training_dir = os.path.join(data_dir, 'training')

-mnist_script = os.path.join(dir_path, 'mnist', 'mnist.py')
-dist_operations = os.path.join(dir_path, 'distributed_operations.py')
+dist_operations_path = os.path.join(resources_path, 'distributed_operations.py')

ENTRYPOINT = ["python", "-m", "pytorch_container.start"]

@@ -35,39 +36,39 @@ def fixture_dist_gpu_backend(request):
    return request.param


-def test_dist_operations_cpu(region, image_name, opt_ml, dist_cpu_backend):
-    utils.train(region, dist_operations, data_dir, image_name(), opt_ml, entrypoint=ENTRYPOINT, cluster_size=3,
+def test_dist_operations_path_cpu(docker_image, opt_ml, dist_cpu_backend):
+    local_mode.train(dist_operations_path, data_dir, docker_image, opt_ml, entrypoint=ENTRYPOINT, cluster_size=3,
                     hyperparameters={'backend': dist_cpu_backend})

-    assert utils.file_exists(opt_ml, 'model/success'), 'Script success file was not created'
-    assert utils.file_exists(opt_ml, 'output/success'), 'Success file was not created'
-    assert not utils.file_exists(opt_ml, 'output/failure'), 'Failure happened'
+    assert local_mode.file_exists(opt_ml, 'model/success'), 'Script success file was not created'
+    assert local_mode.file_exists(opt_ml, 'output/success'), 'Success file was not created'
+    assert not local_mode.file_exists(opt_ml, 'output/failure'), 'Failure happened'


@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda is not available")
-def test_dist_operations_gpu(region, image_name, opt_ml, dist_gpu_backend):
-    utils.train(region, dist_operations, data_dir, image_name(device='gpu'), opt_ml, entrypoint=ENTRYPOINT, cluster_size=3,
+def test_dist_operations_path_gpu(docker_image, opt_ml, dist_gpu_backend):
+    local_mode.train(dist_operations_path, data_dir, docker_image, opt_ml, entrypoint=ENTRYPOINT, cluster_size=3,
                     use_gpu=True, hyperparameters={'backend': dist_gpu_backend})

-    assert utils.file_exists(opt_ml, 'model/success'), 'Script success file was not created'
-    assert utils.file_exists(opt_ml, 'output/success'), 'Success file was not created'
-    assert not utils.file_exists(opt_ml, 'output/failure'), 'Failure happened'
+    assert local_mode.file_exists(opt_ml, 'model/success'), 'Script success file was not created'
+    assert local_mode.file_exists(opt_ml, 'output/success'), 'Success file was not created'
+    assert not local_mode.file_exists(opt_ml, 'output/failure'), 'Failure happened'


-def test_mnist_cpu(region, image_name, opt_ml, dist_cpu_backend):
-    utils.train(region, mnist_script, data_dir, image_name(), opt_ml, entrypoint=ENTRYPOINT, cluster_size=2,
+def test_mnist_cpu(docker_image, opt_ml, dist_cpu_backend):
+    local_mode.train(mnist_script, data_dir, docker_image, opt_ml, entrypoint=ENTRYPOINT, cluster_size=2,
                     hyperparameters={'backend': dist_cpu_backend})

-    assert utils.file_exists(opt_ml, 'model/model'), 'Model file was not created'
-    assert utils.file_exists(opt_ml, 'output/success'), 'Success file was not created'
-    assert not utils.file_exists(opt_ml, 'output/failure'), 'Failure happened'
+    assert local_mode.file_exists(opt_ml, 'model/model'), 'Model file was not created'
+    assert local_mode.file_exists(opt_ml, 'output/success'), 'Success file was not created'
+    assert not local_mode.file_exists(opt_ml, 'output/failure'), 'Failure happened'


@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda is not available")
-def test_mnist_gpu(region, image_name, opt_ml, dist_gpu_backend):
-    utils.train(region, mnist_script, data_dir, image_name(device='gpu'), opt_ml, entrypoint=ENTRYPOINT, cluster_size=2,
+def test_mnist_gpu(docker_image, opt_ml, dist_gpu_backend):
+    local_mode.train(mnist_script, data_dir, docker_image, opt_ml, entrypoint=ENTRYPOINT, cluster_size=2,
                     use_gpu=True, hyperparameters={'backend': dist_gpu_backend})

-    assert utils.file_exists(opt_ml, 'model/model'), 'Model file was not created'
-    assert utils.file_exists(opt_ml, 'output/success'), 'Success file was not created'
-    assert not utils.file_exists(opt_ml, 'output/failure'), 'Failure happened'
+    assert local_mode.file_exists(opt_ml, 'model/model'), 'Model file was not created'
+    assert local_mode.file_exists(opt_ml, 'output/success'), 'Success file was not created'
+    assert not local_mode.file_exists(opt_ml, 'output/failure'), 'Failure happened'
30 changes: 12 additions & 18 deletions test/integ/test_single_machine_training.py
@@ -13,33 +13,27 @@
import os
import pytest
import torch
-import utils
+from test.utils import local_mode
+from test.utils.local_mode import request
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

-dir_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'resources', 'mnist')
-data_dir = os.path.join(dir_path, 'data')
+mnist_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'resources', 'mnist'))
+data_dir = os.path.join(mnist_path, 'data')

training_dir = os.path.join(data_dir, 'training')

-mnist_script = os.path.join(dir_path, 'mnist.py')
+mnist_script = 'mnist.py'

ENTRYPOINT = ["python", "-m", "pytorch_container.start"]


-def test_mnist_cpu(region, image_name, opt_ml):
-    utils.train(region, mnist_script, data_dir, image_name(), opt_ml, entrypoint=ENTRYPOINT)
-
-    assert utils.file_exists(opt_ml, 'model/model'), 'Model file was not created'
-    assert utils.file_exists(opt_ml, 'output/success'), 'Success file was not created'
-    assert not utils.file_exists(opt_ml, 'output/failure'), 'Failure happened'
-
-
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="cuda is not available")
-def test_mnist_gpu(region, image_name, opt_ml):
-    utils.train(region, mnist_script, data_dir, image_name(device='gpu'), opt_ml, use_gpu=True, entrypoint=ENTRYPOINT)
+def test_mnist_cpu(docker_image, opt_ml, use_gpu):
+    local_mode.train(mnist_script, data_dir, docker_image, opt_ml,
+                     source_dir=mnist_path, use_gpu=use_gpu, entrypoint=ENTRYPOINT)

-    assert utils.file_exists(opt_ml, 'model/model'), 'Model file was not created'
-    assert utils.file_exists(opt_ml, 'output/success'), 'Success file was not created'
-    assert not utils.file_exists(opt_ml, 'output/failure'), 'Failure happened'
+    assert local_mode.file_exists(opt_ml, 'model/model'), 'Model file was not created'
+    assert local_mode.file_exists(opt_ml, 'output/success'), 'Success file was not created'
+    assert not local_mode.file_exists(opt_ml, 'output/failure'), 'Failure happened'
4 changes: 2 additions & 2 deletions test/integ/utils.py
@@ -79,10 +79,10 @@ def save_as_json(data, filename):
        json.dump(data, f)


-def train(region, customer_script, data_dir, image_name, opt_ml, cluster_size=1, hyperparameters={}, additional_volumes=[],
+def train(region, customer_script, data_dir, docker_image, opt_ml, cluster_size=1, hyperparameters={}, additional_volumes=[],
          additional_env_vars=[], use_gpu=False, entrypoint=ENTRYPOINT, source_dir=None):
    print("training")
-    tmpdir = create_training(region, data_dir, customer_script, opt_ml, image_name, additional_volumes, additional_env_vars,
+    tmpdir = create_training(region, data_dir, customer_script, opt_ml, docker_image, additional_volumes, additional_env_vars,
                             hyperparameters, cluster_size, entrypoint=entrypoint, source_dir=source_dir)
    command = create_docker_command(tmpdir, use_gpu)
    start_docker(tmpdir, command)
Empty file added test/utils/__init__.py
Empty file.
12 changes: 12 additions & 0 deletions test/utils/csv_parser.py
@@ -0,0 +1,12 @@
import numpy as np
from six import StringIO

def loads(data):
    stream = StringIO(data)
    return np.genfromtxt(stream, dtype=np.float32, delimiter=',')


def dumps(data):
    stream = StringIO()
    np.savetxt(stream, data, delimiter=',', fmt='%s')
    return stream.getvalue()
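
For context, a quick round-trip sketch of how these two helpers behave (hypothetical usage, not part of the diff):

import numpy as np
from test.utils import csv_parser

array = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
payload = csv_parser.dumps(array)     # CSV text, e.g. '1.0,2.0\n3.0,4.0\n'
restored = csv_parser.loads(payload)  # parsed back into a float32 ndarray
assert np.allclose(array, restored)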
