Skip to content
This repository has been archived by the owner on Feb 3, 2021. It is now read-only.

Feature: Add ability to specify docker run options in toolkit config #613

Merged
merged 8 commits into from
Aug 13, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion aztk/internal/docker_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@ class DockerCmd:
Class helping to write a docker command
"""

def __init__(self, name: str, docker_repo: str, cmd: str, gpu_enabled=False):
def __init__(self, name: str, docker_repo: str, docker_run_options: str, cmd: str, gpu_enabled=False):
    """Build a `docker run` command line for a detached, named container.

    :param name: container name (passed to --name)
    :param docker_repo: image to run (added as the image argument)
    :param docker_run_options: extra command-line options for `docker run`,
        inserted verbatim before the image argument
    :param cmd: command to execute inside the container
    :param gpu_enabled: when True, use `nvidia-docker run` instead of `docker run`
    """
    if gpu_enabled:
        self.cmd = CommandBuilder('nvidia-docker run')
    else:
        self.cmd = CommandBuilder('docker run')
    # Container shares the host's network stack (no port mapping needed).
    self.cmd.add_option('--net', 'host')
    self.cmd.add_option('--name', name)
    self.cmd.add_argument('-d')
    # User-supplied options must come before the image name so that
    # `docker run` parses them as its own options.
    # NOTE(review): assumes CommandBuilder skips a None/empty argument — confirm.
    self.cmd.add_argument(docker_run_options)
    self.cmd.add_argument(docker_repo)
    self.cmd.add_argument(cmd)

Expand Down
11 changes: 7 additions & 4 deletions aztk/models/cluster_configuration.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import aztk.error as error
from aztk.core.models import Model, fields
from aztk.utils import deprecated, deprecate, helpers
from aztk.utils import deprecate, deprecated, helpers

from .custom_script import CustomScript
from .file_share import FileShare
from .plugins import PluginConfiguration
from .scheduling_target import SchedulingTarget
from .toolkit import Toolkit
from .user_configuration import UserConfiguration
from .scheduling_target import SchedulingTarget


class ClusterConfiguration(Model):
Expand Down Expand Up @@ -85,10 +85,13 @@ def gpu_enabled(self):
def get_docker_repo(self):
return self.toolkit.get_docker_repo(self.gpu_enabled())

def get_docker_run_options(self) -> str:
    # Delegate to the toolkit, which owns the docker_run_options setting;
    # may return None when not configured.
    return self.toolkit.get_docker_run_options()

def __validate__(self) -> bool:
if self.size == 0 and self.size_low_priority == 0:
raise error.InvalidModelError(
"Please supply a valid (greater than 0) size or size_low_priority value either in the cluster.yaml configuration file or with a parameter (--size or --size-low-pri)"
"Please supply a valid (greater than 0) size or size_low_priority value either in the cluster.yaml configuration file or with a parameter (--size or --size-low-priority)"
)

if self.vm_size is None:
Expand All @@ -97,7 +100,7 @@ def __validate__(self) -> bool:

if self.mixed_mode() and not self.subnet_id:
raise error.InvalidModelError(
"You must configure a VNET to use AZTK in mixed mode (dedicated and low priority nodes). Set the VNET's subnet_id in your cluster.yaml."
"You must configure a VNET to use AZTK in mixed mode (dedicated and low priority nodes). Set the VNET's subnet_id in your cluster.yaml or with a parameter (--subnet-id)."
)

if self.custom_scripts:
Expand Down
17 changes: 16 additions & 1 deletion aztk/models/toolkit.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import re

from aztk.core.models import Model, fields
from aztk.error import InvalidModelError
from aztk.utils import constants, deprecate
from aztk.core.models import Model, fields


class ToolkitDefinition:
Expand Down Expand Up @@ -38,13 +40,15 @@ class Toolkit(Model):
environment (str): Which environment to use for this toolkit
environment_version (str): If there is multiple version for an environment you can specify which one
docker_repo (str): Optional docker repo
docker_run_options (str): Optional command-line options for `docker run`
"""

software = fields.String()
version = fields.String()
environment = fields.String(default=None)
environment_version = fields.String(default=None)
docker_repo = fields.String(default=None)
docker_run_options = fields.String(default=None)

def __validate__(self):
if self.software not in TOOLKIT_MAP:
Expand All @@ -71,6 +75,14 @@ def __validate__(self):
"Environment '{0}' version '{1}' for toolkit '{2}' is not available. Use one of: {3}".format(
self.environment, self.environment_version, self.software, env_def.versions))

if self.docker_run_options:
invalid_character = re.search(r'[^A-Za-z0-9 ,_./:=\-"]', self.docker_run_options)
if invalid_character:
raise InvalidModelError(
"Docker run options contains invalid character '{0}'. Only A-Z, a-z, 0-9, space, hyphen (-), "
"underscore (_), period (.), forward slash (/), colon (:), equals (=), comma (,), and "
"double quote (\") are allowed.".format(invalid_character.group(0)))

def get_docker_repo(self, gpu: bool):
if self.docker_repo:
return self.docker_repo
Expand All @@ -82,6 +94,9 @@ def get_docker_repo(self, gpu: bool):
tag=self._get_docker_tag(gpu),
)

def get_docker_run_options(self):
    """Return the extra `docker run` command-line options configured for this
    toolkit, or None when not set (validated in __validate__)."""
    return self.docker_run_options

def _get_docker_tag(self, gpu: bool):
environment = self.environment or "base"
environment_def = self._get_environment_definition()
Expand Down
15 changes: 10 additions & 5 deletions aztk/node_scripts/install/install.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import os
from core import config
from install import pick_master, spark, scripts, create_user, plugins, spark_container

import wait_until_master_selected
from aztk.models.plugins import PluginTarget
from aztk.internal import cluster_data
from aztk.models.plugins import PluginTarget
from core import config
from install import (create_user, pick_master, plugins, scripts, spark, spark_container)

from .node_scheduling import setup_node_scheduling


Expand All @@ -14,9 +16,11 @@ def read_cluster_config():
return cluster_config


def setup_host(docker_repo: str):
def setup_host(docker_repo: str, docker_run_options: str):
"""
Code to be run on the node(NOT in a container)
Code to be run on the node (NOT in a container)
:param docker_repo: location of the Docker image to use
:param docker_run_options: additional command-line options to pass to docker run
"""
client = config.batch_client

Expand Down Expand Up @@ -49,6 +53,7 @@ def setup_host(docker_repo: str):
#TODO pass azure file shares
spark_container.start_spark_container(
docker_repo=docker_repo,
docker_run_options=docker_run_options,
gpu_enabled=os.environ.get("AZTK_GPU_ENABLED") == "true",
plugins=cluster_conf.plugins,
)
Expand Down
7 changes: 6 additions & 1 deletion aztk/node_scripts/install/spark_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,16 @@
from aztk.utils import constants


def start_spark_container(docker_repo: str = None, gpu_enabled: bool = False, file_mounts=None, plugins=None):
def start_spark_container(docker_repo: str = None,
docker_run_options: str = None,
gpu_enabled: bool = False,
file_mounts=None,
plugins=None):

cmd = DockerCmd(
name=constants.DOCKER_SPARK_CONTAINER_NAME,
docker_repo=docker_repo,
docker_run_options=docker_run_options,
cmd="/bin/bash /mnt/batch/tasks/startup/wd/aztk/node_scripts/docker_main.sh",
gpu_enabled=gpu_enabled)

Expand Down
2 changes: 1 addition & 1 deletion aztk/node_scripts/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def run():
action = sys.argv[1]

if action == "setup-node":
install.setup_host(sys.argv[2])
install.setup_host(sys.argv[2], sys.argv[3])
elif action == "setup-spark-container":
install.setup_spark_container()
else:
Expand Down
5 changes: 3 additions & 2 deletions aztk/node_scripts/setup_host.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@

# Entry point for the start task. It will install all dependencies and start docker.
# Usage:
# setup_host.sh [container_name] [docker_repo_name]
# setup_host.sh [container_name] [docker_repo_name] [docker_run_options]
set -e

export AZTK_WORKING_DIR=/mnt/batch/tasks/startup/wd
export PYTHONUNBUFFERED=TRUE

container_name=$1
docker_repo_name=$2
docker_run_options=$3

install_prerequisites () {
echo "Installing pre-reqs"
Expand Down Expand Up @@ -79,7 +80,7 @@ run_docker_container () {
echo "Creating docker container."

echo "Running setup python script"
$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python $(dirname $0)/main.py setup-node $docker_repo_name
$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python $(dirname $0)/main.py setup-node $docker_repo_name "$docker_run_options"

# wait until container is running
until [ "`/usr/bin/docker inspect -f {{.State.Running}} $container_name`"=="true" ]; do
Expand Down
9 changes: 6 additions & 3 deletions aztk/spark/client/base/helpers/generate_cluster_start_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def __get_secrets_env(core_base_operations):
def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile,
gpu_enabled: bool,
docker_repo: str = None,
docker_run_options: str = None,
plugins=None,
worker_on_master: bool = True,
file_mounts=None,
Expand Down Expand Up @@ -93,9 +94,10 @@ def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile,
'unzip -o $AZ_BATCH_TASK_WORKING_DIR/{0};'\
'chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh;'\
') 2>&1'.format(zip_resource_file.file_path),
'/bin/bash $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh {0} {1}'.format(
'/bin/bash $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh {0} {1} "{2}"'.format(
constants.DOCKER_SPARK_CONTAINER_NAME,
docker_repo,
"" if docker_run_options is None else docker_run_options.replace('"', '\\\"')
)
]

Expand All @@ -108,6 +110,7 @@ def generate_cluster_start_task(core_base_operations,
cluster_id: str,
gpu_enabled: bool,
docker_repo: str = None,
docker_run_options: str = None,
file_shares: List[models.FileShare] = None,
plugins: List[models.PluginConfiguration] = None,
mixed_mode: bool = False,
Expand Down Expand Up @@ -137,8 +140,8 @@ def generate_cluster_start_task(core_base_operations,
] + __get_docker_credentials(core_base_operations) + _get_aztk_environment(cluster_id, worker_on_master, mixed_mode)

# start task command
command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, plugins, worker_on_master, file_shares,
mixed_mode)
command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, docker_run_options, plugins,
worker_on_master, file_shares, mixed_mode)

return batch_models.StartTask(
command_line=helpers.wrap_commands_in_shell(command),
Expand Down
9 changes: 5 additions & 4 deletions aztk/spark/client/base/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from aztk.client.base import BaseOperations as CoreBaseOperations
from aztk.spark import models

from .helpers import generate_cluster_start_task, generate_application_task
from .helpers import generate_application_task, generate_cluster_start_task


class SparkBaseOperations:
Expand All @@ -19,6 +19,7 @@ def _generate_cluster_start_task(self,
id: str,
gpu_enabled: bool,
docker_repo: str = None,
docker_run_options: str = None,
file_shares: List[models.FileShare] = None,
plugins: List[models.PluginConfiguration] = None,
mixed_mode: bool = False,
Expand All @@ -44,9 +45,9 @@ def _generate_cluster_start_task(self,
Returns:
:obj:`azure.batch.models.StartTask`: the StartTask definition to provision the cluster.
"""
return generate_cluster_start_task.generate_cluster_start_task(core_base_operations, zip_resource_file, id,
gpu_enabled, docker_repo, file_shares, plugins,
mixed_mode, worker_on_master)
return generate_cluster_start_task.generate_cluster_start_task(
core_base_operations, zip_resource_file, id, gpu_enabled, docker_repo, docker_run_options, file_shares,
plugins, mixed_mode, worker_on_master)

#TODO: make this private or otherwise not public
def _generate_application_task(self, core_base_operations, container_id, application, remote=False):
Expand Down
4 changes: 2 additions & 2 deletions aztk/spark/client/cluster/helpers/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ def create_cluster(core_cluster_operations,

start_task = spark_cluster_operations._generate_cluster_start_task(
core_cluster_operations, zip_resource_files, cluster_conf.cluster_id, cluster_conf.gpu_enabled(),
cluster_conf.get_docker_repo(), cluster_conf.file_shares, cluster_conf.plugins, cluster_conf.mixed_mode(),
cluster_conf.worker_on_master)
cluster_conf.get_docker_repo(), cluster_conf.get_docker_run_options(), cluster_conf.file_shares,
cluster_conf.plugins, cluster_conf.mixed_mode(), cluster_conf.worker_on_master)

software_metadata_key = base_models.Software.spark

Expand Down
1 change: 1 addition & 0 deletions aztk/spark/client/job/helpers/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def submit_job(core_job_operations,
job_configuration.id,
job_configuration.gpu_enabled,
job_configuration.get_docker_repo(),
job_configuration.get_docker_run_options(),
mixed_mode=job_configuration.mixed_mode(),
worker_on_master=job_configuration.worker_on_master)

Expand Down
10 changes: 7 additions & 3 deletions aztk/spark/helpers/create_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def __get_secrets_env(spark_client):
def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile,
gpu_enabled: bool,
docker_repo: str = None,
docker_run_options: str = None,
plugins=None,
worker_on_master: bool = True,
file_mounts=None,
Expand All @@ -69,6 +70,7 @@ def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile,
"""
default_docker_repo = constants.DEFAULT_DOCKER_REPO if not gpu_enabled else constants.DEFAULT_DOCKER_REPO_GPU
docker_repo = docker_repo or default_docker_repo
docker_run_options = docker_run_options or ""

shares = []

Expand All @@ -89,9 +91,10 @@ def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile,
'unzip -o $AZ_BATCH_TASK_WORKING_DIR/{0};'\
'chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh;'\
') 2>&1'.format(zip_resource_file.file_path),
'/bin/bash $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh {0} {1}'.format(
'/bin/bash $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh {0} {1} "{2}"'.format(
constants.DOCKER_SPARK_CONTAINER_NAME,
docker_repo,
docker_run_options.replace('"', '\\\"')
)
]

Expand All @@ -104,6 +107,7 @@ def generate_cluster_start_task(spark_client,
cluster_id: str,
gpu_enabled: bool,
docker_repo: str = None,
docker_run_options: str = None,
file_shares: List[aztk_models.FileShare] = None,
plugins: List[aztk_models.PluginConfiguration] = None,
mixed_mode: bool = False,
Expand Down Expand Up @@ -133,8 +137,8 @@ def generate_cluster_start_task(spark_client,
] + __get_docker_credentials(spark_client) + _get_aztk_environment(cluster_id, worker_on_master, mixed_mode)

# start task command
command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, plugins, worker_on_master, file_shares,
mixed_mode)
command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, docker_run_options, plugins,
worker_on_master, file_shares, mixed_mode)

return batch_models.StartTask(
command_line=helpers.wrap_commands_in_shell(command),
Expand Down
3 changes: 3 additions & 0 deletions aztk/spark/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,9 @@ def mixed_mode(self) -> bool:
def get_docker_repo(self) -> str:
return self.toolkit.get_docker_repo(self.gpu_enabled)

def get_docker_run_options(self) -> str:
    # Delegate to the toolkit, which owns the docker_run_options setting;
    # may return None when not configured.
    return self.toolkit.get_docker_run_options()

def validate(self) -> bool:
"""
Validate the config at its current state.
Expand Down
2 changes: 2 additions & 0 deletions aztk_cli/config/cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ toolkit:
# Optional docker repository (To bring your custom docker image. Just specify the Toolkit software, version and environment if using default images)
# docker_repo: <name of docker image repo (for more information, see https://github.com/Azure/aztk/blob/master/docs/12-docker-image.md)>

# Optional command line options to pass to `docker run`
# docker_run_options: <additional command line options to pass to `docker run` (for more information, see https://github.com/Azure/aztk/blob/master/docs/12-docker-image.md)>

# vm_size: <vm-size, see available options here: https://azure.microsoft.com/en-us/pricing/details/virtual-machines/linux/>
vm_size: standard_f2
Expand Down
3 changes: 3 additions & 0 deletions aztk_cli/config/job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ job:
# Optional docker repository (To bring your custom docker image. Just specify the Toolkit software, version and environment if using default images)
# docker_repo: <name of docker image repo (for more information, see https://github.com/Azure/aztk/blob/master/docs/12-docker-image.md)>

# Optional command line options to pass to `docker run`
# docker_run_options: <additional command line options to pass to `docker run` (for more information, see https://github.com/Azure/aztk/blob/master/docs/12-docker-image.md)>

# Where do you want to run the driver <dedicated/master/any> (Default: dedicated if at least one dedicated node or any otherwise)
# scheduling_target: dedicated

Expand Down
15 changes: 8 additions & 7 deletions aztk_cli/spark/endpoints/cluster/cluster_create.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,10 @@ def setup_parser(parser: argparse.ArgumentParser):
parser.add_argument('--username', help='Username to access your cluster (required: --wait flag)')
parser.add_argument(
'--password',
help="The password to access your spark cluster's head \
node. If not provided will use ssh public key.")
help="The password to access your spark cluster's head node. If not provided will use ssh public key.")
parser.add_argument(
'--docker-repo',
help='The location of the public docker image you want to use \
(<my-username>/<my-repo>:<tag>)')
'--docker-repo', help='The location of the public docker image you want to use (<my-username>/<my-repo>:<tag>)')
parser.add_argument('--docker-run-options', help='command line options to pass to `docker run`')
parser.add_argument('--subnet-id', help='The subnet in which to create the cluster.')

parser.add_argument('--no-wait', dest='wait', action='store_false')
Expand Down Expand Up @@ -57,8 +55,11 @@ def execute(args: typing.NamedTuple):
password=args.password,
)))

if args.docker_repo and cluster_conf.toolkit:
cluster_conf.toolkit.docker_repo = args.docker_repo
if cluster_conf.toolkit:
if args.docker_repo:
cluster_conf.toolkit.docker_repo = args.docker_repo
if args.docker_run_options:
cluster_conf.toolkit.docker_run_options = args.docker_run_options

wait = wait if args.wait is None else args.wait

Expand Down
Loading