diff --git a/aztk/internal/docker_cmd.py b/aztk/internal/docker_cmd.py index 818d6d96..83eccfb6 100644 --- a/aztk/internal/docker_cmd.py +++ b/aztk/internal/docker_cmd.py @@ -7,7 +7,7 @@ class DockerCmd: Class helping to write a docker command """ - def __init__(self, name: str, docker_repo: str, cmd: str, gpu_enabled=False): + def __init__(self, name: str, docker_repo: str, docker_run_options: str, cmd: str, gpu_enabled=False): if gpu_enabled: self.cmd = CommandBuilder('nvidia-docker run') else: @@ -15,6 +15,7 @@ def __init__(self, name: str, docker_repo: str, cmd: str, gpu_enabled=False): self.cmd.add_option('--net', 'host') self.cmd.add_option('--name', name) self.cmd.add_argument('-d') + self.cmd.add_argument(docker_run_options) self.cmd.add_argument(docker_repo) self.cmd.add_argument(cmd) diff --git a/aztk/models/cluster_configuration.py b/aztk/models/cluster_configuration.py index fdc2166c..d3e57054 100644 --- a/aztk/models/cluster_configuration.py +++ b/aztk/models/cluster_configuration.py @@ -1,13 +1,13 @@ import aztk.error as error from aztk.core.models import Model, fields -from aztk.utils import deprecated, deprecate, helpers +from aztk.utils import deprecate, deprecated, helpers from .custom_script import CustomScript from .file_share import FileShare from .plugins import PluginConfiguration +from .scheduling_target import SchedulingTarget from .toolkit import Toolkit from .user_configuration import UserConfiguration -from .scheduling_target import SchedulingTarget class ClusterConfiguration(Model): @@ -85,10 +85,13 @@ def gpu_enabled(self): def get_docker_repo(self): return self.toolkit.get_docker_repo(self.gpu_enabled()) + def get_docker_run_options(self) -> str: + return self.toolkit.get_docker_run_options() + def __validate__(self) -> bool: if self.size == 0 and self.size_low_priority == 0: raise error.InvalidModelError( - "Please supply a valid (greater than 0) size or size_low_priority value either in the cluster.yaml configuration file or with a parameter (--size or --size-low-pri)" + "Please supply a valid (greater than 0) size or size_low_priority value either in the cluster.yaml configuration file or with a parameter (--size or --size-low-priority)" ) if self.vm_size is None: @@ -97,7 +100,7 @@ def __validate__(self) -> bool: if self.mixed_mode() and not self.subnet_id: raise error.InvalidModelError( - "You must configure a VNET to use AZTK in mixed mode (dedicated and low priority nodes). Set the VNET's subnet_id in your cluster.yaml." + "You must configure a VNET to use AZTK in mixed mode (dedicated and low priority nodes). Set the VNET's subnet_id in your cluster.yaml or with a parameter (--subnet-id)." ) if self.custom_scripts: diff --git a/aztk/models/toolkit.py b/aztk/models/toolkit.py index 0f78d9e7..2d9822d4 100644 --- a/aztk/models/toolkit.py +++ b/aztk/models/toolkit.py @@ -1,6 +1,8 @@ +import re + +from aztk.core.models import Model, fields from aztk.error import InvalidModelError from aztk.utils import constants, deprecate -from aztk.core.models import Model, fields class ToolkitDefinition: @@ -38,6 +40,7 @@ class Toolkit(Model): environment (str): Which environment to use for this toolkit environment_version (str): If there is multiple version for an environment you can specify which one docker_repo (str): Optional docker repo + docker_run_options (str): Optional command-line options for `docker run` """ software = fields.String() @@ -45,6 +48,7 @@ class Toolkit(Model): environment = fields.String(default=None) environment_version = fields.String(default=None) docker_repo = fields.String(default=None) + docker_run_options = fields.String(default=None) def __validate__(self): if self.software not in TOOLKIT_MAP: @@ -71,6 +75,14 @@ def __validate__(self): "Environment '{0}' version '{1}' for toolkit '{2}' is not available. Use one of: {3}".format( self.environment, self.environment_version, self.software, env_def.versions)) + if self.docker_run_options: + invalid_character = re.search('[^A-Za-z0-9 _./:=\-\"]', self.docker_run_options) + if invalid_character: + raise InvalidModelError( + "Docker run options contains invalid character '{0}'. Only A-Z, a-z, 0-9, space, hyphen (-), " + "underscore (_), period (.), forward slash (/), colon (:), equals(=), comma (,), and " + "double quote (\") are allowed.".format(invalid_character.group(0))) + def get_docker_repo(self, gpu: bool): if self.docker_repo: return self.docker_repo @@ -82,6 +94,9 @@ def get_docker_repo(self, gpu: bool): tag=self._get_docker_tag(gpu), ) + def get_docker_run_options(self): + return self.docker_run_options + def _get_docker_tag(self, gpu: bool): environment = self.environment or "base" environment_def = self._get_environment_definition() diff --git a/aztk/node_scripts/install/install.py b/aztk/node_scripts/install/install.py index b8548f99..e8d29f1e 100644 --- a/aztk/node_scripts/install/install.py +++ b/aztk/node_scripts/install/install.py @@ -1,9 +1,11 @@ import os -from core import config -from install import pick_master, spark, scripts, create_user, plugins, spark_container + import wait_until_master_selected -from aztk.models.plugins import PluginTarget from aztk.internal import cluster_data +from aztk.models.plugins import PluginTarget +from core import config +from install import (create_user, pick_master, plugins, scripts, spark, spark_container) + from .node_scheduling import setup_node_scheduling @@ -14,9 +16,11 @@ def read_cluster_config(): return cluster_config -def setup_host(docker_repo: str): +def setup_host(docker_repo: str, docker_run_options: str): """ - Code to be run on the node(NOT in a container) + Code to be run on the node (NOT in a container) + :param docker_repo: location of the Docker image to use + :param docker_run_options: additional command-line options to pass to docker run """ client = config.batch_client @@ -49,6 +53,7 @@ def setup_host(docker_repo: str): #TODO pass azure file shares spark_container.start_spark_container( docker_repo=docker_repo, + docker_run_options=docker_run_options, gpu_enabled=os.environ.get("AZTK_GPU_ENABLED") == "true", plugins=cluster_conf.plugins, ) diff --git a/aztk/node_scripts/install/spark_container.py b/aztk/node_scripts/install/spark_container.py index 13120679..84565f52 100644 --- a/aztk/node_scripts/install/spark_container.py +++ b/aztk/node_scripts/install/spark_container.py @@ -4,11 +4,16 @@ from aztk.utils import constants -def start_spark_container(docker_repo: str = None, gpu_enabled: bool = False, file_mounts=None, plugins=None): +def start_spark_container(docker_repo: str = None, + docker_run_options: str = None, + gpu_enabled: bool = False, + file_mounts=None, + plugins=None): cmd = DockerCmd( name=constants.DOCKER_SPARK_CONTAINER_NAME, docker_repo=docker_repo, + docker_run_options=docker_run_options, cmd="/bin/bash /mnt/batch/tasks/startup/wd/aztk/node_scripts/docker_main.sh", gpu_enabled=gpu_enabled) diff --git a/aztk/node_scripts/main.py b/aztk/node_scripts/main.py index eb9079b1..2ccd8d54 100644 --- a/aztk/node_scripts/main.py +++ b/aztk/node_scripts/main.py @@ -11,7 +11,7 @@ def run(): action = sys.argv[1] if action == "setup-node": - install.setup_host(sys.argv[2]) + install.setup_host(sys.argv[2], sys.argv[3]) elif action == "setup-spark-container": install.setup_spark_container() else: diff --git a/aztk/node_scripts/setup_host.sh b/aztk/node_scripts/setup_host.sh index c23ff0dd..2a8dbfca 100644 --- a/aztk/node_scripts/setup_host.sh +++ b/aztk/node_scripts/setup_host.sh @@ -2,7 +2,7 @@ # Entry point for the start task. It will install all dependencies and start docker. # Usage: -# setup_host.sh [container_name] [docker_repo_name] +# setup_host.sh [container_name] [docker_repo_name] [docker_run_options] set -e export AZTK_WORKING_DIR=/mnt/batch/tasks/startup/wd @@ -10,6 +10,7 @@ export PYTHONUNBUFFERED=TRUE container_name=$1 docker_repo_name=$2 +docker_run_options=$3 install_prerequisites () { echo "Installing pre-reqs" @@ -79,7 +80,7 @@ run_docker_container () { echo "Creating docker container." echo "Running setup python script" - $AZTK_WORKING_DIR/.aztk-env/.venv/bin/python $(dirname $0)/main.py setup-node $docker_repo_name + $AZTK_WORKING_DIR/.aztk-env/.venv/bin/python $(dirname $0)/main.py setup-node $docker_repo_name "$docker_run_options" # wait until container is running until [ "`/usr/bin/docker inspect -f {{.State.Running}} $container_name`"=="true" ]; do diff --git a/aztk/spark/client/base/helpers/generate_cluster_start_task.py b/aztk/spark/client/base/helpers/generate_cluster_start_task.py index 0b100bf1..e3670b95 100644 --- a/aztk/spark/client/base/helpers/generate_cluster_start_task.py +++ b/aztk/spark/client/base/helpers/generate_cluster_start_task.py @@ -63,6 +63,7 @@ def __get_secrets_env(core_base_operations): def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile, gpu_enabled: bool, docker_repo: str = None, + docker_run_options: str = None, plugins=None, worker_on_master: bool = True, file_mounts=None, @@ -93,9 +94,10 @@ def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile, 'unzip -o $AZ_BATCH_TASK_WORKING_DIR/{0};'\ 'chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh;'\ ') 2>&1'.format(zip_resource_file.file_path), - '/bin/bash $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh {0} {1}'.format( + '/bin/bash $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh {0} {1} "{2}"'.format( constants.DOCKER_SPARK_CONTAINER_NAME, docker_repo, + "" if docker_run_options is None else docker_run_options.replace('"', '\\\"') ) ] @@ -108,6 +110,7 @@ def generate_cluster_start_task(core_base_operations, cluster_id: str, gpu_enabled: bool, docker_repo: str = None, + docker_run_options: str = None, file_shares: List[models.FileShare] = None, plugins: List[models.PluginConfiguration] = None, mixed_mode: bool = False, @@ -137,8 +140,8 @@ def generate_cluster_start_task(core_base_operations, ] + __get_docker_credentials(core_base_operations) + _get_aztk_environment(cluster_id, worker_on_master, mixed_mode) # start task command - command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, plugins, worker_on_master, file_shares, - mixed_mode) + command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, docker_run_options, plugins, + worker_on_master, file_shares, mixed_mode) return batch_models.StartTask( command_line=helpers.wrap_commands_in_shell(command), diff --git a/aztk/spark/client/base/operations.py b/aztk/spark/client/base/operations.py index 7a020f5e..ef95e75d 100644 --- a/aztk/spark/client/base/operations.py +++ b/aztk/spark/client/base/operations.py @@ -5,7 +5,7 @@ from aztk.client.base import BaseOperations as CoreBaseOperations from aztk.spark import models -from .helpers import generate_cluster_start_task, generate_application_task +from .helpers import generate_application_task, generate_cluster_start_task class SparkBaseOperations: @@ -19,6 +19,7 @@ def _generate_cluster_start_task(self, id: str, gpu_enabled: bool, docker_repo: str = None, + docker_run_options: str = None, file_shares: List[models.FileShare] = None, plugins: List[models.PluginConfiguration] = None, mixed_mode: bool = False, @@ -44,9 +45,9 @@ def _generate_cluster_start_task(self, Returns: :obj:`azure.batch.models.StartTask`: the StartTask definition to provision the cluster. """ - return generate_cluster_start_task.generate_cluster_start_task(core_base_operations, zip_resource_file, id, - gpu_enabled, docker_repo, file_shares, plugins, - mixed_mode, worker_on_master) + return generate_cluster_start_task.generate_cluster_start_task( + core_base_operations, zip_resource_file, id, gpu_enabled, docker_repo, docker_run_options, file_shares, + plugins, mixed_mode, worker_on_master) #TODO: make this private or otherwise not public def _generate_application_task(self, core_base_operations, container_id, application, remote=False): diff --git a/aztk/spark/client/cluster/helpers/create.py b/aztk/spark/client/cluster/helpers/create.py index b2580a3e..e72b0880 100644 --- a/aztk/spark/client/cluster/helpers/create.py +++ b/aztk/spark/client/cluster/helpers/create.py @@ -53,8 +53,8 @@ def create_cluster(core_cluster_operations, start_task = spark_cluster_operations._generate_cluster_start_task( core_cluster_operations, zip_resource_files, cluster_conf.cluster_id, cluster_conf.gpu_enabled(), - cluster_conf.get_docker_repo(), cluster_conf.file_shares, cluster_conf.plugins, cluster_conf.mixed_mode(), - cluster_conf.worker_on_master) + cluster_conf.get_docker_repo(), cluster_conf.get_docker_run_options(), cluster_conf.file_shares, + cluster_conf.plugins, cluster_conf.mixed_mode(), cluster_conf.worker_on_master) software_metadata_key = base_models.Software.spark diff --git a/aztk/spark/client/job/helpers/submit.py b/aztk/spark/client/job/helpers/submit.py index 01b2965a..98e8af67 100644 --- a/aztk/spark/client/job/helpers/submit.py +++ b/aztk/spark/client/job/helpers/submit.py @@ -81,6 +81,7 @@ def submit_job(core_job_operations, job_configuration.id, job_configuration.gpu_enabled, job_configuration.get_docker_repo(), + job_configuration.get_docker_run_options(), mixed_mode=job_configuration.mixed_mode(), worker_on_master=job_configuration.worker_on_master) diff --git a/aztk/spark/helpers/create_cluster.py b/aztk/spark/helpers/create_cluster.py index e917e687..ade2a042 100644 --- a/aztk/spark/helpers/create_cluster.py +++ b/aztk/spark/helpers/create_cluster.py @@ -59,6 +59,7 @@ def __get_secrets_env(spark_client): def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile, gpu_enabled: bool, docker_repo: str = None, + docker_run_options: str = None, plugins=None, worker_on_master: bool = True, file_mounts=None, @@ -69,6 +70,7 @@ def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile, """ default_docker_repo = constants.DEFAULT_DOCKER_REPO if not gpu_enabled else constants.DEFAULT_DOCKER_REPO_GPU docker_repo = docker_repo or default_docker_repo + docker_run_options = docker_run_options or "" shares = [] @@ -89,9 +91,10 @@ def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile, 'unzip -o $AZ_BATCH_TASK_WORKING_DIR/{0};'\ 'chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh;'\ ') 2>&1'.format(zip_resource_file.file_path), - '/bin/bash $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh {0} {1}'.format( + '/bin/bash $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh {0} {1} "{2}"'.format( constants.DOCKER_SPARK_CONTAINER_NAME, docker_repo, + docker_run_options.replace('"', '\\\"') ) ] @@ -104,6 +107,7 @@ def generate_cluster_start_task(spark_client, cluster_id: str, gpu_enabled: bool, docker_repo: str = None, + docker_run_options: str = None, file_shares: List[aztk_models.FileShare] = None, plugins: List[aztk_models.PluginConfiguration] = None, mixed_mode: bool = False, @@ -133,8 +137,8 @@ def generate_cluster_start_task(spark_client, ] + __get_docker_credentials(spark_client) + _get_aztk_environment(cluster_id, worker_on_master, mixed_mode) # start task command - command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, plugins, worker_on_master, file_shares, - mixed_mode) + command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, docker_run_options, plugins, + worker_on_master, file_shares, mixed_mode) return batch_models.StartTask( command_line=helpers.wrap_commands_in_shell(command), diff --git a/aztk/spark/models/models.py b/aztk/spark/models/models.py index 87e0d83a..909b84e7 100644 --- a/aztk/spark/models/models.py +++ b/aztk/spark/models/models.py @@ -236,6 +236,9 @@ def mixed_mode(self) -> bool: def get_docker_repo(self) -> str: return self.toolkit.get_docker_repo(self.gpu_enabled) + def get_docker_run_options(self) -> str: + return self.toolkit.get_docker_run_options() + def validate(self) -> bool: """ Validate the config at its current state. diff --git a/aztk_cli/config/cluster.yaml b/aztk_cli/config/cluster.yaml index e31a9c08..9cb26d23 100644 --- a/aztk_cli/config/cluster.yaml +++ b/aztk_cli/config/cluster.yaml @@ -14,6 +14,8 @@ toolkit: # Optional docker repository(To bring your custom docker image. Just specify the Toolkit software, version and environment if using default images) # docker_repo: + # Optional command line options to pass to `docker run` + # docker_run_options: # vm_size: vm_size: standard_f2 diff --git a/aztk_cli/config/job.yaml b/aztk_cli/config/job.yaml index 6a311213..c0800fbd 100644 --- a/aztk_cli/config/job.yaml +++ b/aztk_cli/config/job.yaml @@ -22,6 +22,9 @@ job: # Optional docker repository(To bring your custom docker image. Just specify the Toolkit software, version and environment if using default images) # docker_repo: + # Optional command line options to pass to `docker run` + # docker_run_options: + # Where do you want to run the driver (Default: dedicated if at least one dedicated node or any otherwise) # scheduling_target: dedicated diff --git a/aztk_cli/spark/endpoints/cluster/cluster_create.py b/aztk_cli/spark/endpoints/cluster/cluster_create.py index c0e37bc2..16e7e4ff 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_create.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_create.py @@ -20,12 +20,10 @@ def setup_parser(parser: argparse.ArgumentParser): parser.add_argument('--username', help='Username to access your cluster (required: --wait flag)') parser.add_argument( '--password', - help="The password to access your spark cluster's head \ - node. If not provided will use ssh public key.") + help="The password to access your spark cluster's head node. If not provided will use ssh public key.") parser.add_argument( - '--docker-repo', - help='The location of the public docker image you want to use \ - (/:)') + '--docker-repo', help='The location of the public docker image you want to use (/:)') + parser.add_argument('--docker-run-options', help='command line options to pass to `docker run`') parser.add_argument('--subnet-id', help='The subnet in which to create the cluster.') parser.add_argument('--no-wait', dest='wait', action='store_false') @@ -57,8 +55,11 @@ def execute(args: typing.NamedTuple): password=args.password, ))) - if args.docker_repo and cluster_conf.toolkit: - cluster_conf.toolkit.docker_repo = args.docker_repo + if cluster_conf.toolkit: + if args.docker_repo: + cluster_conf.toolkit.docker_repo = args.docker_repo + if args.docker_run_options: + cluster_conf.toolkit.docker_run_options = args.docker_run_options wait = wait if args.wait is None else args.wait diff --git a/aztk_cli/utils.py b/aztk_cli/utils.py index 31f40ae4..363d178a 100644 --- a/aztk_cli/utils.py +++ b/aztk_cli/utils.py @@ -405,6 +405,8 @@ def print_cluster_conf(cluster_conf: ClusterConfiguration, wait: bool): len(cluster_conf.file_shares) if cluster_conf.file_shares is not None else 0) log.info("gpu enabled: %s", str(cluster_conf.gpu_enabled())) log.info("docker repo name: %s", cluster_conf.get_docker_repo()) + if cluster_conf.get_docker_run_options(): + log.info("docker run options: %s", cluster_conf.get_docker_run_options()) log.info("wait for cluster: %s", wait) if user_configuration: log.info("username: %s", user_configuration.username) diff --git a/docs/12-docker-image.md b/docs/12-docker-image.md index bad8d5ed..47f3a3db 100644 --- a/docs/12-docker-image.md +++ b/docs/12-docker-image.md @@ -11,9 +11,16 @@ To select an image other than the default, you can set your Docker image at clus aztk spark cluster create ... --docker-repo ``` -For example, if I wanted to use Spark v2.2.0, I could run the following cluster create command: +To customize Docker configuration, you can pass command line options to the `docker run` command with the optional **--docker-run-options** parameter: + +```sh +aztk spark cluster create ... "--docker-run-options=" +``` + +For example, if I wanted to use Spark v2.2.0 and start my container in privileged mode and with a kernel memory limit of 100MB, +I could run the following cluster create command: ```sh -aztk spark cluster create ... --docker-repo aztk/base:spark1.6.3 +aztk spark cluster create ... --docker-repo aztk/base:spark2.2.0 "--docker-run-options=--privileged --kernel-memory 100m" ``` ## Using a custom Docker Image diff --git a/docs/13-configuration.md b/docs/13-configuration.md index a701ec23..789285f4 100644 --- a/docs/13-configuration.md +++ b/docs/13-configuration.md @@ -22,6 +22,8 @@ toolkit: # Optional docker repository(To bring your custom docker image. Just specify the Toolkit software, version and environment if using default images) # docker_repo: + # Optional command line options to pass to `docker run` + # docker_run_options: # vm_size: vm_size: standard_a2