From a88f087d316d5984a793acb4318c24088afa9783 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Fri, 22 Jun 2018 17:48:44 -0700 Subject: [PATCH 01/52] start refactor --- .style.yapf | 2 +- aztk/client/__init__.py | 1 + aztk/client/base/__init__.py | 1 + aztk/client/base/client.py | 58 +++++ aztk/client/base/helpers/__init__.py | 0 aztk/client/base/helpers/cluster_run.py | 30 +++ .../base/helpers/create_user_on_node.py | 42 ++++ .../base/helpers/create_user_on_pool.py | 10 + .../base/helpers/delete_user_on_node.py | 9 + .../base/helpers/delete_user_on_pool.py | 7 + .../base/helpers/generate_user_on_node.py | 11 + .../base/helpers/generate_user_on_pool.py | 19 ++ aztk/client/base/helpers/node_run.py | 29 +++ aztk/client/base/helpers/ssh_into_node.py | 20 ++ aztk/client/client.py | 6 + aztk/client/cluster/__init__.py | 0 aztk/client/cluster/client.py | 30 +++ aztk/client/cluster/helpers/__init__.py | 0 aztk/client/cluster/helpers/copy.py | 35 +++ aztk/client/cluster/helpers/create.py | 67 ++++++ aztk/client/cluster/helpers/delete.py | 31 +++ aztk/client/cluster/helpers/get.py | 11 + aztk/client/cluster/helpers/list.py | 19 ++ aztk/client/job/__init__.py | 0 aztk/client/job/client.py | 10 + aztk/client/job/helpers/__init__.py | 0 aztk/client/job/helpers/submit.py | 77 +++++++ aztk/spark/client.py | 2 + aztk/spark/client/__init__.py | 1 + aztk/spark/client/base/__init__.py | 1 + aztk/spark/client/base/client.py | 4 + aztk/spark/client/base/helpers/__init__.py | 0 aztk/spark/client/client.py | 7 + aztk/spark/client/cluster/__init__.py | 1 + aztk/spark/client/cluster/client.py | 9 + aztk/spark/client/cluster/helpers/__init__.py | 0 aztk/spark/client/cluster/helpers/create.py | 203 ++++++++++++++++++ aztk/spark/client/cluster/helpers/delete.py | 10 + aztk/spark/client/cluster/helpers/get.py | 13 ++ aztk/spark/client/cluster/helpers/list.py | 13 ++ aztk/spark/client/cluster/helpers/submit.py | 145 +++++++++++++ aztk/spark/client/job/__init__.py | 1 + aztk/spark/client/job/client.py | 4 + aztk/spark/client/job/helpers/__init__.py | 0 44 files changed, 938 insertions(+), 1 deletion(-) create mode 100644 aztk/client/__init__.py create mode 100644 aztk/client/base/__init__.py create mode 100644 aztk/client/base/client.py create mode 100644 aztk/client/base/helpers/__init__.py create mode 100644 aztk/client/base/helpers/cluster_run.py create mode 100644 aztk/client/base/helpers/create_user_on_node.py create mode 100644 aztk/client/base/helpers/create_user_on_pool.py create mode 100644 aztk/client/base/helpers/delete_user_on_node.py create mode 100644 aztk/client/base/helpers/delete_user_on_pool.py create mode 100644 aztk/client/base/helpers/generate_user_on_node.py create mode 100644 aztk/client/base/helpers/generate_user_on_pool.py create mode 100644 aztk/client/base/helpers/node_run.py create mode 100644 aztk/client/base/helpers/ssh_into_node.py create mode 100644 aztk/client/client.py create mode 100644 aztk/client/cluster/__init__.py create mode 100644 aztk/client/cluster/client.py create mode 100644 aztk/client/cluster/helpers/__init__.py create mode 100644 aztk/client/cluster/helpers/copy.py create mode 100644 aztk/client/cluster/helpers/create.py create mode 100644 aztk/client/cluster/helpers/delete.py create mode 100644 aztk/client/cluster/helpers/get.py create mode 100644 aztk/client/cluster/helpers/list.py create mode 100644 aztk/client/job/__init__.py create mode 100644 aztk/client/job/client.py create mode 100644 aztk/client/job/helpers/__init__.py create mode 100644 
aztk/client/job/helpers/submit.py create mode 100644 aztk/spark/client/__init__.py create mode 100644 aztk/spark/client/base/__init__.py create mode 100644 aztk/spark/client/base/client.py create mode 100644 aztk/spark/client/base/helpers/__init__.py create mode 100644 aztk/spark/client/client.py create mode 100644 aztk/spark/client/cluster/__init__.py create mode 100644 aztk/spark/client/cluster/client.py create mode 100644 aztk/spark/client/cluster/helpers/__init__.py create mode 100644 aztk/spark/client/cluster/helpers/create.py create mode 100644 aztk/spark/client/cluster/helpers/delete.py create mode 100644 aztk/spark/client/cluster/helpers/get.py create mode 100644 aztk/spark/client/cluster/helpers/list.py create mode 100644 aztk/spark/client/cluster/helpers/submit.py create mode 100644 aztk/spark/client/job/__init__.py create mode 100644 aztk/spark/client/job/client.py create mode 100644 aztk/spark/client/job/helpers/__init__.py diff --git a/.style.yapf b/.style.yapf index 4463b094..ca54f975 100644 --- a/.style.yapf +++ b/.style.yapf @@ -3,5 +3,5 @@ based_on_style=pep8 spaces_before_comment=4 split_before_logical_operator=True indent_width=4 -column_limit=140 +column_limit=120 split_arguments_when_comma_terminated=True diff --git a/aztk/client/__init__.py b/aztk/client/__init__.py new file mode 100644 index 00000000..3ff722bf --- /dev/null +++ b/aztk/client/__init__.py @@ -0,0 +1 @@ +from .client import Client diff --git a/aztk/client/base/__init__.py b/aztk/client/base/__init__.py new file mode 100644 index 00000000..b957ca23 --- /dev/null +++ b/aztk/client/base/__init__.py @@ -0,0 +1 @@ +from .client import BaseClient diff --git a/aztk/client/base/client.py b/aztk/client/base/client.py new file mode 100644 index 00000000..cb01b6bb --- /dev/null +++ b/aztk/client/base/client.py @@ -0,0 +1,58 @@ +import aztk.models as models +from aztk.internal import cluster_data +from aztk.utils import ssh as ssh_lib +from aztk.utils import azure_api + +from .helpers import (create_user_on_node, create_user_on_pool, + delete_user_on_node, generate_user_on_node, + generate_user_on_pool, ssh_into_node) + + +class BaseClient: + ''' + Base client that all other clients inherit from + ''' + + def __init__(self, secrets_config: models.SecretsConfiguration): + self.secrets_config = secrets_config + + azure_api.validate_secrets(secrets_config) + self.batch_client = azure_api.make_batch_client(secrets_config) + self.blob_client = azure_api.make_blob_client(secrets_config) + + def get_cluster_config(self, cluster_id: str) -> models.ClusterConfiguration: + return self.get_cluster_data(cluster_id).read_cluster_config() + + def get_cluster_data(self, cluster_id: str) -> cluster_data.ClusterData: + """ + Returns ClusterData object to manage data related to the given cluster id + """ + return cluster_data.ClusterData(self.blob_client, cluster_id) + + def ssh_into_node(self, + pool_id, + node_id, + username, + ssh_key=None, + password=None, + port_forward_list=None, + internal=False): + ''' + Opens a ssh tunnel to the node for port forwarding + ''' + ssh_into_node.ssh_into_node(self, pool_id, node_id, username, ssh_key, password, port_forward_list, internal) + + def create_user_on_node(self, username, pool_id, node_id, ssh_key=None, password=None): + return create_user_on_node.create_user_on_node(self, username, pool_id, node_id, ssh_key, password) + + def create_user_on_pool(self, username, pool_id, nodes, ssh_pub_key=None, password=None): + return create_user_on_pool.create_user_on_pool(self, username, pool_id, 
nodes, ssh_pub_key, password) + + def generate_user_on_node(self, pool_id, node_id): + return generate_user_on_node.generate_user_on_node(self, pool_id, node_id) + + def generate_user_on_pool(self, pool_id, nodes): + return generate_user_on_pool.generate_user_on_pool(self, pool_id, nodes) + + def delete_user_on_node(self, pool_id: str, node_id: str, username: str) -> str: + return delete_user_on_node.delete_user(self, pool_id, node_id, username) diff --git a/aztk/client/base/helpers/__init__.py b/aztk/client/base/helpers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/aztk/client/base/helpers/cluster_run.py b/aztk/client/base/helpers/cluster_run.py new file mode 100644 index 00000000..47954a79 --- /dev/null +++ b/aztk/client/base/helpers/cluster_run.py @@ -0,0 +1,30 @@ +import asyncio + +import aztk.models as models +from aztk.utils import ssh as ssh_lib + +def __cluster_run(base_client, cluster_id, command, internal, container_name=None, timeout=None): + pool, nodes = base_client.get_pool_details(cluster_id) + nodes = list(nodes) + if internal: + cluster_nodes = [(node, models.RemoteLogin(ip_address=node.ip_address, port="22")) for node in nodes] + else: + cluster_nodes = [(node, base_client.get_remote_login_settings(pool.id, node.id)) for node in nodes] + + try: + generated_username, ssh_key = base_client.generate_user_on_pool(pool.id, nodes) + output = asyncio.get_event_loop().run_until_complete( + ssh_lib.clus_exec_command( + command, + generated_username, + cluster_nodes, + ssh_key=ssh_key.exportKey().decode('utf-8'), + container_name=container_name, + timeout=timeout + ) + ) + return output + except OSError as exc: + raise exc + finally: + base_client.delete_user_on_pool(generated_username, pool.id, nodes) \ No newline at end of file diff --git a/aztk/client/base/helpers/create_user_on_node.py b/aztk/client/base/helpers/create_user_on_node.py new file mode 100644 index 00000000..b0b69f44 --- /dev/null +++ b/aztk/client/base/helpers/create_user_on_node.py @@ -0,0 +1,42 @@ +from datetime import datetime, timedelta, timezone + +import azure.batch.models as batch_models +import azure.batch.models.batch_error as batch_error + +from aztk import models +from aztk.utils import get_ssh_key + + +def __create_user(self, pool_id: str, node_id: str, username: str, password: str = None, ssh_key: str = None) -> str: + """ + Create a user on a node in the given pool + :param pool_id: the id of the pool the node is in + :param node_id: the id of the node to add the user to + :param username: username of the user to add + :param password: password of the user to add + :param ssh_key: ssh public key of the user to add + """ + # Create new ssh user for the given node + self.batch_client.compute_node.add_user( + pool_id, + node_id, + batch_models.ComputeNodeUser( + name=username, + is_admin=True, + password=password, + ssh_public_key=get_ssh_key.get_user_public_key(ssh_key, self.secrets_config), + expiry_time=datetime.now(timezone.utc) + timedelta(days=365), + ), + ) + + +def create_user_on_node(base_client, username, pool_id, node_id, ssh_key=None, password=None): + try: + __create_user(base_client, pool_id=pool_id, node_id=node_id, username=username, ssh_key=ssh_key, password=password) + except batch_error.BatchErrorException: + try: + # the user may already exist from an earlier attempt: delete it and recreate it + base_client.delete_user_on_node(pool_id, node_id, username) + __create_user(base_client, pool_id=pool_id, node_id=node_id, username=username, ssh_key=ssh_key) + except batch_error.BatchErrorException as error: + raise error diff --git a/aztk/client/base/helpers/create_user_on_pool.py
b/aztk/client/base/helpers/create_user_on_pool.py new file mode 100644 index 00000000..ca7f50ef --- /dev/null +++ b/aztk/client/base/helpers/create_user_on_pool.py @@ -0,0 +1,10 @@ +import concurrent.futures + + +def create_user_on_pool(base_client, username, pool_id, nodes, ssh_pub_key=None, password=None): + with concurrent.futures.ThreadPoolExecutor() as executor: + futures = { + executor.submit(base_client.create_user_on_node, username, pool_id, node.id, ssh_pub_key, password): node + for node in nodes + } + concurrent.futures.wait(futures) diff --git a/aztk/client/base/helpers/delete_user_on_node.py b/aztk/client/base/helpers/delete_user_on_node.py new file mode 100644 index 00000000..d901350d --- /dev/null +++ b/aztk/client/base/helpers/delete_user_on_node.py @@ -0,0 +1,9 @@ +def delete_user(self, pool_id: str, node_id: str, username: str) -> str: + """ + Delete a user from a node + :param pool_id: the id of the pool the node is in + :param node_id: the id of the node to delete the user from + :param username: the name of the user to delete + """ + # Delete a user on the given node + self.batch_client.compute_node.delete_user(pool_id, node_id, username) diff --git a/aztk/client/base/helpers/delete_user_on_pool.py b/aztk/client/base/helpers/delete_user_on_pool.py new file mode 100644 index 00000000..75f41744 --- /dev/null +++ b/aztk/client/base/helpers/delete_user_on_pool.py @@ -0,0 +1,7 @@ +import concurrent.futures + + +def delete_user_on_pool(base_client, username, pool_id, nodes): + with concurrent.futures.ThreadPoolExecutor() as executor: + futures = [executor.submit(base_client.delete_user_on_node, pool_id, node.id, username) for node in nodes] + concurrent.futures.wait(futures) diff --git a/aztk/client/base/helpers/generate_user_on_node.py b/aztk/client/base/helpers/generate_user_on_node.py new file mode 100644 index 00000000..494d4d31 --- /dev/null +++ b/aztk/client/base/helpers/generate_user_on_node.py @@ -0,0 +1,11 @@ +from Cryptodome.PublicKey import RSA + +from aztk.utils import secure_utils + + +def generate_user_on_node(base_client, pool_id, node_id): + generated_username = secure_utils.generate_random_string() + ssh_key = RSA.generate(2048) + ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8') + base_client.create_user_on_node(generated_username, pool_id, node_id, ssh_pub_key) + return generated_username, ssh_key diff --git a/aztk/client/base/helpers/generate_user_on_pool.py b/aztk/client/base/helpers/generate_user_on_pool.py new file mode 100644 index 00000000..f9c91c10 --- /dev/null +++ b/aztk/client/base/helpers/generate_user_on_pool.py @@ -0,0 +1,19 @@ +import concurrent.futures + +from Cryptodome.PublicKey import RSA + +from aztk.utils import secure_utils + + +def generate_user_on_pool(base_client, pool_id, nodes): + generated_username = secure_utils.generate_random_string() + ssh_key = RSA.generate(2048) + ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8') + with concurrent.futures.ThreadPoolExecutor() as executor: + futures = { + executor.submit(base_client.create_user_on_node, generated_username, pool_id, node.id, ssh_pub_key): node + for node in nodes + } + concurrent.futures.wait(futures) + + return generated_username, ssh_key diff --git a/aztk/client/base/helpers/node_run.py b/aztk/client/base/helpers/node_run.py new file mode 100644 index 00000000..48252ce5 --- /dev/null +++ b/aztk/client/base/helpers/node_run.py @@ -0,0 +1,29 @@ +import aztk.error as error +import aztk.models as models +from aztk.utils import ssh as ssh_lib + + +def node_run(base_client,
cluster_id, node_id, command, internal, container_name=None, timeout=None): + pool, nodes = base_client.get_pool_details(cluster_id) + try: + node = next(node for node in nodes if node.id == node_id) + except StopIteration: + raise error.AztkError("Node with id {} not found".format(node_id)) + if internal: + node_rls = models.RemoteLogin(ip_address=node.ip_address, port="22") + else: + node_rls = base_client.get_remote_login_settings(pool.id, node.id) + try: + generated_username, ssh_key = base_client.generate_user_on_node(pool.id, node.id) + output = ssh_lib.node_exec_command( + node.id, + command, + generated_username, + node_rls.ip_address, + node_rls.port, + ssh_key=ssh_key.exportKey().decode('utf-8'), + container_name=container_name, + timeout=timeout) + return output + finally: + base_client.delete_user_on_node(cluster_id, node.id, generated_username) diff --git a/aztk/client/base/helpers/ssh_into_node.py b/aztk/client/base/helpers/ssh_into_node.py new file mode 100644 index 00000000..5e5a024d --- /dev/null +++ b/aztk/client/base/helpers/ssh_into_node.py @@ -0,0 +1,20 @@ +import aztk.models as models +from aztk.utils import ssh as ssh_lib + + +def ssh_into_node(base_client, pool_id, node_id, username, ssh_key=None, password=None, port_forward_list=None, internal=False): + if internal: + result = base_client.batch_client.compute_node.get(pool_id=pool_id, node_id=node_id) + rls = models.RemoteLogin(ip_address=result.ip_address, port="22") + else: + result = base_client.batch_client.compute_node.get_remote_login_settings(pool_id, node_id) + rls = models.RemoteLogin(ip_address=result.remote_login_ip_address, port=str(result.remote_login_port)) + + ssh_lib.node_ssh( + username=username, + hostname=rls.ip_address, + port=rls.port, + ssh_key=ssh_key, + password=password, + port_forward_list=port_forward_list, + ) diff --git a/aztk/client/client.py b/aztk/client/client.py new file mode 100644 index 00000000..eba03384 --- /dev/null +++ b/aztk/client/client.py @@ -0,0 +1,6 @@ +from aztk.client.cluster.client import Client as cluster_client +from aztk.client.job.client import Client as job_client + +class Client: + cluster = cluster_client + job = job_client diff --git a/aztk/client/cluster/__init__.py b/aztk/client/cluster/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/aztk/client/cluster/client.py b/aztk/client/cluster/client.py new file mode 100644 index 00000000..45f69abe --- /dev/null +++ b/aztk/client/cluster/client.py @@ -0,0 +1,30 @@ +from aztk.client.base import BaseClient +from aztk.models import ClusterConfiguration + +from .helpers import copy, create, delete, get, list + + +class Client(BaseClient): + def create(self, cluster_configuration: ClusterConfiguration, software_metadata_key: str, start_task, + vm_image_model): + return create.create_pool_and_job(self, cluster_configuration, software_metadata_key, start_task, vm_image_model) + + def get(self, cluster_id: str): + return get.get_pool_details(self, cluster_id) + + def copy(self, + cluster_id, + source_path, + destination_path=None, + container_name=None, + internal=False, + get=False, + timeout=None): + return copy.cluster_copy(self, cluster_id, source_path, destination_path, container_name, internal, get, + timeout) + + def delete(self, pool_id: str, keep_logs: bool = False): + return delete.delete_pool_and_job(self, pool_id, keep_logs) + + def list(self, software_metadata_key): + return list.list_clusters(self, software_metadata_key) diff --git a/aztk/client/cluster/helpers/__init__.py
b/aztk/client/cluster/helpers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/aztk/client/cluster/helpers/copy.py b/aztk/client/cluster/helpers/copy.py new file mode 100644 index 00000000..e1413ed9 --- /dev/null +++ b/aztk/client/cluster/helpers/copy.py @@ -0,0 +1,35 @@ +import asyncio + +import azure.batch.models.batch_error as batch_error + +import aztk.models as models +from aztk.utils import ssh as ssh_lib + + +def cluster_copy(cluster_client, cluster_id, source_path, destination_path=None, container_name=None, internal=False, get=False, timeout=None): + pool, nodes = cluster_client.get_pool_details(cluster_id) + nodes = list(nodes) + if internal: + cluster_nodes = [(node, models.RemoteLogin(ip_address=node.ip_address, port="22")) for node in nodes] + else: + cluster_nodes = [(node, cluster_client.get_remote_login_settings(pool.id, node.id)) for node in nodes] + + try: + generated_username, ssh_key = cluster_client.generate_user_on_pool(pool.id, nodes) + output = asyncio.get_event_loop().run_until_complete( + ssh_lib.clus_copy( + container_name=container_name, + username=generated_username, + nodes=cluster_nodes, + source_path=source_path, + destination_path=destination_path, + ssh_key=ssh_key.exportKey().decode('utf-8'), + get=get, + timeout=timeout + ) + ) + return output + except (OSError, batch_error.BatchErrorException) as exc: + raise exc + finally: + cluster_client.delete_user_on_pool(generated_username, pool.id, nodes) diff --git a/aztk/client/cluster/helpers/create.py b/aztk/client/cluster/helpers/create.py new file mode 100644 index 00000000..dfa2dfa8 --- /dev/null +++ b/aztk/client/cluster/helpers/create.py @@ -0,0 +1,67 @@ +from datetime import timedelta +import azure.batch.models as batch_models + +from aztk import models +from aztk.utils import helpers, constants + + +def create_pool_and_job(cluster_client, cluster_conf: models.ClusterConfiguration, software_metadata_key: str, start_task, VmImageModel): + """ + Create a pool and job + :param cluster_conf: the configuration object used to create the cluster + :type cluster_conf: aztk.models.ClusterConfiguration + :param software_metadata_key: the id of the software being used on the cluster + :param start_task: the start task for the cluster + :param VmImageModel: the type of image to provision for the cluster + :returns: the newly created cluster + """ + cluster_client.get_cluster_data(cluster_conf.cluster_id).save_cluster_config(cluster_conf) + # reuse pool_id as job_id + pool_id = cluster_conf.cluster_id + job_id = cluster_conf.cluster_id + + # Get a verified node agent sku + sku_to_use, image_ref_to_use = \ + helpers.select_latest_verified_vm_image_with_node_agent_sku( + VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku, cluster_client.batch_client) + + network_conf = None + if cluster_conf.subnet_id is not None: + network_conf = batch_models.NetworkConfiguration( + subnet_id=cluster_conf.subnet_id) + auto_scale_formula = "$TargetDedicatedNodes={0}; $TargetLowPriorityNodes={1}".format( + cluster_conf.size, cluster_conf.size_low_priority) + + # Configure the pool + pool = batch_models.PoolAddParameter( + id=pool_id, + virtual_machine_configuration=batch_models.VirtualMachineConfiguration( + image_reference=image_ref_to_use, + node_agent_sku_id=sku_to_use), + vm_size=cluster_conf.vm_size, + enable_auto_scale=True, + auto_scale_formula=auto_scale_formula, + auto_scale_evaluation_interval=timedelta(minutes=5), + start_task=start_task, + enable_inter_node_communication=not cluster_conf.subnet_id,
+ max_tasks_per_node=4, + network_configuration=network_conf, + metadata=[ + batch_models.MetadataItem( + name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key), + batch_models.MetadataItem( + name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_CLUSTER_MODE_METADATA) + ]) + + # Create the pool + create user for the pool + helpers.create_pool_if_not_exist(pool, cluster_client.batch_client) + + # Create job + job = batch_models.JobAddParameter( + id=job_id, + pool_info=batch_models.PoolInformation(pool_id=pool_id)) + + # Add job to batch + cluster_client.batch_client.job.add(job) + + return helpers.get_cluster(cluster_conf.cluster_id, cluster_client.batch_client) diff --git a/aztk/client/cluster/helpers/delete.py b/aztk/client/cluster/helpers/delete.py new file mode 100644 index 00000000..ead592de --- /dev/null +++ b/aztk/client/cluster/helpers/delete.py @@ -0,0 +1,31 @@ +import azure.batch.models as batch_models + + +def delete_pool_and_job(cluster_client, pool_id: str, keep_logs: bool = False): + """ + Delete a pool and its associated job + :param pool_id: the id of the pool (and job) to delete + :return bool: True if the pool and/or the job existed and were deleted + """ + # job id is equal to pool id + job_id = pool_id + job_exists = True + + try: + cluster_client.batch_client.job.get(job_id) + except batch_models.batch_error.BatchErrorException: + job_exists = False + + pool_exists = cluster_client.batch_client.pool.exists(pool_id) + + if job_exists: + cluster_client.batch_client.job.delete(job_id) + + if pool_exists: + cluster_client.batch_client.pool.delete(pool_id) + + if not keep_logs: + cluster_data = cluster_client.get_cluster_data(pool_id) + cluster_data.delete_container(pool_id) + + return job_exists or pool_exists diff --git a/aztk/client/cluster/helpers/get.py b/aztk/client/cluster/helpers/get.py new file mode 100644 index 00000000..73b9a3d7 --- /dev/null +++ b/aztk/client/cluster/helpers/get.py @@ -0,0 +1,11 @@ + + +def get_pool_details(cluster_client, cluster_id: str): + """ + Return the pool and the list of its compute nodes for the given cluster + :param cluster_id: Id of the cluster + :return pool: CloudPool, nodes: ComputeNodePaged + """ + pool = cluster_client.batch_client.pool.get(cluster_id) + nodes = cluster_client.batch_client.compute_node.list(pool_id=cluster_id) + return pool, nodes diff --git a/aztk/client/cluster/helpers/list.py b/aztk/client/cluster/helpers/list.py new file mode 100644 index 00000000..af6612ea --- /dev/null +++ b/aztk/client/cluster/helpers/list.py @@ -0,0 +1,19 @@ +from aztk.utils import constants + + +def list_clusters(cluster_client, software_metadata_key): + """ + List all the clusters on your account.
+ """ + pools = cluster_client.batch_client.pool.list() + software_metadata = ( + constants.AZTK_SOFTWARE_METADATA_KEY, software_metadata_key) + cluster_metadata = ( + constants.AZTK_MODE_METADATA_KEY, constants.AZTK_CLUSTER_MODE_METADATA) + + aztk_pools = [] + for pool in [pool for pool in pools if pool.metadata]: + pool_metadata = [(metadata.name, metadata.value) for metadata in pool.metadata] + if all([metadata in pool_metadata for metadata in [software_metadata, cluster_metadata]]): + aztk_pools.append(pool) + return aztk_pools diff --git a/aztk/client/job/__init__.py b/aztk/client/job/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/aztk/client/job/client.py b/aztk/client/job/client.py new file mode 100644 index 00000000..ebf4ab62 --- /dev/null +++ b/aztk/client/job/client.py @@ -0,0 +1,10 @@ +from aztk.client.base import BaseClient +from .helpers import ( + submit, ) + + +class Client(BaseClient): + def submit(self, job_configuration, start_task, job_manager_task, autoscale_formula, software_metadata_key: str, + vm_image_model, application_metadata): + return submit.submit_job(self, job_configuration, start_task, job_manager_task, autoscale_formula, + software_metadata_key, vm_image_model, application_metadata) diff --git a/aztk/client/job/helpers/__init__.py b/aztk/client/job/helpers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/aztk/client/job/helpers/submit.py b/aztk/client/job/helpers/submit.py new file mode 100644 index 00000000..901dd37f --- /dev/null +++ b/aztk/client/job/helpers/submit.py @@ -0,0 +1,77 @@ +from datetime import timedelta + +import azure.batch.models as batch_models +from aztk.utils import helpers, constants + + +def submit_job( + job_client, + job_configuration, + start_task, + job_manager_task, + autoscale_formula, + software_metadata_key: str, + vm_image_model, + application_metadata, +): + """ + Job Submission + :param job_configuration -> aztk.spark.models.JobConfiguration + :param start_task -> batch_models.StartTask + :param job_manager_task -> batch_models.TaskAddParameter + :param autoscale_formula -> str + :param software_metadata_key -> str + :param vm_image_model -> aztk.models.VmImage + :returns: the newly created job schedule + """ + job_client.get_cluster_data(job_configuration.id).save_cluster_config(job_configuration.to_cluster_config()) + + # get a verified node agent sku + sku_to_use, image_ref_to_use = \ + helpers.select_latest_verified_vm_image_with_node_agent_sku( + vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, job_client.batch_client) + + # set up subnet if necessary + network_conf = None + if job_configuration.subnet_id: + network_conf = batch_models.NetworkConfiguration(subnet_id=job_configuration.subnet_id) + + # set up a schedule for a recurring job + auto_pool_specification = batch_models.AutoPoolSpecification( + pool_lifetime_option=batch_models.PoolLifetimeOption.job_schedule, + auto_pool_id_prefix=job_configuration.id, + keep_alive=False, + pool=batch_models.PoolSpecification( + display_name=job_configuration.id, + virtual_machine_configuration=batch_models.VirtualMachineConfiguration( + image_reference=image_ref_to_use, node_agent_sku_id=sku_to_use), + vm_size=job_configuration.vm_size, + enable_auto_scale=True, + auto_scale_formula=autoscale_formula, + auto_scale_evaluation_interval=timedelta(minutes=5), + start_task=start_task, + enable_inter_node_communication=not job_configuration.mixed_mode(), + network_configuration=network_conf, + max_tasks_per_node=4, + metadata=[ +
batch_models.MetadataItem(name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key), + batch_models.MetadataItem(name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_JOB_MODE_METADATA) + ])) + + # define job specification + job_spec = batch_models.JobSpecification( + pool_info=batch_models.PoolInformation(auto_pool_specification=auto_pool_specification), + display_name=job_configuration.id, + on_all_tasks_complete=batch_models.OnAllTasksComplete.terminate_job, + job_manager_task=job_manager_task, + metadata=[batch_models.MetadataItem(name='applications', value=application_metadata)]) + + # define schedule + schedule = batch_models.Schedule(do_not_run_until=None, do_not_run_after=None, start_window=None, recurrence_interval=None) + + # create job schedule and add task + setup = batch_models.JobScheduleAddParameter(id=job_configuration.id, schedule=schedule, job_specification=job_spec) + + job_client.batch_client.job_schedule.add(setup) + + return job_client.batch_client.job_schedule.get(job_schedule_id=job_configuration.id) diff --git a/aztk/spark/client.py b/aztk/spark/client.py index 7936c087..5157d8e4 100644 --- a/aztk/spark/client.py +++ b/aztk/spark/client.py @@ -110,6 +110,8 @@ def submit(self, cluster_id: str, application: models.ApplicationConfiguration, except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) +########################################### CURRENT PROGRESS ##################################################### + def submit_all_applications(self, cluster_id: str, applications): for application in applications: self.submit(cluster_id, application) diff --git a/aztk/spark/client/__init__.py b/aztk/spark/client/__init__.py new file mode 100644 index 00000000..3ff722bf --- /dev/null +++ b/aztk/spark/client/__init__.py @@ -0,0 +1 @@ +from .client import Client diff --git a/aztk/spark/client/base/__init__.py b/aztk/spark/client/base/__init__.py new file mode 100644 index 00000000..b957ca23 --- /dev/null +++ b/aztk/spark/client/base/__init__.py @@ -0,0 +1 @@ +from .client import BaseClient diff --git a/aztk/spark/client/base/client.py b/aztk/spark/client/base/client.py new file mode 100644 index 00000000..d6d1c5fe --- /dev/null +++ b/aztk/spark/client/base/client.py @@ -0,0 +1,4 @@ +from aztk.client.base import BaseClient as CoreBaseClient + +class BaseClient(CoreBaseClient): + pass diff --git a/aztk/spark/client/base/helpers/__init__.py b/aztk/spark/client/base/helpers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/aztk/spark/client/client.py b/aztk/spark/client/client.py new file mode 100644 index 00000000..89409e51 --- /dev/null +++ b/aztk/spark/client/client.py @@ -0,0 +1,7 @@ +from aztk.spark.client.cluster import Client as cluster_client +from aztk.spark.client.job import Client as job_client + + +class Client: + cluster = cluster_client + job = job_client diff --git a/aztk/spark/client/cluster/__init__.py b/aztk/spark/client/cluster/__init__.py new file mode 100644 index 00000000..3ff722bf --- /dev/null +++ b/aztk/spark/client/cluster/__init__.py @@ -0,0 +1 @@ +from .client import Client diff --git a/aztk/spark/client/cluster/client.py b/aztk/spark/client/cluster/client.py new file mode 100644 index 00000000..d9238f0f --- /dev/null +++ b/aztk/spark/client/cluster/client.py @@ -0,0 +1,9 @@ +from aztk.client.cluster.client import Client as cluster_client + +from .helpers import create +from aztk.spark import models + +class Client(cluster_client): + + def create(self,
cluster_configuration: models.ClusterConfiguration, wait: bool = False): + return create.create_cluster(self, cluster_configuration, wait) diff --git a/aztk/spark/client/cluster/helpers/__init__.py b/aztk/spark/client/cluster/helpers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/aztk/spark/client/cluster/helpers/create.py b/aztk/spark/client/cluster/helpers/create.py new file mode 100644 index 00000000..38d61727 --- /dev/null +++ b/aztk/spark/client/cluster/helpers/create.py @@ -0,0 +1,203 @@ +from typing import List + +import azure.batch.models as batch_models +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.internal.cluster_data import NodeData +from aztk.spark import models +from aztk.spark.utils import util +from aztk.utils import constants, helpers + +POOL_ADMIN_USER_IDENTITY = batch_models.UserIdentity( + auto_user=batch_models.AutoUserSpecification( + scope=batch_models.AutoUserScope.pool, elevation_level=batch_models.ElevationLevel.admin)) + +def _default_scheduling_target(vm_count: int): + if vm_count == 0: + return models.SchedulingTarget.Any + else: + return models.SchedulingTarget.Dedicated + + +def _apply_default_for_cluster_config(configuration: models.ClusterConfiguration): + cluster_conf = models.ClusterConfiguration() + cluster_conf.merge(configuration) + if cluster_conf.scheduling_target is None: + cluster_conf.scheduling_target = _default_scheduling_target(cluster_conf.size) + return cluster_conf + + +def _get_aztk_environment(cluster_id, worker_on_master, mixed_mode): + envs = [] + envs.append(batch_models.EnvironmentSetting(name="AZTK_MIXED_MODE", value=helpers.bool_env(mixed_mode))) + envs.append(batch_models.EnvironmentSetting(name="AZTK_WORKER_ON_MASTER", value=helpers.bool_env(worker_on_master))) + envs.append(batch_models.EnvironmentSetting(name="AZTK_CLUSTER_ID", value=cluster_id)) + return envs + + +def __get_docker_credentials(spark_client): + creds = [] + docker = spark_client.secrets_config.docker + if docker: + if docker.endpoint: + creds.append(batch_models.EnvironmentSetting(name="DOCKER_ENDPOINT", value=docker.endpoint)) + if docker.username: + creds.append(batch_models.EnvironmentSetting(name="DOCKER_USERNAME", value=docker.username)) + if docker.password: + creds.append(batch_models.EnvironmentSetting(name="DOCKER_PASSWORD", value=docker.password)) + + return creds + + +def __get_secrets_env(spark_client): + shared_key = spark_client.secrets_config.shared_key + service_principal = spark_client.secrets_config.service_principal + if shared_key: + return [ + batch_models.EnvironmentSetting(name="BATCH_SERVICE_URL", value=shared_key.batch_service_url), + batch_models.EnvironmentSetting(name="BATCH_ACCOUNT_KEY", value=shared_key.batch_account_key), + batch_models.EnvironmentSetting(name="STORAGE_ACCOUNT_NAME", value=shared_key.storage_account_name), + batch_models.EnvironmentSetting(name="STORAGE_ACCOUNT_KEY", value=shared_key.storage_account_key), + batch_models.EnvironmentSetting(name="STORAGE_ACCOUNT_SUFFIX", value=shared_key.storage_account_suffix), + ] + else: + return [ + batch_models.EnvironmentSetting(name="SP_TENANT_ID", value=service_principal.tenant_id), + batch_models.EnvironmentSetting(name="SP_CLIENT_ID", value=service_principal.client_id), + batch_models.EnvironmentSetting(name="SP_CREDENTIAL", value=service_principal.credential), + batch_models.EnvironmentSetting( + name="SP_BATCH_RESOURCE_ID", value=service_principal.batch_account_resource_id), + 
batch_models.EnvironmentSetting( + name="SP_STORAGE_RESOURCE_ID", value=service_principal.storage_account_resource_id), + ] + + +def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile, + gpu_enabled: bool, + docker_repo: str = None, + plugins=None, + worker_on_master: bool = True, + file_mounts=None, + mixed_mode: bool = False): + """ + For Docker on Ubuntu 16.04 - return the command line + to be run in the start task of the pool to set up Spark. + """ + default_docker_repo = constants.DEFAULT_DOCKER_REPO if not gpu_enabled else constants.DEFAULT_DOCKER_REPO_GPU + docker_repo = docker_repo or default_docker_repo + + shares = [] + + if file_mounts: + for mount in file_mounts: + # Create the directory on the node + shares.append('mkdir -p {0}'.format(mount.mount_path)) + + # Mount the file share + shares.append( + 'mount -t cifs //{0}.file.core.windows.net/{2} {3} -o vers=3.0,username={0},password={1},dir_mode=0777,file_mode=0777,sec=ntlmssp'. + format(mount.storage_account_name, mount.storage_account_key, mount.file_share_path, mount.mount_path)) + + setup = [ + 'time('\ + 'apt-get -y update;'\ + 'apt-get -y --no-install-recommends install unzip;'\ + 'unzip -o $AZ_BATCH_TASK_WORKING_DIR/{0};'\ + 'chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh;'\ + ') 2>&1'.format(zip_resource_file.file_path), + '/bin/bash $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh {0} {1}'.format( + constants.DOCKER_SPARK_CONTAINER_NAME, + docker_repo, + ) + ] + + commands = shares + setup + return commands + + +def generate_cluster_start_task(spark_client, + zip_resource_file: batch_models.ResourceFile, + cluster_id: str, + gpu_enabled: bool, + docker_repo: str = None, + file_shares: List[models.FileShare] = None, + plugins: List[models.PluginConfiguration] = None, + mixed_mode: bool = False, + worker_on_master: bool = True): + """ + This will return the start task object for the pool to be created.
+ :param cluster_id str: Id of the cluster (used for uploading the resource files) + :param zip_resource_file: Resource file object pointing to the zip file containing scripts to run on the node + """ + + resource_files = [zip_resource_file] + spark_web_ui_port = constants.DOCKER_SPARK_WEB_UI_PORT + spark_worker_ui_port = constants.DOCKER_SPARK_WORKER_UI_PORT + spark_job_ui_port = constants.DOCKER_SPARK_JOB_UI_PORT + + spark_container_name = constants.DOCKER_SPARK_CONTAINER_NAME + spark_submit_logs_file = constants.SPARK_SUBMIT_LOGS_FILE + + # TODO use certificate + environment_settings = __get_secrets_env(spark_client) + [ + batch_models.EnvironmentSetting(name="SPARK_WEB_UI_PORT", value=spark_web_ui_port), + batch_models.EnvironmentSetting(name="SPARK_WORKER_UI_PORT", value=spark_worker_ui_port), + batch_models.EnvironmentSetting(name="SPARK_JOB_UI_PORT", value=spark_job_ui_port), + batch_models.EnvironmentSetting(name="SPARK_CONTAINER_NAME", value=spark_container_name), + batch_models.EnvironmentSetting(name="SPARK_SUBMIT_LOGS_FILE", value=spark_submit_logs_file), + batch_models.EnvironmentSetting(name="AZTK_GPU_ENABLED", value=helpers.bool_env(gpu_enabled)), + ] + __get_docker_credentials(spark_client) + _get_aztk_environment(cluster_id, worker_on_master, mixed_mode) + + # start task command + command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, plugins, worker_on_master, file_shares, + mixed_mode) + + return batch_models.StartTask( + command_line=helpers.wrap_commands_in_shell(command), + resource_files=resource_files, + environment_settings=environment_settings, + user_identity=POOL_ADMIN_USER_IDENTITY, + wait_for_success=True) + + +def create_cluster(spark_cluster_client, cluster_conf: models.ClusterConfiguration, wait: bool = False): + """ + Create a new aztk spark cluster + + Args: + cluster_conf(aztk.spark.models.ClusterConfiguration): Configuration for the cluster to be created + wait(bool): Whether to wait for the cluster to be ready before returning + + Returns: + aztk.spark.models.Cluster + """ + cluster_conf = _apply_default_for_cluster_config(cluster_conf) + cluster_conf.validate() + + cluster_data = spark_cluster_client.get_cluster_data(cluster_conf.cluster_id) + try: + zip_resource_files = None + node_data = NodeData(cluster_conf).add_core().done() + zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file() + + start_task = generate_cluster_start_task(spark_cluster_client, zip_resource_files, cluster_conf.cluster_id, + cluster_conf.gpu_enabled(), cluster_conf.get_docker_repo(), + cluster_conf.file_shares, cluster_conf.plugins, + cluster_conf.mixed_mode(), cluster_conf.worker_on_master) + + software_metadata_key = "spark" + + vm_image = models.VmImage(publisher='Canonical', offer='UbuntuServer', sku='16.04') + + cluster = spark_cluster_client.create_pool_and_job(cluster_conf, software_metadata_key, start_task, vm_image) + + # Wait for the master to be ready + if wait: + util.wait_for_master_to_be_ready(spark_cluster_client, cluster.id) + cluster = spark_cluster_client.get_cluster(cluster.id) + + return cluster + + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/delete.py b/aztk/spark/client/cluster/helpers/delete.py new file mode 100644 index 00000000..10a9f263 --- /dev/null +++ b/aztk/spark/client/cluster/helpers/delete.py @@ -0,0 +1,10 @@ +from aztk import error +import azure.batch.models.batch_error as
batch_error +from aztk.utils import helpers + + +def delete_cluster(spark_cluster_client, cluster_id: str, keep_logs: bool = False): + try: + return spark_cluster_client.delete_pool_and_job(cluster_id, keep_logs) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/get.py b/aztk/spark/client/cluster/helpers/get.py new file mode 100644 index 00000000..b63fb54b --- /dev/null +++ b/aztk/spark/client/cluster/helpers/get.py @@ -0,0 +1,13 @@ +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.spark import models +from aztk.utils import helpers + + +def get_cluster(spark_cluster_client, cluster_id: str): + try: + pool, nodes = spark_cluster_client.get_pool_details(cluster_id) + return models.Cluster(pool, nodes) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/list.py b/aztk/spark/client/cluster/helpers/list.py new file mode 100644 index 00000000..958b0e10 --- /dev/null +++ b/aztk/spark/client/cluster/helpers/list.py @@ -0,0 +1,13 @@ +import azure.batch.models.batch_error as batch_error + +import aztk.models # TODO: get rid of this import and use aztk.spark.models +from aztk import error +from aztk.spark import models +from aztk.utils import helpers + + +def list_clusters(spark_cluster_client): + try: + return [models.Cluster(pool) for pool in spark_cluster_client.list(aztk.models.Software.spark)] + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/submit.py b/aztk/spark/client/cluster/helpers/submit.py new file mode 100644 index 00000000..689c1b68 --- /dev/null +++ b/aztk/spark/client/cluster/helpers/submit.py @@ -0,0 +1,145 @@ +import datetime +import os +from typing import List + +import azure.batch.models as batch_models +import yaml + +from aztk.error import AztkError +from aztk.spark import models +from aztk.utils import constants, helpers +from aztk.utils.command_builder import CommandBuilder +import azure.batch.models.batch_error as batch_error +from aztk import error + +''' +Submit helper methods +''' + + +def __get_node(spark_client, node_id: str, cluster_id: str) -> batch_models.ComputeNode: + return spark_client.batch_client.compute_node.get(cluster_id, node_id) + + +def generate_task(spark_client, container_id, application, remote=False): + resource_files = [] + + # The application provided is not hosted remotely and therefore must be uploaded + if not remote: + app_resource_file = helpers.upload_file_to_container( + container_name=container_id, + application_name=application.name, + file_path=application.application, + blob_client=spark_client.blob_client, + use_full_path=False) + + # Upload application file + resource_files.append(app_resource_file) + + application.application = '$AZ_BATCH_TASK_WORKING_DIR/' + os.path.basename(application.application) + + # Upload dependent JARS + jar_resource_file_paths = [] + for jar in application.jars: + current_jar_resource_file_path = helpers.upload_file_to_container( + container_name=container_id, + application_name=application.name, + file_path=jar, + blob_client=spark_client.blob_client, + use_full_path=False) + jar_resource_file_paths.append(current_jar_resource_file_path) + resource_files.append(current_jar_resource_file_path) + + # Upload dependent python files +
py_files_resource_file_paths = [] + for py_file in application.py_files: + current_py_files_resource_file_path = helpers.upload_file_to_container( + container_name=container_id, + application_name=application.name, + file_path=py_file, + blob_client=spark_client.blob_client, + use_full_path=False) + py_files_resource_file_paths.append(current_py_files_resource_file_path) + resource_files.append(current_py_files_resource_file_path) + + # Upload other dependent files + files_resource_file_paths = [] + for file in application.files: + files_resource_file_path = helpers.upload_file_to_container( + container_name=container_id, + application_name=application.name, + file_path=file, + blob_client=spark_client.blob_client, + use_full_path=False) + files_resource_file_paths.append(files_resource_file_path) + resource_files.append(files_resource_file_path) + + # Upload application definition + application.jars = [os.path.basename(jar) for jar in application.jars] + application.py_files = [os.path.basename(py_files) for py_files in application.py_files] + application.files = [os.path.basename(files) for files in application.files] + application_definition_file = helpers.upload_text_to_container( + container_name=container_id, + application_name=application.name, + file_path='application.yaml', + content=yaml.dump(vars(application)), + blob_client=spark_client.blob_client) + resource_files.append(application_definition_file) + + # create command to submit task + task_cmd = CommandBuilder('sudo docker exec') + task_cmd.add_argument('-i') + task_cmd.add_option('-e', 'AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR') + task_cmd.add_option('-e', 'STORAGE_LOGS_CONTAINER={0}'.format(container_id)) + task_cmd.add_argument('spark /bin/bash >> output.log 2>&1') + task_cmd.add_argument('-c "source ~/.bashrc; ' \ + 'export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; ' \ + 'cd \$AZ_BATCH_TASK_WORKING_DIR; ' \ + '\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/submit.py"') + + # Create task + task = batch_models.TaskAddParameter( + id=application.name, + command_line=helpers.wrap_commands_in_shell([task_cmd.to_str()]), + resource_files=resource_files, + constraints=batch_models.TaskConstraints(max_task_retry_count=application.max_retry_count), + user_identity=batch_models.UserIdentity( + auto_user=batch_models.AutoUserSpecification( + scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin))) + + return task + + +def affinitize_task_to_master(spark_client, cluster_id, task): + cluster = spark_client.get_cluster(cluster_id) + if cluster.master_node_id is None: + raise AztkError("Master has not yet been selected. 
Please wait until the cluster is finished provisioning.") + master_node = spark_client.batch_client.compute_node.get(pool_id=cluster_id, node_id=cluster.master_node_id) + task.affinity_info = batch_models.AffinityInformation(affinity_id=master_node.affinity_id) + return task + + +def submit_application(spark_client, cluster_id, application, remote: bool = False, wait: bool = False): + """ + Submit a spark app + """ + task = generate_task(spark_client, cluster_id, application, remote) + task = affinitize_task_to_master(spark_client, cluster_id, task) + + # Add task to batch job (which has the same name as cluster_id) + job_id = cluster_id + spark_client.batch_client.task.add(job_id=job_id, task=task) + + if wait: + helpers.wait_for_task_to_complete(job_id=job_id, task_id=task.id, batch_client=spark_client.batch_client) + + +def submit(spark_cluster_client, + cluster_id: str, + application: models.ApplicationConfiguration, + remote: bool = False, + wait: bool = False): + try: + submit_application(spark_cluster_client, cluster_id, application, remote, wait) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/__init__.py b/aztk/spark/client/job/__init__.py new file mode 100644 index 00000000..3ff722bf --- /dev/null +++ b/aztk/spark/client/job/__init__.py @@ -0,0 +1 @@ +from .client import Client diff --git a/aztk/spark/client/job/client.py b/aztk/spark/client/job/client.py new file mode 100644 index 00000000..4a78f660 --- /dev/null +++ b/aztk/spark/client/job/client.py @@ -0,0 +1,4 @@ +from aztk.client.job.client import Client as job_client + +class Client(job_client): + pass diff --git a/aztk/spark/client/job/helpers/__init__.py b/aztk/spark/client/job/helpers/__init__.py new file mode 100644 index 00000000..e69de29b From 35da2154e240e91f2d45c50918d86f8c73b5db95 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Mon, 25 Jun 2018 15:50:58 -0700 Subject: [PATCH 02/52] continue refactor for cluster and job functions --- aztk/spark/client.py | 31 ++-- aztk/spark/client/base/__init__.py | 2 +- aztk/spark/client/base/client.py | 26 ++- .../base/helpers/generate_application_task.py | 96 ++++++++++ .../helpers/generate_cluster_start_task.py | 147 ++++++++++++++ aztk/spark/client/cluster/client.py | 50 +++++- aztk/spark/client/cluster/helpers/copy.py | 12 ++ aztk/spark/client/cluster/helpers/create.py | 135 +--------------- .../client/cluster/helpers/create_user.py | 15 ++ aztk/spark/client/cluster/helpers/download.py | 19 +++ .../cluster/helpers/get_application_log.py | 114 ++++++++++++++ .../cluster/helpers/get_application_status.py | 12 ++ aztk/spark/client/cluster/helpers/node_run.py | 18 +++ aztk/spark/client/cluster/helpers/run.py | 12 ++ .../client/cluster/helpers/ssh_into_master.py | 12 ++ aztk/spark/client/cluster/helpers/submit.py | 92 +---------- aztk/spark/client/job/client.py | 33 +++- aztk/spark/client/job/helpers/delete.py | 39 +++++ aztk/spark/client/job/helpers/get.py | 32 ++++ .../client/job/helpers/get_application.py | 25 +++ .../client/job/helpers/get_application_log.py | 39 +++++ .../client/job/helpers/get_recent_job.py | 3 + aztk/spark/client/job/helpers/list.py | 16 ++ .../client/job/helpers/list_applications.py | 35 +++++ aztk/spark/client/job/helpers/stop.py | 22 +++ aztk/spark/client/job/helpers/submit.py | 111 +++++++++++++ 26 files changed, 902 insertions(+), 246 deletions(-) create mode 100644 aztk/spark/client/base/helpers/generate_application_task.py create mode 100644
aztk/spark/client/base/helpers/generate_cluster_start_task.py create mode 100644 aztk/spark/client/cluster/helpers/copy.py create mode 100644 aztk/spark/client/cluster/helpers/create_user.py create mode 100644 aztk/spark/client/cluster/helpers/download.py create mode 100644 aztk/spark/client/cluster/helpers/get_application_log.py create mode 100644 aztk/spark/client/cluster/helpers/get_application_status.py create mode 100644 aztk/spark/client/cluster/helpers/node_run.py create mode 100644 aztk/spark/client/cluster/helpers/run.py create mode 100644 aztk/spark/client/cluster/helpers/ssh_into_master.py create mode 100644 aztk/spark/client/job/helpers/delete.py create mode 100644 aztk/spark/client/job/helpers/get.py create mode 100644 aztk/spark/client/job/helpers/get_application.py create mode 100644 aztk/spark/client/job/helpers/get_application_log.py create mode 100644 aztk/spark/client/job/helpers/get_recent_job.py create mode 100644 aztk/spark/client/job/helpers/list.py create mode 100644 aztk/spark/client/job/helpers/list_applications.py create mode 100644 aztk/spark/client/job/helpers/stop.py create mode 100644 aztk/spark/client/job/helpers/submit.py diff --git a/aztk/spark/client.py b/aztk/spark/client.py index 5157d8e4..830a7e50 100644 --- a/aztk/spark/client.py +++ b/aztk/spark/client.py @@ -75,7 +75,7 @@ def create_cluster(self, cluster_conf: models.ClusterConfiguration, wait: bool = except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) - def create_clusters_in_parallel(self, cluster_confs): + def create_clusters_in_parallel(self, cluster_confs): # NOT IMPLEMENTED for cluster_conf in cluster_confs: self.create_cluster(cluster_conf) @@ -98,7 +98,7 @@ def list_clusters(self): except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) - def get_remote_login_settings(self, cluster_id: str, node_id: str): + def get_remote_login_settings(self, cluster_id: str, node_id: str): # NOT IMPLEMENTED try: return self.__get_remote_login_settings(cluster_id, node_id) except batch_error.BatchErrorException as e: @@ -110,25 +110,25 @@ def submit(self, cluster_id: str, application: models.ApplicationConfiguration, except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) -########################################### CURRENT PROGRESS ##################################################### - def submit_all_applications(self, cluster_id: str, applications): + def submit_all_applications(self, cluster_id: str, applications): # NOT IMPLEMENTED for application in applications: self.submit(cluster_id, application) - def wait_until_application_done(self, cluster_id: str, task_id: str): + + def wait_until_application_done(self, cluster_id: str, task_id: str): # NOT IMPLEMENTED try: helpers.wait_for_task_to_complete(job_id=cluster_id, task_id=task_id, batch_client=self.batch_client) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) - def wait_until_applications_done(self, cluster_id: str): + def wait_until_applications_done(self, cluster_id: str): # NOT IMPLEMENTED try: helpers.wait_for_tasks_to_complete(job_id=cluster_id, batch_client=self.batch_client) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) - def wait_until_cluster_is_ready(self, cluster_id: str): + def wait_until_cluster_is_ready(self, cluster_id: str): # NOT IMPLEMENTED try: util.wait_for_master_to_be_ready(self, 
cluster_id) pool = self.batch_client.pool.get(cluster_id) @@ -137,10 +137,11 @@ def wait_until_cluster_is_ready(self, cluster_id: str): except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) - def wait_until_all_clusters_are_ready(self, clusters: List[str]): + def wait_until_all_clusters_are_ready(self, clusters: List[str]): # NOT IMPLEMENTED for cluster_id in clusters: self.wait_until_cluster_is_ready(cluster_id) + def create_user(self, cluster_id: str, username: str, password: str = None, ssh_key: str = None) -> str: try: cluster = self.get_cluster(cluster_id) @@ -151,6 +152,7 @@ def create_user(self, cluster_id: str, username: str, password: str = None, ssh_ except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) + def get_application_log(self, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): try: return get_log_helper.get_log(self.batch_client, self.blob_client, @@ -158,6 +160,7 @@ def get_application_log(self, cluster_id: str, application_name: str, tail=False except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) + def get_application_status(self, cluster_id: str, app_name: str): try: task = self.batch_client.task.get(cluster_id, app_name) @@ -165,6 +168,7 @@ def get_application_status(self, cluster_id: str, app_name: str): except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) + def cluster_run(self, cluster_id: str, command: str, host=False, internal: bool = False, timeout=None): try: return self.__cluster_run(cluster_id, @@ -186,6 +190,9 @@ def node_run(self, cluster_id: str, node_id: str, command: str, host=False, inte except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) +########################################### CURRENT PROGRESS ##################################################### + + def cluster_copy(self, cluster_id: str, source_path: str, destination_path: str, host: bool = False, internal: bool = False, timeout: int = None): try: container_name = None if host else 'spark' @@ -319,23 +326,23 @@ def get_job_application_log(self, job_id, application_name): except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) - def stop_job_app(self, job_id, application_name): + def stop_job_app(self, job_id, application_name): # NOT IMPLEMENTED try: return job_submit_helper.stop_app(self, job_id, application_name) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) - def wait_until_job_finished(self, job_id): + def wait_until_job_finished(self, job_id): # NOT IMPLEMENTED try: job_submit_helper.wait_until_job_finished(self, job_id) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) - def wait_until_all_jobs_finished(self, jobs): + def wait_until_all_jobs_finished(self, jobs): # NOT IMPLEMENTED for job in jobs: self.wait_until_job_finished(job) - def run_cluster_diagnostics(self, cluster_id, output_directory=None): + def run_cluster_diagnostics(self, cluster_id, output_directory=None): # NOT IMPLEMENTED try: output = cluster_diagnostic_helper.run(self, cluster_id, output_directory) return output diff --git a/aztk/spark/client/base/__init__.py b/aztk/spark/client/base/__init__.py index b957ca23..62681658 100644 --- a/aztk/spark/client/base/__init__.py +++ 
b/aztk/spark/client/base/__init__.py @@ -1 +1 @@ -from .client import BaseClient +from .client import SparkBaseClient diff --git a/aztk/spark/client/base/client.py b/aztk/spark/client/base/client.py index d6d1c5fe..95d20f58 100644 --- a/aztk/spark/client/base/client.py +++ b/aztk/spark/client/base/client.py @@ -1,4 +1,26 @@ +from typing import List + +import azure.batch.models as batch_models + from aztk.client.base import BaseClient as CoreBaseClient +from aztk.spark import models + +from .helpers import generate_cluster_start_task, generate_application_task + + +class SparkBaseClient(CoreBaseClient): + def __generate_cluster_start_task(self, + zip_resource_file: batch_models.ResourceFile, + cluster_id: str, + gpu_enabled: bool, + docker_repo: str = None, + file_shares: List[models.FileShare] = None, + plugins: List[models.PluginConfiguration] = None, + mixed_mode: bool = False, + worker_on_master: bool = True): + return generate_cluster_start_task.generate_cluster_start_task(self, zip_resource_file, cluster_id, gpu_enabled, + docker_repo, file_shares, plugins, mixed_mode, + worker_on_master) -class BaseClient(CoreBaseClient): - pass + def __generate_application_task(self, container_id, application, remote=False): + return generate_application_task.generate_application_task(self, container_id, application, remote) diff --git a/aztk/spark/client/base/helpers/generate_application_task.py b/aztk/spark/client/base/helpers/generate_application_task.py new file mode 100644 index 00000000..0dfc3a4d --- /dev/null +++ b/aztk/spark/client/base/helpers/generate_application_task.py @@ -0,0 +1,96 @@ +import os + +import azure.batch.models as batch_models +import yaml + +from aztk.utils import helpers +from aztk.utils.command_builder import CommandBuilder + + +def generate_application_task(spark_client, container_id, application, remote=False): + resource_files = [] + + # The application provided is not hosted remotely and therefore must be uploaded + if not remote: + app_resource_file = helpers.upload_file_to_container( + container_name=container_id, + application_name=application.name, + file_path=application.application, + blob_client=spark_client.blob_client, + use_full_path=False) + + # Upload application file + resource_files.append(app_resource_file) + + application.application = '$AZ_BATCH_TASK_WORKING_DIR/' + os.path.basename(application.application) + + # Upload dependent JARS + jar_resource_file_paths = [] + for jar in application.jars: + current_jar_resource_file_path = helpers.upload_file_to_container( + container_name=container_id, + application_name=application.name, + file_path=jar, + blob_client=spark_client.blob_client, + use_full_path=False) + jar_resource_file_paths.append(current_jar_resource_file_path) + resource_files.append(current_jar_resource_file_path) + + # Upload dependent python files + py_files_resource_file_paths = [] + for py_file in application.py_files: + current_py_files_resource_file_path = helpers.upload_file_to_container( + container_name=container_id, + application_name=application.name, + file_path=py_file, + blob_client=spark_client.blob_client, + use_full_path=False) + py_files_resource_file_paths.append(current_py_files_resource_file_path) + resource_files.append(current_py_files_resource_file_path) + + # Upload other dependent files + files_resource_file_paths = [] + for file in application.files: + files_resource_file_path = helpers.upload_file_to_container( + container_name=container_id, + application_name=application.name, + file_path=file, + 
blob_client=spark_client.blob_client, + use_full_path=False) + files_resource_file_paths.append(files_resource_file_path) + resource_files.append(files_resource_file_path) + + # Upload application definition + application.jars = [os.path.basename(jar) for jar in application.jars] + application.py_files = [os.path.basename(py_files) for py_files in application.py_files] + application.files = [os.path.basename(files) for files in application.files] + application_definition_file = helpers.upload_text_to_container( + container_name=container_id, + application_name=application.name, + file_path='application.yaml', + content=yaml.dump(vars(application)), + blob_client=spark_client.blob_client) + resource_files.append(application_definition_file) + + # create command to submit task + task_cmd = CommandBuilder('sudo docker exec') + task_cmd.add_argument('-i') + task_cmd.add_option('-e', 'AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR') + task_cmd.add_option('-e', 'STORAGE_LOGS_CONTAINER={0}'.format(container_id)) + task_cmd.add_argument('spark /bin/bash >> output.log 2>&1') + task_cmd.add_argument('-c "source ~/.bashrc; ' \ + 'export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; ' \ + 'cd \$AZ_BATCH_TASK_WORKING_DIR; ' \ + '\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/submit.py"') + + # Create task + task = batch_models.TaskAddParameter( + id=application.name, + command_line=helpers.wrap_commands_in_shell([task_cmd.to_str()]), + resource_files=resource_files, + constraints=batch_models.TaskConstraints(max_task_retry_count=application.max_retry_count), + user_identity=batch_models.UserIdentity( + auto_user=batch_models.AutoUserSpecification( + scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin))) + + return task diff --git a/aztk/spark/client/base/helpers/generate_cluster_start_task.py b/aztk/spark/client/base/helpers/generate_cluster_start_task.py new file mode 100644 index 00000000..3dc0c742 --- /dev/null +++ b/aztk/spark/client/base/helpers/generate_cluster_start_task.py @@ -0,0 +1,147 @@ +from typing import List + +import azure.batch.models as batch_models +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.internal.cluster_data import NodeData +from aztk.spark import models +from aztk.spark.utils import util +from aztk.utils import constants, helpers + +POOL_ADMIN_USER_IDENTITY = batch_models.UserIdentity( + auto_user=batch_models.AutoUserSpecification( + scope=batch_models.AutoUserScope.pool, elevation_level=batch_models.ElevationLevel.admin)) + + +def _get_aztk_environment(cluster_id, worker_on_master, mixed_mode): + envs = [] + envs.append(batch_models.EnvironmentSetting(name="AZTK_MIXED_MODE", value=helpers.bool_env(mixed_mode))) + envs.append(batch_models.EnvironmentSetting(name="AZTK_WORKER_ON_MASTER", value=helpers.bool_env(worker_on_master))) + envs.append(batch_models.EnvironmentSetting(name="AZTK_CLUSTER_ID", value=cluster_id)) + return envs + + +def __get_docker_credentials(spark_client): + creds = [] + docker = spark_client.secrets_config.docker + if docker: + if docker.endpoint: + creds.append(batch_models.EnvironmentSetting(name="DOCKER_ENDPOINT", value=docker.endpoint)) + if docker.username: + creds.append(batch_models.EnvironmentSetting(name="DOCKER_USERNAME", value=docker.username)) + if docker.password: + creds.append(batch_models.EnvironmentSetting(name="DOCKER_PASSWORD", value=docker.password)) + + return creds + + +def
__get_secrets_env(spark_client): + shared_key = spark_client.secrets_config.shared_key + service_principal = spark_client.secrets_config.service_principal + if shared_key: + return [ + batch_models.EnvironmentSetting(name="BATCH_SERVICE_URL", value=shared_key.batch_service_url), + batch_models.EnvironmentSetting(name="BATCH_ACCOUNT_KEY", value=shared_key.batch_account_key), + batch_models.EnvironmentSetting(name="STORAGE_ACCOUNT_NAME", value=shared_key.storage_account_name), + batch_models.EnvironmentSetting(name="STORAGE_ACCOUNT_KEY", value=shared_key.storage_account_key), + batch_models.EnvironmentSetting(name="STORAGE_ACCOUNT_SUFFIX", value=shared_key.storage_account_suffix), + ] + else: + return [ + batch_models.EnvironmentSetting(name="SP_TENANT_ID", value=service_principal.tenant_id), + batch_models.EnvironmentSetting(name="SP_CLIENT_ID", value=service_principal.client_id), + batch_models.EnvironmentSetting(name="SP_CREDENTIAL", value=service_principal.credential), + batch_models.EnvironmentSetting( + name="SP_BATCH_RESOURCE_ID", value=service_principal.batch_account_resource_id), + batch_models.EnvironmentSetting( + name="SP_STORAGE_RESOURCE_ID", value=service_principal.storage_account_resource_id), + ] + + +def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile, + gpu_enabled: bool, + docker_repo: str = None, + plugins=None, + worker_on_master: bool = True, + file_mounts=None, + mixed_mode: bool = False): + """ + For Docker on Ubuntu 16.04, return the command line + to be run in the pool's start task to set up Spark. + """ + default_docker_repo = constants.DEFAULT_DOCKER_REPO if not gpu_enabled else constants.DEFAULT_DOCKER_REPO_GPU + docker_repo = docker_repo or default_docker_repo + + shares = [] + + if file_mounts: + for mount in file_mounts: + # Create the directory on the node + shares.append('mkdir -p {0}'.format(mount.mount_path)) + + # Mount the file share + shares.append( + 'mount -t cifs //{0}.file.core.windows.net/{2} {3} -o vers=3.0,username={0},password={1},dir_mode=0777,file_mode=0777,sec=ntlmssp'. + format(mount.storage_account_name, mount.storage_account_key, mount.file_share_path, mount.mount_path)) + + setup = [ + 'time('\ + 'apt-get -y update;'\ + 'apt-get -y --no-install-recommends install unzip;'\ + 'unzip -o $AZ_BATCH_TASK_WORKING_DIR/{0};'\ + 'chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh;'\ + ') 2>&1'.format(zip_resource_file.file_path), + '/bin/bash $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh {0} {1}'.format( + constants.DOCKER_SPARK_CONTAINER_NAME, + docker_repo, + ) + ] + + commands = shares + setup + return commands + + +def generate_cluster_start_task(spark_client, + zip_resource_file: batch_models.ResourceFile, + cluster_id: str, + gpu_enabled: bool, + docker_repo: str = None, + file_shares: List[models.FileShare] = None, + plugins: List[models.PluginConfiguration] = None, + mixed_mode: bool = False, + worker_on_master: bool = True): + """ + This will return the start task object for the pool to be created.
+ :param cluster_id str: Id of the cluster (used for uploading the resource files) + :param zip_resource_file: Resource file object pointing to the zip file containing scripts to run on the node + """ + + resource_files = [zip_resource_file] + spark_web_ui_port = constants.DOCKER_SPARK_WEB_UI_PORT + spark_worker_ui_port = constants.DOCKER_SPARK_WORKER_UI_PORT + spark_job_ui_port = constants.DOCKER_SPARK_JOB_UI_PORT + + spark_container_name = constants.DOCKER_SPARK_CONTAINER_NAME + spark_submit_logs_file = constants.SPARK_SUBMIT_LOGS_FILE + + # TODO use certificate + environment_settings = __get_secrets_env(spark_client) + [ + batch_models.EnvironmentSetting(name="SPARK_WEB_UI_PORT", value=spark_web_ui_port), + batch_models.EnvironmentSetting(name="SPARK_WORKER_UI_PORT", value=spark_worker_ui_port), + batch_models.EnvironmentSetting(name="SPARK_JOB_UI_PORT", value=spark_job_ui_port), + batch_models.EnvironmentSetting(name="SPARK_CONTAINER_NAME", value=spark_container_name), + batch_models.EnvironmentSetting(name="SPARK_SUBMIT_LOGS_FILE", value=spark_submit_logs_file), + batch_models.EnvironmentSetting(name="AZTK_GPU_ENABLED", value=helpers.bool_env(gpu_enabled)), + ] + __get_docker_credentials(spark_client) + _get_aztk_environment(cluster_id, worker_on_master, mixed_mode) + + # start task command + command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, plugins, worker_on_master, file_shares, + mixed_mode) + + return batch_models.StartTask( + command_line=helpers.wrap_commands_in_shell(command), + resource_files=resource_files, + environment_settings=environment_settings, + user_identity=POOL_ADMIN_USER_IDENTITY, + wait_for_success=True) diff --git a/aztk/spark/client/cluster/client.py b/aztk/spark/client/cluster/client.py index d9238f0f..3d288195 100644 --- a/aztk/spark/client/cluster/client.py +++ b/aztk/spark/client/cluster/client.py @@ -1,9 +1,51 @@ -from aztk.spark.client.cluster.client import Client as cluster_client - -from .helpers import create from aztk.spark import models +from aztk.spark.client.base import SparkBaseClient + +from .helpers import (copy, create, create_user, delete, get, + get_application_log, get_application_status, list, + node_run, run, submit) -class Client(cluster_client): +class Client(SparkBaseClient): def create(self, cluster_configuration: models.ClusterConfiguration, wait: bool = False): return create.create_cluster(self, cluster_configuration, wait) + + def delete(self, cluster_id: str, keep_logs: bool = False): + return delete.delete_cluster(self, cluster_id, keep_logs) + + def get(self, cluster_id: str): + return get.get_cluster(self, cluster_id) + + def list(self): + return list.list_clusters(self) + + def submit(self, + cluster_id: str, + application: models.ApplicationConfiguration, + remote: bool = False, + wait: bool = False): + return submit.submit(self, cluster_id, application, remote, wait) + + def create_user(self, cluster_id: str, username: str, password: str = None, ssh_key: str = None): + return create_user.create_user(self, cluster_id, username, password, ssh_key) + + def get_application_log(self, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): + return get_application_log.get_application_log(self, cluster_id, application_name, tail, current_bytes) + + def get_application_status(self, cluster_id: str, application_name: str): + return get_application_status.get_application_status(self, cluster_id, application_name) + + def run(self, cluster_id: str, command: str, host=False, internal: bool =
False, timeout=None): + return run.cluster_run(self, cluster_id, command, host, internal, timeout) + + def node_run(self, cluster_id: str, node_id: str, command: str, host=False, internal: bool = False, timeout=None): + return node_run.node_run(self, cluster_id, node_id, command, host, internal, timeout) + + def copy(self, + cluster_id: str, + source_path: str, + destination_path: str, + host: bool = False, + internal: bool = False, + timeout: int = None): + return copy.cluster_copy(self, cluster_id, source_path, destination_path, host, internal, timeout) diff --git a/aztk/spark/client/cluster/helpers/copy.py b/aztk/spark/client/cluster/helpers/copy.py new file mode 100644 index 00000000..2434438d --- /dev/null +++ b/aztk/spark/client/cluster/helpers/copy.py @@ -0,0 +1,12 @@ +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.spark import helpers + + +def cluster_copy(spark_cluster_client, cluster_id: str, source_path: str, destination_path: str, host: bool = False, internal: bool = False, timeout: int = None): + try: + container_name = None if host else 'spark' + return spark_cluster_client.cluster_copy(cluster_id, source_path, destination_path=destination_path, container_name=container_name, get=False, internal=internal, timeout=timeout) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/create.py b/aztk/spark/client/cluster/helpers/create.py index 38d61727..df9ce399 100644 --- a/aztk/spark/client/cluster/helpers/create.py +++ b/aztk/spark/client/cluster/helpers/create.py @@ -28,139 +28,6 @@ def _apply_default_for_cluster_config(configuration: models.ClusterConfiguration return cluster_conf -def _get_aztk_environment(cluster_id, worker_on_master, mixed_mode): - envs = [] - envs.append(batch_models.EnvironmentSetting(name="AZTK_MIXED_MODE", value=helpers.bool_env(mixed_mode))) - envs.append(batch_models.EnvironmentSetting(name="AZTK_WORKER_ON_MASTER", value=helpers.bool_env(worker_on_master))) - envs.append(batch_models.EnvironmentSetting(name="AZTK_CLUSTER_ID", value=cluster_id)) - return envs - - -def __get_docker_credentials(spark_client): - creds = [] - docker = spark_client.secrets_config.docker - if docker: - if docker.endpoint: - creds.append(batch_models.EnvironmentSetting(name="DOCKER_ENDPOINT", value=docker.endpoint)) - if docker.username: - creds.append(batch_models.EnvironmentSetting(name="DOCKER_USERNAME", value=docker.username)) - if docker.password: - creds.append(batch_models.EnvironmentSetting(name="DOCKER_PASSWORD", value=docker.password)) - - return creds - - -def __get_secrets_env(spark_client): - shared_key = spark_client.secrets_config.shared_key - service_principal = spark_client.secrets_config.service_principal - if shared_key: - return [ - batch_models.EnvironmentSetting(name="BATCH_SERVICE_URL", value=shared_key.batch_service_url), - batch_models.EnvironmentSetting(name="BATCH_ACCOUNT_KEY", value=shared_key.batch_account_key), - batch_models.EnvironmentSetting(name="STORAGE_ACCOUNT_NAME", value=shared_key.storage_account_name), - batch_models.EnvironmentSetting(name="STORAGE_ACCOUNT_KEY", value=shared_key.storage_account_key), - batch_models.EnvironmentSetting(name="STORAGE_ACCOUNT_SUFFIX", value=shared_key.storage_account_suffix), - ] - else: - return [ - batch_models.EnvironmentSetting(name="SP_TENANT_ID", value=service_principal.tenant_id), - batch_models.EnvironmentSetting(name="SP_CLIENT_ID",
value=service_principal.client_id), - batch_models.EnvironmentSetting(name="SP_CREDENTIAL", value=service_principal.credential), - batch_models.EnvironmentSetting( - name="SP_BATCH_RESOURCE_ID", value=service_principal.batch_account_resource_id), - batch_models.EnvironmentSetting( - name="SP_STORAGE_RESOURCE_ID", value=service_principal.storage_account_resource_id), - ] - - -def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile, - gpu_enabled: bool, - docker_repo: str = None, - plugins=None, - worker_on_master: bool = True, - file_mounts=None, - mixed_mode: bool = False): - """ - For Docker on ubuntu 16.04 - return the command line - to be run on the start task of the pool to setup spark. - """ - default_docker_repo = constants.DEFAULT_DOCKER_REPO if not gpu_enabled else constants.DEFAULT_DOCKER_REPO_GPU - docker_repo = docker_repo or default_docker_repo - - shares = [] - - if file_mounts: - for mount in file_mounts: - # Create the directory on the node - shares.append('mkdir -p {0}'.format(mount.mount_path)) - - # Mount the file share - shares.append( - 'mount -t cifs //{0}.file.core.windows.net/{2} {3} -o vers=3.0,username={0},password={1},dir_mode=0777,file_mode=0777,sec=ntlmssp'. - format(mount.storage_account_name, mount.storage_account_key, mount.file_share_path, mount.mount_path)) - - setup = [ - 'time('\ - 'apt-get -y update;'\ - 'apt-get -y --no-install-recommends install unzip;'\ - 'unzip -o $AZ_BATCH_TASK_WORKING_DIR/{0};'\ - 'chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh;'\ - ') 2>&1'.format(zip_resource_file.file_path), - '/bin/bash $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh {0} {1}'.format( - constants.DOCKER_SPARK_CONTAINER_NAME, - docker_repo, - ) - ] - - commands = shares + setup - return commands - - -def generate_cluster_start_task(spark_client, - zip_resource_file: batch_models.ResourceFile, - cluster_id: str, - gpu_enabled: bool, - docker_repo: str = None, - file_shares: List[models.FileShare] = None, - plugins: List[models.PluginConfiguration] = None, - mixed_mode: bool = False, - worker_on_master: bool = True): - """ - This will return the start task object for the pool to be created. 
- :param cluster_id str: Id of the cluster(Used for uploading the resource files) - :param zip_resource_file: Resource file object pointing to the zip file containing scripts to run on the node - """ - - resource_files = [zip_resource_file] - spark_web_ui_port = constants.DOCKER_SPARK_WEB_UI_PORT - spark_worker_ui_port = constants.DOCKER_SPARK_WORKER_UI_PORT - spark_job_ui_port = constants.DOCKER_SPARK_JOB_UI_PORT - - spark_container_name = constants.DOCKER_SPARK_CONTAINER_NAME - spark_submit_logs_file = constants.SPARK_SUBMIT_LOGS_FILE - - # TODO use certificate - environment_settings = __get_secrets_env(spark_client) + [ - batch_models.EnvironmentSetting(name="SPARK_WEB_UI_PORT", value=spark_web_ui_port), - batch_models.EnvironmentSetting(name="SPARK_WORKER_UI_PORT", value=spark_worker_ui_port), - batch_models.EnvironmentSetting(name="SPARK_JOB_UI_PORT", value=spark_job_ui_port), - batch_models.EnvironmentSetting(name="SPARK_CONTAINER_NAME", value=spark_container_name), - batch_models.EnvironmentSetting(name="SPARK_SUBMIT_LOGS_FILE", value=spark_submit_logs_file), - batch_models.EnvironmentSetting(name="AZTK_GPU_ENABLED", value=helpers.bool_env(gpu_enabled)), - ] + __get_docker_credentials(spark_client) + _get_aztk_environment(cluster_id, worker_on_master, mixed_mode) - - # start task command - command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, plugins, worker_on_master, file_shares, - mixed_mode) - - return batch_models.StartTask( - command_line=helpers.wrap_commands_in_shell(command), - resource_files=resource_files, - environment_settings=environment_settings, - user_identity=POOL_ADMIN_USER_IDENTITY, - wait_for_success=True) - - def create_cluster(spark_cluster_client, cluster_conf: models.ClusterConfiguration, wait: bool = False): """ Create a new aztk spark cluster @@ -181,7 +48,7 @@ def create_cluster(spark_cluster_client, cluster_conf: models.ClusterConfigurati node_data = NodeData(cluster_conf).add_core().done() zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file() - start_task = generate_cluster_start_task(spark_cluster_client, zip_resource_files, cluster_conf.cluster_id, + start_task = spark_cluster_client.__generate_cluster_start_task(spark_cluster_client, zip_resource_files, cluster_conf.cluster_id, cluster_conf.gpu_enabled(), cluster_conf.get_docker_repo(), cluster_conf.file_shares, cluster_conf.plugins, cluster_conf.mixed_mode(), cluster_conf.worker_on_master) diff --git a/aztk/spark/client/cluster/helpers/create_user.py b/aztk/spark/client/cluster/helpers/create_user.py new file mode 100644 index 00000000..3803eb07 --- /dev/null +++ b/aztk/spark/client/cluster/helpers/create_user.py @@ -0,0 +1,15 @@ +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.spark import helpers + + +def create_user(spark_cluster_client, cluster_id: str, username: str, password: str = None, ssh_key: str = None) -> str: + try: + cluster = spark_cluster_client.get_cluster(cluster_id) + master_node_id = cluster.master_node_id + if not master_node_id: + raise error.ClusterNotReadyError("The master has not yet been picked, a user cannot be added.") + spark_cluster_client.create_user_on_pool(username, cluster.id, cluster.nodes, ssh_key, password) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/download.py b/aztk/spark/client/cluster/helpers/download.py new file mode 100644 index 00000000..d3782823 --- 
/dev/null +++ b/aztk/spark/client/cluster/helpers/download.py @@ -0,0 +1,19 @@ + +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.spark import helpers + + +def cluster_download(spark_cluster_client, cluster_id: str, source_path: str, destination_path: str = None, host: bool = False, internal: bool = False, timeout: int = None): + try: + container_name = None if host else 'spark' + return spark_cluster_client.cluster_copy(cluster_id, + source_path, + destination_path=destination_path, + container_name=container_name, + get=True, + internal=internal, + timeout=timeout) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/get_application_log.py b/aztk/spark/client/cluster/helpers/get_application_log.py new file mode 100644 index 00000000..1d9589e5 --- /dev/null +++ b/aztk/spark/client/cluster/helpers/get_application_log.py @@ -0,0 +1,114 @@ +import time + +import azure +import azure.batch.models as batch_models +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.spark import models +from aztk.utils import constants, helpers + +output_file = constants.TASK_WORKING_DIR + \ + "/" + constants.SPARK_SUBMIT_LOGS_FILE + + +def __check_task_node_exist(batch_client, cluster_id: str, task: batch_models.CloudTask) -> bool: + try: + batch_client.compute_node.get(cluster_id, task.node_info.node_id) + return True + except batch_error.BatchErrorException: + return False + + +def __wait_for_app_to_be_running(batch_client, cluster_id: str, application_name: str) -> batch_models.CloudTask: + """ + Wait for the Batch task to leave the waiting state and enter running (or completed, if it was fast enough) + """ + while True: + task = batch_client.task.get(cluster_id, application_name) + + if task.state is batch_models.TaskState.active or task.state is batch_models.TaskState.preparing: + # TODO: log + time.sleep(5) + else: + return task + + +def __get_output_file_properties(batch_client, cluster_id: str, application_name: str): + while True: + try: + file = helpers.get_file_properties(cluster_id, application_name, output_file, batch_client) + return file + except batch_error.BatchErrorException as e: + if e.response.status_code == 404: + # TODO: log + time.sleep(5) + continue + else: + raise e + + +def get_log_from_storage(blob_client, container_name, application_name, task): + try: + blob = blob_client.get_blob_to_text(container_name, application_name + '/' + constants.SPARK_SUBMIT_LOGS_FILE) + except azure.common.AzureMissingResourceHttpError: + raise error.AztkError("Logs not found in your storage account.
They were either deleted or never existed.") + + return models.ApplicationLog( + name=application_name, + cluster_id=container_name, + application_state=task.state._value_, + log=blob.content, + total_bytes=blob.properties.content_length, + exit_code=task.execution_info.exit_code) + + +def get_log(batch_client, blob_client, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): + job_id = cluster_id + task_id = application_name + + task = __wait_for_app_to_be_running(batch_client, cluster_id, application_name) + + if not __check_task_node_exist(batch_client, cluster_id, task): + return get_log_from_storage(blob_client, cluster_id, application_name, task) + + file = __get_output_file_properties(batch_client, cluster_id, application_name) + target_bytes = file.content_length + + if target_bytes != current_bytes: + ocp_range = None + + if tail: + ocp_range = "bytes={0}-{1}".format(current_bytes, target_bytes - 1) + + stream = batch_client.file.get_from_task( + job_id, task_id, output_file, batch_models.FileGetFromTaskOptions(ocp_range=ocp_range)) + content = helpers.read_stream_as_string(stream) + + return models.ApplicationLog( + name=application_name, + cluster_id=cluster_id, + application_state=task.state._value_, + log=content, + total_bytes=target_bytes, + exit_code=task.execution_info.exit_code) + else: + return models.ApplicationLog( + name=application_name, + cluster_id=cluster_id, + application_state=task.state._value_, + log='', + total_bytes=target_bytes, + exit_code=task.execution_info.exit_code) + + +def get_application_log(spark_cluster_client, + cluster_id: str, + application_name: str, + tail=False, + current_bytes: int = 0): + try: + return get_log(spark_cluster_client.batch_client, spark_cluster_client.blob_client, cluster_id, + application_name, tail, current_bytes) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/get_application_status.py b/aztk/spark/client/cluster/helpers/get_application_status.py new file mode 100644 index 00000000..fafe6521 --- /dev/null +++ b/aztk/spark/client/cluster/helpers/get_application_status.py @@ -0,0 +1,12 @@ +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.spark import helpers + + +def get_application_status(spark_cluster_client, cluster_id: str, app_name: str): + try: + task = spark_cluster_client.batch_client.task.get(cluster_id, app_name) + return task.state._value_ + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/node_run.py b/aztk/spark/client/cluster/helpers/node_run.py new file mode 100644 index 00000000..89a200f4 --- /dev/null +++ b/aztk/spark/client/cluster/helpers/node_run.py @@ -0,0 +1,18 @@ +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.spark import helpers + + +def node_run(spark_cluster_client, + cluster_id: str, + node_id: str, + command: str, + host=False, + internal: bool = False, + timeout=None): + try: + return spark_cluster_client.__node_run( + cluster_id, node_id, command, internal, container_name='spark' if not host else None, timeout=timeout) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/run.py b/aztk/spark/client/cluster/helpers/run.py new file mode 100644 index 00000000..c4eaf3a5 --- /dev/null +++ 
b/aztk/spark/client/cluster/helpers/run.py @@ -0,0 +1,12 @@ +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.spark import helpers + + +def cluster_run(spark_cluster_client, cluster_id: str, command: str, host=False, internal: bool = False, timeout=None): + try: + return spark_cluster_client.cluster_run( + cluster_id, command, internal, container_name='spark' if not host else None, timeout=timeout) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/ssh_into_master.py b/aztk/spark/client/cluster/helpers/ssh_into_master.py new file mode 100644 index 00000000..aec66517 --- /dev/null +++ b/aztk/spark/client/cluster/helpers/ssh_into_master.py @@ -0,0 +1,12 @@ + +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.spark import helpers + + +def cluster_ssh_into_master(spark_cluster_client, cluster_id, node_id, username, ssh_key=None, password=None, port_forward_list=None, internal=False): + try: + spark_cluster_client.ssh_into_node(cluster_id, node_id, username, ssh_key, password, port_forward_list, internal) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/submit.py b/aztk/spark/client/cluster/helpers/submit.py index 689c1b68..ac164f45 100644 --- a/aztk/spark/client/cluster/helpers/submit.py +++ b/aztk/spark/client/cluster/helpers/submit.py @@ -7,7 +7,7 @@ from aztk.error import AztkError from aztk.spark import models -from aztk.utils import constants, helpers +from aztk.utils import helpers from aztk.utils.command_builder import CommandBuilder import azure.batch.models.batch_error as batch_error from aztk import error @@ -21,94 +21,6 @@ def __get_node(spark_client, node_id: str, cluster_id: str) -> batch_models.Comp return spark_client.batch_client.compute_node.get(cluster_id, node_id) -def generate_task(spark_client, container_id, application, remote=False): - resource_files = [] - - # The application provided is not hosted remotely and therefore must be uploaded - if not remote: - app_resource_file = helpers.upload_file_to_container( - container_name=container_id, - application_name=application.name, - file_path=application.application, - blob_client=spark_client.blob_client, - use_full_path=False) - - # Upload application file - resource_files.append(app_resource_file) - - application.application = '$AZ_BATCH_TASK_WORKING_DIR/' + os.path.basename(application.application) - - # Upload dependent JARS - jar_resource_file_paths = [] - for jar in application.jars: - current_jar_resource_file_path = helpers.upload_file_to_container( - container_name=container_id, - application_name=application.name, - file_path=jar, - blob_client=spark_client.blob_client, - use_full_path=False) - jar_resource_file_paths.append(current_jar_resource_file_path) - resource_files.append(current_jar_resource_file_path) - - # Upload dependent python files - py_files_resource_file_paths = [] - for py_file in application.py_files: - current_py_files_resource_file_path = helpers.upload_file_to_container( - container_name=container_id, - application_name=application.name, - file_path=py_file, - blob_client=spark_client.blob_client, - use_full_path=False) - py_files_resource_file_paths.append(current_py_files_resource_file_path) - resource_files.append(current_py_files_resource_file_path) - - # Upload other dependent files - 
files_resource_file_paths = [] - for file in application.files: - files_resource_file_path = helpers.upload_file_to_container( - container_name=container_id, - application_name=application.name, - file_path=file, - blob_client=spark_client.blob_client, - use_full_path=False) - files_resource_file_paths.append(files_resource_file_path) - resource_files.append(files_resource_file_path) - - # Upload application definition - application.jars = [os.path.basename(jar) for jar in application.jars] - application.py_files = [os.path.basename(py_files) for py_files in application.py_files] - application.files = [os.path.basename(files) for files in application.files] - application_definition_file = helpers.upload_text_to_container( - container_name=container_id, - application_name=application.name, - file_path='application.yaml', - content=yaml.dump(vars(application)), - blob_client=spark_client.blob_client) - resource_files.append(application_definition_file) - - # create command to submit task - task_cmd = CommandBuilder('sudo docker exec') - task_cmd.add_argument('-i') - task_cmd.add_option('-e', 'AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR') - task_cmd.add_option('-e', 'STORAGE_LOGS_CONTAINER={0}'.format(container_id)) - task_cmd.add_argument('spark /bin/bash >> output.log 2>&1') - task_cmd.add_argument('-c "source ~/.bashrc; ' \ - 'export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; ' \ - 'cd \$AZ_BATCH_TASK_WORKING_DIR; ' \ - '\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/submit.py"') - - # Create task - task = batch_models.TaskAddParameter( - id=application.name, - command_line=helpers.wrap_commands_in_shell([task_cmd.to_str()]), - resource_files=resource_files, - constraints=batch_models.TaskConstraints(max_task_retry_count=application.max_retry_count), - user_identity=batch_models.UserIdentity( - auto_user=batch_models.AutoUserSpecification( - scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin))) - - return task - def affinitize_task_to_master(spark_client, cluster_id, task): cluster = spark_client.get_cluster(cluster_id) @@ -123,7 +35,7 @@ def submit_application(spark_client, cluster_id, application, remote: bool = Fal """ Submit a spark app """ - task = generate_task(spark_client, cluster_id, application, remote) + task = spark_client.generate_application_task(spark_client, cluster_id, application, remote) task = affinitize_task_to_master(spark_client, cluster_id, task) # Add task to batch job (which has the same name as cluster_id) diff --git a/aztk/spark/client/job/client.py b/aztk/spark/client/job/client.py index 4a78f660..2aa5f547 100644 --- a/aztk/spark/client/job/client.py +++ b/aztk/spark/client/job/client.py @@ -1,4 +1,31 @@ -import aztk.spark.Client +from aztk.spark import models +from aztk.spark.client.base import SparkBaseClient -class Client(): - pass +from .helpers import (delete, get, get_application, get_application_log, list, + list_applications, stop, submit) + + +class Client(SparkBaseClient): + def list(self): + return list.list_jobs(self) + + def delete(self, id, keep_logs: bool = False): + return delete.delete(self, id, keep_logs) + + def get(self, id): + return get.get_job(self, id) + + def get_application(self, id, application_name): + return get_application.get_application(self, id, application_name) + + def get_application_log(self, id, application_name): + return get_application_log.get_job_application_log(self, id, application_name) + + def list_applications(self, id): + return 
list_applications.list_applications(self, id) + + def stop(self, id): + return stop.stop(self, id) + + def submit(self, job_configuration: models.JobConfiguration): + return submit.submit_job(self, job_configuration) diff --git a/aztk/spark/client/job/helpers/delete.py b/aztk/spark/client/job/helpers/delete.py new file mode 100644 index 00000000..2c388b4a --- /dev/null +++ b/aztk/spark/client/job/helpers/delete.py @@ -0,0 +1,39 @@ +import azure.batch.models as batch_models +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.spark import models +from aztk.utils import helpers + +from .get_recent_job import get_recent_job + + +def _delete(spark_client, job_id, keep_logs: bool = False): + recent_run_job = get_recent_job(spark_client, job_id) + deleted_job_or_job_schedule = False + # delete job + try: + spark_client.batch_client.job.delete(recent_run_job.id) + deleted_job_or_job_schedule = True + except batch_models.batch_error.BatchErrorException: + pass + # delete job_schedule + try: + spark_client.batch_client.job_schedule.delete(job_id) + deleted_job_or_job_schedule = True + except batch_models.batch_error.BatchErrorException: + pass + + # delete the storage container unless the caller asked to keep the logs + if not keep_logs: + cluster_data = spark_client.get_cluster_data(job_id) + cluster_data.delete_container(job_id) + + return deleted_job_or_job_schedule + + +def delete(spark_job_client, job_id: str, keep_logs: bool = False): + try: + return _delete(spark_job_client, job_id, keep_logs) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/helpers/get.py b/aztk/spark/client/job/helpers/get.py new file mode 100644 index 00000000..0677684f --- /dev/null +++ b/aztk/spark/client/job/helpers/get.py @@ -0,0 +1,32 @@ +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.spark import models +from aztk.utils import helpers + +from .get_recent_job import get_recent_job + + +def _get_job(spark_client, job_id): + job = spark_client.batch_client.job_schedule.get(job_id) + job_apps = [ + app for app in spark_client.batch_client.task.list(job_id=job.execution_info.recent_job.id) if app.id != job_id + ] + recent_run_job = get_recent_job(spark_client, job_id) + pool_prefix = recent_run_job.pool_info.auto_pool_specification.auto_pool_id_prefix + pool = nodes = None + for cloud_pool in spark_client.batch_client.pool.list(): + if pool_prefix in cloud_pool.id: + pool = cloud_pool + break + if pool: + nodes = spark_client.batch_client.compute_node.list(pool_id=pool.id) + return job, job_apps, pool, nodes + + +def get_job(spark_job_client, job_id): + try: + job, apps, pool, nodes = _get_job(spark_job_client, job_id) + return models.Job(job, apps, pool, nodes) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/helpers/get_application.py b/aztk/spark/client/job/helpers/get_application.py new file mode 100644 index 00000000..b06b9891 --- /dev/null +++ b/aztk/spark/client/job/helpers/get_application.py @@ -0,0 +1,25 @@ +import azure.batch.models as batch_models +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.spark import models +from aztk.utils import helpers + +from .get_recent_job import get_recent_job + + +def _get_application(spark_client, job_id, application_name): + # info about the app + recent_run_job = get_recent_job(spark_client, job_id) + try: + return
spark_client.batch_client.task.get(job_id=recent_run_job.id, task_id=application_name) + except batch_models.batch_error.BatchErrorException: + raise error.AztkError( + "The Spark application {0} is still being provisioned or does not exist.".format(application_name)) + + +def get_application(spark_job_client, job_id, application_name): + try: + return models.Application(_get_application(spark_job_client, job_id, application_name)) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/helpers/get_application_log.py b/aztk/spark/client/job/helpers/get_application_log.py new file mode 100644 index 00000000..8eaa04bd --- /dev/null +++ b/aztk/spark/client/job/helpers/get_application_log.py @@ -0,0 +1,39 @@ +import azure.batch.models as batch_models +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.spark import models +from aztk.utils import helpers + +from .list_applications import list_applications +from .get_recent_job import get_recent_job + + +def _get_application_log(spark_client, job_id, application_name): + # TODO: change where the logs are uploaded so they aren't overwritten on scheduled runs + # current: job_id, application_name/output.log + # new: job_id, recent_run_job.id/application_name/output.log + recent_run_job = get_recent_job(spark_client, job_id) + try: + task = spark_client.batch_client.task.get(job_id=recent_run_job.id, task_id=application_name) + except batch_models.batch_error.BatchErrorException as e: + # see if the application is written to metadata of pool + applications = list_applications(spark_client, job_id) + + for application in applications: + if applications[application] is None and application == application_name: + raise error.AztkError("The application {0} has not yet been created.".format(application)) + raise error.AztkError("The application {0} does not exist".format(application_name)) + else: + if task.state in (batch_models.TaskState.active, batch_models.TaskState.running, + batch_models.TaskState.preparing): + raise error.AztkError("The application {0} has not yet finished executing.".format(application_name)) + + return spark_client.cluster.get_application_log(job_id, application_name) + + +def get_job_application_log(spark_job_client, job_id, application_name): + try: + return _get_application_log(spark_job_client, job_id, application_name) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/helpers/get_recent_job.py b/aztk/spark/client/job/helpers/get_recent_job.py new file mode 100644 index 00000000..6b4c7d17 --- /dev/null +++ b/aztk/spark/client/job/helpers/get_recent_job.py @@ -0,0 +1,3 @@ +def get_recent_job(spark_client, job_id): + job_schedule = spark_client.batch_client.job_schedule.get(job_id) + return spark_client.batch_client.job.get(job_schedule.execution_info.recent_job.id) diff --git a/aztk/spark/client/job/helpers/list.py b/aztk/spark/client/job/helpers/list.py new file mode 100644 index 00000000..db253df4 --- /dev/null +++ b/aztk/spark/client/job/helpers/list.py @@ -0,0 +1,16 @@ +import azure.batch.models.batch_error as batch_error + +import aztk.models # TODO: get rid of this import and use aztk.spark.models +from aztk import error +from aztk.spark import helpers, models + + +def _list_jobs(spark_client): + return [cloud_job_schedule for cloud_job_schedule in spark_client.batch_client.job_schedule.list()] + + +def 
list_jobs(self): + try: + return [models.Job(cloud_job_schedule) for cloud_job_schedule in _list_jobs(self)] + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/helpers/list_applications.py b/aztk/spark/client/job/helpers/list_applications.py new file mode 100644 index 00000000..6bc12844 --- /dev/null +++ b/aztk/spark/client/job/helpers/list_applications.py @@ -0,0 +1,35 @@ +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.spark import models +from aztk.utils import helpers + +from .get_recent_job import get_recent_job + + +def _list_applications(spark_client, job_id): + recent_run_job = get_recent_job(spark_client, job_id) + # get application names from Batch job metadata + applications = {} + for metadata_item in recent_run_job.metadata: + if metadata_item.name == "applications": + for app_name in metadata_item.value.split('\n'): + applications[app_name] = None + + # get tasks from Batch job + for task in spark_client.batch_client.task.list(recent_run_job.id): + if task.id != job_id: + applications[task.id] = task + + return applications + + +def list_applications(spark_job_client, job_id): + try: + applications = _list_applications(spark_job_client, job_id) + for item in applications: + if applications[item]: + applications[item] = models.Application(applications[item]) + return applications + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/helpers/stop.py b/aztk/spark/client/job/helpers/stop.py new file mode 100644 index 00000000..e19bd696 --- /dev/null +++ b/aztk/spark/client/job/helpers/stop.py @@ -0,0 +1,22 @@ +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.spark import models +from aztk.utils import helpers + +from .get_recent_job import get_recent_job + + +def _stop(spark_client, job_id): + # terminate currently running job and tasks + recent_run_job = get_recent_job(spark_client, job_id) + spark_client.batch_client.job.terminate(recent_run_job.id) + # terminate job_schedule + spark_client.batch_client.job_schedule.terminate(job_id) + + +def stop(self, job_id): + try: + return _stop(self, job_id) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/helpers/submit.py b/aztk/spark/client/job/helpers/submit.py new file mode 100644 index 00000000..c955fad4 --- /dev/null +++ b/aztk/spark/client/job/helpers/submit.py @@ -0,0 +1,111 @@ +import azure.batch.models as batch_models +import azure.batch.models.batch_error as batch_error +import yaml + +from aztk import error +from aztk.internal.cluster_data import NodeData +from aztk.spark import models +from aztk.utils import helpers +from aztk.utils.command_builder import CommandBuilder + + +def __app_cmd(): + docker_exec = CommandBuilder("sudo docker exec") + docker_exec.add_argument("-i") + docker_exec.add_option("-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR") + docker_exec.add_option("-e", "AZ_BATCH_JOB_ID=$AZ_BATCH_JOB_ID") + docker_exec.add_argument("spark /bin/bash >> output.log 2>&1 -c \"" \ + "source ~/.bashrc; " \ + "export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; " \ + "cd \$AZ_BATCH_TASK_WORKING_DIR; " \ + "\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/job_submission.py\"") + return docker_exec.to_str() + + +def 
generate_job_manager_task(spark_client, job, application_tasks): + resource_files = [] + for application, task in application_tasks: + task_definition_resource_file = helpers.upload_text_to_container( + container_name=job.id, + application_name=application.name + '.yaml', + file_path=application.name + '.yaml', + content=yaml.dump(task), + blob_client=spark_client.blob_client) + resource_files.append(task_definition_resource_file) + + task_cmd = __app_cmd() + + # Create task + task = batch_models.JobManagerTask( + id=job.id, + command_line=helpers.wrap_commands_in_shell([task_cmd]), + resource_files=resource_files, + kill_job_on_completion=False, + allow_low_priority_node=True, + user_identity=batch_models.UserIdentity( + auto_user=batch_models.AutoUserSpecification( + scope=batch_models.AutoUserScope.task, elevation_level=batch_models.ElevationLevel.admin))) + + return task + + +def _default_scheduling_target(vm_count: int): + if vm_count == 0: + return models.SchedulingTarget.Any + else: + return models.SchedulingTarget.Dedicated + + +def _apply_default_for_job_config(job_conf: models.JobConfiguration): + if job_conf.scheduling_target is None: + job_conf.scheduling_target = _default_scheduling_target(job_conf.max_dedicated_nodes) + + return job_conf + + +def submit_job(spark_job_client, job_configuration: models.JobConfiguration): + try: + job_configuration = _apply_default_for_job_config(job_configuration) + job_configuration.validate() + cluster_data = spark_job_client._get_cluster_data(job_configuration.id) + node_data = NodeData(job_configuration.to_cluster_config()).add_core().done() + zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file() + + start_task = spark_job_client.__generate_cluster_start_task( + spark_job_client, + zip_resource_files, + job_configuration.id, + job_configuration.gpu_enabled, + job_configuration.get_docker_repo(), + mixed_mode=job_configuration.mixed_mode(), + worker_on_master=job_configuration.worker_on_master) + + application_tasks = [] + for application in job_configuration.applications: + application_tasks.append((application, + spark_job_client.__generate_application_task(spark_job_client, job_configuration.id, application))) + + job_manager_task = generate_job_manager_task(spark_job_client, job_configuration, application_tasks) + + software_metadata_key = "spark" + + vm_image = models.VmImage(publisher='Canonical', offer='UbuntuServer', sku='16.04') + + autoscale_formula = "$TargetDedicatedNodes = {0}; " \ + "$TargetLowPriorityNodes = {1}".format( + job_configuration.max_dedicated_nodes, + job_configuration.max_low_pri_nodes) + + job = spark_job_client.__submit_job( + job_configuration=job_configuration, + start_task=start_task, + job_manager_task=job_manager_task, + autoscale_formula=autoscale_formula, + software_metadata_key=software_metadata_key, + vm_image_model=vm_image, + application_metadata='\n'.join(application.name for application in (job_configuration.applications or []))) + + return models.Job(job) + + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) From 0da3f7342d64e9dd742929137ed2c621c70b32aa Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Mon, 25 Jun 2018 16:06:42 -0700 Subject: [PATCH 03/52] fix imports --- aztk/client/client.py | 4 ++-- aztk/client/cluster/__init__.py | 1 + aztk/client/job/__init__.py | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/aztk/client/client.py b/aztk/client/client.py index eba03384..73fe093a 100644 --- 
a/aztk/client/client.py +++ b/aztk/client/client.py @@ -1,5 +1,5 @@ -from aztk.spark.client.cluster import Client as cluster_client -from aztk.spark.client.job import Client as job_client +from aztk.client.cluster import Client as cluster_client +from aztk.client.job import Client as job_client class Client: cluster = cluster_client diff --git a/aztk/client/cluster/__init__.py b/aztk/client/cluster/__init__.py index e69de29b..3ff722bf 100644 --- a/aztk/client/cluster/__init__.py +++ b/aztk/client/cluster/__init__.py @@ -0,0 +1 @@ +from .client import Client diff --git a/aztk/client/job/__init__.py b/aztk/client/job/__init__.py index e69de29b..3ff722bf 100644 --- a/aztk/client/job/__init__.py +++ b/aztk/client/job/__init__.py @@ -0,0 +1 @@ +from .client import Client From 1a821b78c7b587aa11cdf8921ee2e3d64b3b9809 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Tue, 26 Jun 2018 10:29:24 -0700 Subject: [PATCH 04/52] fixes --- aztk/client/job/client.py | 4 ++-- aztk/client/job/helpers/submit.py | 3 +-- aztk/spark/client/client.py | 3 ++- aztk/spark/client/cluster/client.py | 3 +-- aztk/spark/client/cluster/helpers/list.py | 2 +- aztk/spark/client/job/client.py | 3 +-- aztk/spark/client/job/helpers/submit.py | 2 +- 7 files changed, 9 insertions(+), 11 deletions(-) diff --git a/aztk/client/job/client.py b/aztk/client/job/client.py index ebf4ab62..d1c2fcd1 100644 --- a/aztk/client/job/client.py +++ b/aztk/client/job/client.py @@ -4,7 +4,7 @@ class Client(BaseClient): - def submit(self, job_configuration, start_task, job_manager_task, autoscale_formula, software_metadata_key: str, - vm_image_model, application_metadata): + def __submit(self, job_configuration, start_task, job_manager_task, autoscale_formula, software_metadata_key: str, + vm_image_model, application_metadata): return submit.submit_job(self, job_configuration, start_task, job_manager_task, autoscale_formula, software_metadata_key, vm_image_model, application_metadata) diff --git a/aztk/client/job/helpers/submit.py b/aztk/client/job/helpers/submit.py index 901dd37f..4c8ee7b8 100644 --- a/aztk/client/job/helpers/submit.py +++ b/aztk/client/job/helpers/submit.py @@ -12,8 +12,7 @@ def submit_job( autoscale_formula, software_metadata_key: str, vm_image_model, - application_metadata, -): + application_metadata): """ Job Submission :param job_configuration -> aztk_sdk.spark.models.JobConfiguration diff --git a/aztk/spark/client/client.py b/aztk/spark/client/client.py index 89409e51..8059ece2 100644 --- a/aztk/spark/client/client.py +++ b/aztk/spark/client/client.py @@ -1,7 +1,8 @@ from aztk.spark.client.cluster import Client as cluster_client from aztk.spark.client.job import Client as job_client +from aztk.spark.client.base import SparkBaseClient -class Client(): +class Client(SparkBaseClient): cluster = cluster_client job = job_client diff --git a/aztk/spark/client/cluster/client.py b/aztk/spark/client/cluster/client.py index 3d288195..965c009e 100644 --- a/aztk/spark/client/cluster/client.py +++ b/aztk/spark/client/cluster/client.py @@ -1,12 +1,11 @@ from aztk.spark import models -from aztk.spark.client.base import SparkBaseClient from .helpers import (copy, create, create_user, delete, get, get_application_log, get_application_status, list, node_run, run, submit) -class Client(SparkBaseClient): +class Client: def create(self, cluster_configuration: models.ClusterConfiguration, wait: bool = False): return create.create_cluster(self, cluster_configuration, wait) diff --git a/aztk/spark/client/cluster/helpers/list.py 
b/aztk/spark/client/cluster/helpers/list.py index 958b0e10..5dd661ce 100644 --- a/aztk/spark/client/cluster/helpers/list.py +++ b/aztk/spark/client/cluster/helpers/list.py @@ -8,6 +8,6 @@ def list_clusters(spark_cluster_client): try: - return [models.Cluster(pool) for pool in spark_cluster_client.__list_clusters(aztk.models.Software.spark)] + return [models.Cluster(pool) for pool in spark_cluster_client.list(aztk.models.Software.spark)] except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/client.py b/aztk/spark/client/job/client.py index 2aa5f547..a11010f8 100644 --- a/aztk/spark/client/job/client.py +++ b/aztk/spark/client/job/client.py @@ -1,11 +1,10 @@ from aztk.spark import models -from aztk.spark.client.base import SparkBaseClient from .helpers import (delete, get, get_application, get_application_log, list, list_applications, stop, submit) -class Client(SparkBaseClient): +class Client: def list(self): return list.list_jobs(self) diff --git a/aztk/spark/client/job/helpers/submit.py b/aztk/spark/client/job/helpers/submit.py index c955fad4..a7def446 100644 --- a/aztk/spark/client/job/helpers/submit.py +++ b/aztk/spark/client/job/helpers/submit.py @@ -96,7 +96,7 @@ def submit_job(spark_job_client, job_configuration: models.JobConfiguration): job_configuration.max_dedicated_nodes, job_configuration.max_low_pri_nodes) - job = spark_job_client.__submit_job( + job = spark_job_client.__submit( job_configuration=job_configuration, start_task=start_task, job_manager_task=job_manager_task, From 5f1542d6d9b674e53aa65026e8b8aaed369fe802 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Tue, 26 Jun 2018 16:52:32 -0700 Subject: [PATCH 05/52] fixes --- aztk/client/__init__.py | 2 +- aztk/client/base/__init__.py | 2 +- .../base/{client.py => base_operations.py} | 34 ++++++++++++------- .../base/helpers/create_user_on_node.py | 14 ++++---- .../base/helpers/delete_user_on_pool.py | 4 +-- .../base/helpers/generate_user_on_node.py | 4 +-- .../base/helpers/get_remote_login_settings.py | 22 ++++++++++++ aztk/client/base/helpers/node_run.py | 5 +-- .../base/helpers/{cluster_run.py => run.py} | 8 ++--- aztk/client/client.py | 28 ++++++++++++--- aztk/client/cluster/__init__.py | 2 +- aztk/client/cluster/helpers/copy.py | 4 +-- .../cluster/{client.py => operations.py} | 8 ++--- aztk/client/job/__init__.py | 2 +- aztk/client/job/client.py | 10 ------ aztk/client/job/operations.py | 10 ++++++ aztk/spark/client.py | 2 +- aztk/spark/client/base/__init__.py | 2 +- .../client/base/{client.py => operations.py} | 4 +-- aztk/spark/client/client.py | 30 ++++++++++++---- aztk/spark/client/cluster/__init__.py | 2 +- aztk/spark/client/cluster/helpers/create.py | 4 +-- .../client/cluster/helpers/create_user.py | 2 +- aztk/spark/client/cluster/helpers/delete.py | 2 +- aztk/spark/client/cluster/helpers/get.py | 4 +-- aztk/spark/client/cluster/helpers/list.py | 3 +- aztk/spark/client/cluster/helpers/node_run.py | 2 +- aztk/spark/client/cluster/helpers/run.py | 2 +- .../cluster/{client.py => operations.py} | 9 ++--- aztk/spark/client/job/__init__.py | 2 +- .../client/job/{client.py => operations.py} | 6 ++-- 31 files changed, 153 insertions(+), 82 deletions(-) rename aztk/client/base/{client.py => base_operations.py} (58%) create mode 100644 aztk/client/base/helpers/get_remote_login_settings.py rename aztk/client/base/helpers/{cluster_run.py => run.py} (82%) rename aztk/client/cluster/{client.py => operations.py} (77%) delete mode 100644 
aztk/client/job/client.py create mode 100644 aztk/client/job/operations.py rename aztk/spark/client/base/{client.py => operations.py} (92%) rename aztk/spark/client/cluster/{client.py => operations.py} (83%) rename aztk/spark/client/job/{client.py => operations.py} (85%) diff --git a/aztk/client/__init__.py b/aztk/client/__init__.py index 3ff722bf..68316999 100644 --- a/aztk/client/__init__.py +++ b/aztk/client/__init__.py @@ -1 +1 @@ -from .client import Client +from .client import CoreClient diff --git a/aztk/client/base/__init__.py b/aztk/client/base/__init__.py index b957ca23..c6041e47 100644 --- a/aztk/client/base/__init__.py +++ b/aztk/client/base/__init__.py @@ -1 +1 @@ -from .client import BaseClient +from .base_operations import BaseOperations diff --git a/aztk/client/base/client.py b/aztk/client/base/base_operations.py similarity index 58% rename from aztk/client/base/client.py rename to aztk/client/base/base_operations.py index cb01b6bb..859b84fd 100644 --- a/aztk/client/base/client.py +++ b/aztk/client/base/base_operations.py @@ -1,24 +1,22 @@ import aztk.models as models from aztk.internal import cluster_data from aztk.utils import ssh as ssh_lib -from aztk.utils import azure_api -from .helpers import (create_user_on_node, create_user_on_pool, - delete_user_on_node, generate_user_on_node, - generate_user_on_pool, ssh_into_node) +from .helpers import (run, create_user_on_node, create_user_on_pool, + delete_user_on_node, delete_user_on_pool, + generate_user_on_node, generate_user_on_pool, + get_remote_login_settings, node_run, ssh_into_node) -class BaseClient: +class BaseOperations: ''' - Base client that all other clients inherit from + Base operations that all other operations inherit from ''' - def __init__(self, secrets_config: models.SecretsConfiguration): - self.secrets_config = secrets_config - - azure_api.validate_secrets(secrets_config) - self.batch_client = azure_api.make_batch_client(secrets_config) - self.blob_client = azure_api.make_blob_client(secrets_config) + def __init__(self, context): + self.batch_client = context['batch_client'] + self.blob_client = context['blob_client'] + self.secrets_configuration = context['secrets_configuration'] def get_cluster_config(self, cluster_id: str) -> models.ClusterConfiguration: return self.get_cluster_data(cluster_id).read_cluster_config() @@ -56,3 +54,15 @@ def generate_user_on_pool(self, pool_id, nodes): def delete_user_on_node(self, pool_id: str, node_id: str, username: str) -> str: return delete_user_on_node.delete_user(self, pool_id, node_id, username) + + def delete_user_on_pool(self, username, pool_id, nodes): #TODO: change from pool_id, nodes to cluster_id + return delete_user_on_pool.delete_user_on_pool(self, username, pool_id, nodes) + + def node_run(self, cluster_id, node_id, command, internal, container_name=None, timeout=None): + return node_run.node_run(self, cluster_id, node_id, command, internal, container_name, timeout) + + def get_remote_login_settings(self, cluster_id: str, node_id: str): + return get_remote_login_settings.get_remote_login_settings(self, cluster_id, node_id) + + def run(self, cluster_id, command, internal, container_name=None, timeout=None): + return run.cluster_run(self, cluster_id, command, internal, container_name, timeout) diff --git a/aztk/client/base/helpers/create_user_on_node.py b/aztk/client/base/helpers/create_user_on_node.py index b0b69f44..77b21fab 100644 --- a/aztk/client/base/helpers/create_user_on_node.py +++ b/aztk/client/base/helpers/create_user_on_node.py @@ -1,5 +1,4 @@ 
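Worth pausing on the constructor change above: BaseOperations now receives a single context dict assembled by the client, instead of building its own Azure clients from the secrets. A minimal runnable sketch of that pattern, using dummy stand-ins rather than the real Batch and Blob SDK clients (all names here are illustrative):

class DummyBatchClient:      # stand-in for the azure-batch client object
    pass

class DummyBlobClient:       # stand-in for the blob storage client object
    pass

class BaseOperations:
    def __init__(self, context):
        # every operations object unpacks the same three shared entries
        self.batch_client = context['batch_client']
        self.blob_client = context['blob_client']
        self.secrets_configuration = context['secrets_configuration']

class ClusterOperations(BaseOperations):
    def get(self, cluster_id):
        # the real method delegates to a helpers module and calls Batch
        return 'details-for-' + cluster_id

context = {
    'batch_client': DummyBatchClient(),
    'blob_client': DummyBlobClient(),
    'secrets_configuration': None,
}
print(ClusterOperations(context).get('my-cluster'))    # details-for-my-cluster

The payoff is that cluster and job operations share one set of authenticated clients, which is exactly what CoreClient.get_context builds later in this patch. The patch continues below with the datetime cleanup in create_user_on_node.py.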
-import datetime -from datetime import datetime +from datetime import datetime, timedelta, timezone import azure.batch.models as batch_models import azure.batch.models.batch_error as batch_error @@ -25,18 +24,19 @@ def __create_user(self, pool_id: str, node_id: str, username: str, password: str name=username, is_admin=True, password=password, - ssh_public_key=get_ssh_key.get_user_public_key(ssh_key, self.secrets_config), - expiry_time=datetime.now(datetime.timezone.utc) + datetime.timedelta(days=365), + ssh_public_key=get_ssh_key.get_user_public_key(ssh_key, self.secrets_configuration), + expiry_time=datetime.now(timezone.utc) + timedelta(days=365), ), ) def create_user_on_node(base_client, username, pool_id, node_id, ssh_key=None, password=None): try: - __create_user(base_client, pool_id=pool_id, node_id=node_id, username=username, ssh_key=ssh_key, password=password) + __create_user( + base_client, pool_id=pool_id, node_id=node_id, username=username, ssh_key=ssh_key, password=password) except batch_error.BatchErrorException as error: try: - base_client.__delete_user(pool_id, node_id, username) - base_client.__create_user(pool_id=pool_id, node_id=node_id, username=username, ssh_key=ssh_key) + base_client.delete_user_on_node(pool_id, node_id, username) + base_client.create_user_on_node(pool_id=pool_id, node_id=node_id, username=username, ssh_key=ssh_key) except batch_error.BatchErrorException as error: raise error diff --git a/aztk/client/base/helpers/delete_user_on_pool.py b/aztk/client/base/helpers/delete_user_on_pool.py index 75f41744..24ef4633 100644 --- a/aztk/client/base/helpers/delete_user_on_pool.py +++ b/aztk/client/base/helpers/delete_user_on_pool.py @@ -1,7 +1,7 @@ import concurrent.futures -def __delete_user_on_pool(base_client, username, pool_id, nodes): +def delete_user_on_pool(base_client, username, pool_id, nodes): #TODO: change from pool_id, nodes to cluster_id with concurrent.futures.ThreadPoolExecutor() as executor: - futures = [executor.submit(base_client.delete_user, pool_id, node.id, username) for node in nodes] + futures = [executor.submit(base_client.delete_user_on_node, pool_id, node.id, username) for node in nodes] concurrent.futures.wait(futures) diff --git a/aztk/client/base/helpers/generate_user_on_node.py b/aztk/client/base/helpers/generate_user_on_node.py index 494d4d31..8d1517c3 100644 --- a/aztk/client/base/helpers/generate_user_on_node.py +++ b/aztk/client/base/helpers/generate_user_on_node.py @@ -3,9 +3,9 @@ from aztk.utils import secure_utils -def __generate_user_on_node(base_client, pool_id, node_id): +def generate_user_on_node(base_client, pool_id, node_id): generated_username = secure_utils.generate_random_string() ssh_key = RSA.generate(2048) ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8') - base_client.__create_user_on_node(generated_username, pool_id, node_id, ssh_pub_key) + base_client.create_user_on_node(generated_username, pool_id, node_id, ssh_pub_key) return generated_username, ssh_key diff --git a/aztk/client/base/helpers/get_remote_login_settings.py b/aztk/client/base/helpers/get_remote_login_settings.py new file mode 100644 index 00000000..46888115 --- /dev/null +++ b/aztk/client/base/helpers/get_remote_login_settings.py @@ -0,0 +1,22 @@ +import azure.batch.models.batch_error as batch_error + +from aztk import error, models +from aztk.utils import helpers + + +def _get_remote_login_settings(base_client, pool_id: str, node_id: str): + """ + Get the remote_login_settings for node + :param pool_id + :param node_id + 
:returns aztk.models.RemoteLogin + """ + result = base_client.batch_client.compute_node.get_remote_login_settings(pool_id, node_id) + return models.RemoteLogin(ip_address=result.remote_login_ip_address, port=str(result.remote_login_port)) + + +def get_remote_login_settings(base_client, cluster_id: str, node_id: str): + try: + return _get_remote_login_settings(base_client, cluster_id, node_id) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/client/base/helpers/node_run.py b/aztk/client/base/helpers/node_run.py index 48252ce5..80003db6 100644 --- a/aztk/client/base/helpers/node_run.py +++ b/aztk/client/base/helpers/node_run.py @@ -4,7 +4,8 @@ def node_run(base_client, cluster_id, node_id, command, internal, container_name=None, timeout=None): - pool, nodes = base_client.get_pool_details(cluster_id) + cluster = base_client.get(cluster_id) + pool, nodes = cluster.pool, list(cluster.nodes) try: node = next(node for node in nodes if node.id == node_id) except StopIteration: @@ -26,4 +27,4 @@ def node_run(base_client, cluster_id, node_id, command, internal, container_name timeout=timeout) return output finally: - base_client.delete_user(cluster_id, node.id, generated_username) + base_client.delete_user_on_node(cluster_id, node.id, generated_username) diff --git a/aztk/client/base/helpers/cluster_run.py b/aztk/client/base/helpers/run.py similarity index 82% rename from aztk/client/base/helpers/cluster_run.py rename to aztk/client/base/helpers/run.py index 47954a79..9ab7c038 100644 --- a/aztk/client/base/helpers/cluster_run.py +++ b/aztk/client/base/helpers/run.py @@ -3,9 +3,9 @@ import aztk.models as models from aztk.utils import ssh as ssh_lib -def __cluster_run(base_client, cluster_id, command, internal, container_name=None, timeout=None): - pool, nodes = base_client.get_pool_details(cluster_id) - nodes = list(nodes) +def cluster_run(base_client, cluster_id, command, internal, container_name=None, timeout=None): + cluster = base_client.get(cluster_id) + pool, nodes = cluster.pool, list(cluster.nodes) if internal: cluster_nodes = [(node, models.RemoteLogin(ip_address=node.ip_address, port="22")) for node in nodes] else: @@ -27,4 +27,4 @@ def __cluster_run(base_client, cluster_id, command, internal, container_name=Non except OSError as exc: raise exc finally: - base_client.delete_user_on_pool(generated_username, pool.id, nodes) \ No newline at end of file + base_client.delete_user_on_pool(generated_username, pool.id, nodes) diff --git a/aztk/client/client.py b/aztk/client/client.py index 73fe093a..55b9a11b 100644 --- a/aztk/client/client.py +++ b/aztk/client/client.py @@ -1,6 +1,24 @@ -from aztk.client.cluster import Client as cluster_client -from aztk.client.job import Client as job_client +import aztk.models as models +from aztk.client.cluster import CoreClusterOperations +from aztk.client.job import CoreJobOperations +from aztk.utils import azure_api -class Client: - cluster = cluster_client - job = job_client + +class CoreClient: + def __init__(self, secrets_configuration: models.SecretsConfiguration): + context = self.get_context(secrets_configuration) + self.cluster = CoreClusterOperations(context) + self.job = CoreJobOperations(context) + + def get_context(self, secrets_configuration: models.SecretsConfiguration): + self.secrets_configuration = secrets_configuration + + azure_api.validate_secrets(secrets_configuration) + self.batch_client = azure_api.make_batch_client(secrets_configuration) + self.blob_client = 
azure_api.make_blob_client(secrets_configuration) + context = { + 'batch_client': self.batch_client, + 'blob_client': self.blob_client, + 'secrets_configuration': self.secrets_configuration, + } + return context diff --git a/aztk/client/cluster/__init__.py b/aztk/client/cluster/__init__.py index 3ff722bf..c596ce0f 100644 --- a/aztk/client/cluster/__init__.py +++ b/aztk/client/cluster/__init__.py @@ -1 +1 @@ -from .client import Client +from .operations import CoreClusterOperations diff --git a/aztk/client/cluster/helpers/copy.py b/aztk/client/cluster/helpers/copy.py index e1413ed9..11e741fa 100644 --- a/aztk/client/cluster/helpers/copy.py +++ b/aztk/client/cluster/helpers/copy.py @@ -7,8 +7,8 @@ def cluster_copy(cluster_client, cluster_id, source_path, destination_path=None, container_name=None, internal=False, get=False, timeout=None): - pool, nodes = cluster_client.__get_pool_details(cluster_id) - nodes = list(nodes) + cluster = cluster_client.get(cluster_id) + pool, nodes = cluster.pool, list(cluster.nodes) if internal: cluster_nodes = [(node, models.RemoteLogin(ip_address=node.ip_address, port="22")) for node in nodes] else: diff --git a/aztk/client/cluster/client.py b/aztk/client/cluster/operations.py similarity index 77% rename from aztk/client/cluster/client.py rename to aztk/client/cluster/operations.py index 45f69abe..87a2b9d6 100644 --- a/aztk/client/cluster/client.py +++ b/aztk/client/cluster/operations.py @@ -1,16 +1,16 @@ -from aztk.client.base import BaseClient +from aztk.client.base import BaseOperations from aztk.models import ClusterConfiguration from .helpers import copy, create, delete, get, list -class Client(BaseClient): +class CoreClusterOperations(BaseOperations): def create(self, cluster_configuration: ClusterConfiguration, software_metadata_key: str, start_task, vm_image_model): - create.create_pool_and_job(self, cluster_configuration, software_metadata_key, start_task, vm_image_model) + return create.create_pool_and_job(self, cluster_configuration, software_metadata_key, start_task, vm_image_model) def get(self, cluster_id: str): - get.get_pool_details(self, cluster_id) + return get.get_pool_details(self, cluster_id) def copy(self, cluster_id, diff --git a/aztk/client/job/__init__.py b/aztk/client/job/__init__.py index 3ff722bf..609cc5e5 100644 --- a/aztk/client/job/__init__.py +++ b/aztk/client/job/__init__.py @@ -1 +1 @@ -from .client import Client +from .operations import CoreJobOperations diff --git a/aztk/client/job/client.py b/aztk/client/job/client.py deleted file mode 100644 index d1c2fcd1..00000000 --- a/aztk/client/job/client.py +++ /dev/null @@ -1,10 +0,0 @@ -from aztk.client.base import BaseClient -from .helpers import ( - submit, ) - - -class Client(BaseClient): - def __submit(self, job_configuration, start_task, job_manager_task, autoscale_formula, software_metadata_key: str, - vm_image_model, application_metadata): - return submit.submit_job(self, job_configuration, start_task, job_manager_task, autoscale_formula, - software_metadata_key, vm_image_model, application_metadata) diff --git a/aztk/client/job/operations.py b/aztk/client/job/operations.py new file mode 100644 index 00000000..5af0eff3 --- /dev/null +++ b/aztk/client/job/operations.py @@ -0,0 +1,10 @@ +from aztk.client.base import BaseOperations +from .helpers import ( + submit, ) + + +class CoreJobOperations(BaseOperations): + def submit(self, job_configuration, start_task, job_manager_task, autoscale_formula, software_metadata_key: str, + vm_image_model, application_metadata): + return 
submit.submit_job(self, job_configuration, start_task, job_manager_task, autoscale_formula, + software_metadata_key, vm_image_model, application_metadata) diff --git a/aztk/spark/client.py b/aztk/spark/client.py index 830a7e50..cb706503 100644 --- a/aztk/spark/client.py +++ b/aztk/spark/client.py @@ -98,7 +98,7 @@ def list_clusters(self): except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) - def get_remote_login_settings(self, cluster_id: str, node_id: str): # NOT IMPLEMENTED + def get_remote_login_settings(self, cluster_id: str, node_id: str): try: return self.__get_remote_login_settings(cluster_id, node_id) except batch_error.BatchErrorException as e: diff --git a/aztk/spark/client/base/__init__.py b/aztk/spark/client/base/__init__.py index 62681658..d3eac8bb 100644 --- a/aztk/spark/client/base/__init__.py +++ b/aztk/spark/client/base/__init__.py @@ -1 +1 @@ -from .client import SparkBaseClient +from .operations import SparkBaseOperations diff --git a/aztk/spark/client/base/client.py b/aztk/spark/client/base/operations.py similarity index 92% rename from aztk/spark/client/base/client.py rename to aztk/spark/client/base/operations.py index 95d20f58..a31cc5e8 100644 --- a/aztk/spark/client/base/client.py +++ b/aztk/spark/client/base/operations.py @@ -2,13 +2,13 @@ import azure.batch.models as batch_models -from aztk.client.base import BaseClient as CoreBaseClient +from aztk.client.base import BaseOperations as CoreBaseOperations from aztk.spark import models from .helpers import generate_cluster_start_task, generate_application_task -class SparkBaseClient(CoreBaseClient): +class SparkBaseOperations(CoreBaseOperations): def __generate_cluster_start_task(self, zip_resource_file: batch_models.ResourceFile, cluster_id: str, diff --git a/aztk/spark/client/client.py b/aztk/spark/client/client.py index 8059ece2..7983c751 100644 --- a/aztk/spark/client/client.py +++ b/aztk/spark/client/client.py @@ -1,8 +1,26 @@ -from aztk.spark.client.cluster import Client as cluster_client -from aztk.spark.client.job import Client as job_client -from aztk.spark.client.base import SparkBaseClient +from aztk.client import CoreClient +from aztk.spark import models +from aztk.spark.client.cluster import ClusterOperations +from aztk.spark.client.job import JobOperations +from aztk.utils import azure_api -class Client(SparkBaseClient): - cluster = cluster_client - job = job_client + +class Client(CoreClient): + def __init__(self, secrets_configuration: models.SecretsConfiguration): + context = self.get_context(secrets_configuration) + self.cluster = ClusterOperations(context) + self.job = JobOperations(context) + + def get_context(self, secrets_configuration: models.SecretsConfiguration): + self.secrets_configuration = secrets_configuration + + azure_api.validate_secrets(secrets_configuration) + self.batch_client = azure_api.make_batch_client(secrets_configuration) + self.blob_client = azure_api.make_blob_client(secrets_configuration) + context = { + 'batch_client': self.batch_client, + 'blob_client': self.blob_client, + 'secrets_configuration': secrets_configuration, + } + return context diff --git a/aztk/spark/client/cluster/__init__.py b/aztk/spark/client/cluster/__init__.py index 3ff722bf..2d13856a 100644 --- a/aztk/spark/client/cluster/__init__.py +++ b/aztk/spark/client/cluster/__init__.py @@ -1 +1 @@ -from .client import Client +from .operations import ClusterOperations diff --git a/aztk/spark/client/cluster/helpers/create.py 
b/aztk/spark/client/cluster/helpers/create.py index df9ce399..8b785223 100644 --- a/aztk/spark/client/cluster/helpers/create.py +++ b/aztk/spark/client/cluster/helpers/create.py @@ -57,12 +57,12 @@ def create_cluster(spark_cluster_client, cluster_conf: models.ClusterConfigurati vm_image = models.VmImage(publisher='Canonical', offer='UbuntuServer', sku='16.04') - cluster = spark_cluster_client.create_pool_and_job(cluster_conf, software_metadata_key, start_task, vm_image) + cluster = super(type(spark_cluster_client), spark_cluster_client).create(cluster_conf, software_metadata_key, start_task, vm_image) # Wait for the master to be ready if wait: util.wait_for_master_to_be_ready(spark_cluster_client, cluster.id) - cluster = spark_cluster_client.get_cluster(cluster.id) + cluster = spark_cluster_client.get(cluster.id) return cluster diff --git a/aztk/spark/client/cluster/helpers/create_user.py b/aztk/spark/client/cluster/helpers/create_user.py index 3803eb07..9c078e8a 100644 --- a/aztk/spark/client/cluster/helpers/create_user.py +++ b/aztk/spark/client/cluster/helpers/create_user.py @@ -6,7 +6,7 @@ def create_user(spark_cluster_client, cluster_id: str, username: str, password: str = None, ssh_key: str = None) -> str: try: - cluster = spark_cluster_client.get_cluster(cluster_id) + cluster = spark_cluster_client.get(cluster_id) master_node_id = cluster.master_node_id if not master_node_id: raise error.ClusterNotReadyError("The master has not yet been picked, a user cannot be added.") diff --git a/aztk/spark/client/cluster/helpers/delete.py b/aztk/spark/client/cluster/helpers/delete.py index 10a9f263..fa2e1464 100644 --- a/aztk/spark/client/cluster/helpers/delete.py +++ b/aztk/spark/client/cluster/helpers/delete.py @@ -5,6 +5,6 @@ def delete_cluster(spark_cluster_client, cluster_id: str, keep_logs: bool = False): try: - return spark_cluster_client.delete_pool_and_job(cluster_id, keep_logs) + return super(type(spark_cluster_client), spark_cluster_client).delete(cluster_id, keep_logs) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/get.py b/aztk/spark/client/cluster/helpers/get.py index b63fb54b..53769f76 100644 --- a/aztk/spark/client/cluster/helpers/get.py +++ b/aztk/spark/client/cluster/helpers/get.py @@ -5,9 +5,9 @@ from aztk.spark import helpers -def get_cluster(spark_cluster_client, cluster_id: str): +def get_cluster(spark_cluster_client , cluster_id: str): try: - pool, nodes = spark_cluster_client.get_pool_details(cluster_id) + pool, nodes = super(type(spark_cluster_client), spark_cluster_client).get(cluster_id) return models.Cluster(pool, nodes) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/list.py b/aztk/spark/client/cluster/helpers/list.py index 5dd661ce..ba7a83c9 100644 --- a/aztk/spark/client/cluster/helpers/list.py +++ b/aztk/spark/client/cluster/helpers/list.py @@ -8,6 +8,7 @@ def list_clusters(spark_cluster_client): try: - return [models.Cluster(pool) for pool in spark_cluster_client.list(aztk.models.Software.spark)] + software_metadata_key = aztk.models.Software.spark + return [models.Cluster(pool) for pool in super(type(spark_cluster_client), spark_cluster_client).list(software_metadata_key)] except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/node_run.py 
b/aztk/spark/client/cluster/helpers/node_run.py index 89a200f4..d92891f1 100644 --- a/aztk/spark/client/cluster/helpers/node_run.py +++ b/aztk/spark/client/cluster/helpers/node_run.py @@ -12,7 +12,7 @@ def node_run(spark_cluster_client, internal: bool = False, timeout=None): try: - return spark_cluster_client.__node_run( + return super(type(spark_cluster_client), spark_cluster_client).node_run( cluster_id, node_id, command, internal, container_name='spark' if not host else None, timeout=timeout) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/run.py b/aztk/spark/client/cluster/helpers/run.py index c4eaf3a5..a89967ef 100644 --- a/aztk/spark/client/cluster/helpers/run.py +++ b/aztk/spark/client/cluster/helpers/run.py @@ -6,7 +6,7 @@ def cluster_run(spark_cluster_client, cluster_id: str, command: str, host=False, internal: bool = False, timeout=None): try: - return spark_cluster_client.cluster_run( + return super(type(spark_cluster_client), spark_cluster_client).run( cluster_id, command, internal, container_name='spark' if not host else None, timeout=timeout) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/client.py b/aztk/spark/client/cluster/operations.py similarity index 83% rename from aztk/spark/client/cluster/client.py rename to aztk/spark/client/cluster/operations.py index 965c009e..b065393a 100644 --- a/aztk/spark/client/cluster/client.py +++ b/aztk/spark/client/cluster/operations.py @@ -1,11 +1,12 @@ from aztk.spark import models +from aztk.spark.client.base import SparkBaseOperations +from aztk.client.cluster import CoreClusterOperations -from .helpers import (copy, create, create_user, delete, get, - get_application_log, get_application_status, list, +from .helpers import (copy, create, create_user, delete, get, get_application_log, get_application_status, list, node_run, run, submit) -class Client: +class ClusterOperations(CoreClusterOperations, SparkBaseOperations): def create(self, cluster_configuration: models.ClusterConfiguration, wait: bool = False): return create.create_cluster(self, cluster_configuration, wait) @@ -38,7 +39,7 @@ def run(self, cluster_id: str, command: str, host=False, internal: bool = False, return run.cluster_run(self, cluster_id, command, host, internal, timeout) def node_run(self, cluster_id: str, node_id: str, command: str, host=False, internal: bool = False, timeout=None): - return node_run.node_run(cluster_id, node_id, command, host, internal, timeout) + return node_run.node_run(self, cluster_id, node_id, command, host, internal, timeout) def copy(self, cluster_id: str, diff --git a/aztk/spark/client/job/__init__.py b/aztk/spark/client/job/__init__.py index 3ff722bf..00e76137 100644 --- a/aztk/spark/client/job/__init__.py +++ b/aztk/spark/client/job/__init__.py @@ -1 +1 @@ -from .client import Client +from .operations import JobOperations diff --git a/aztk/spark/client/job/client.py b/aztk/spark/client/job/operations.py similarity index 85% rename from aztk/spark/client/job/client.py rename to aztk/spark/client/job/operations.py index a11010f8..e0841505 100644 --- a/aztk/spark/client/job/client.py +++ b/aztk/spark/client/job/operations.py @@ -1,10 +1,10 @@ from aztk.spark import models +from aztk.spark.client.base import SparkBaseOperations -from .helpers import (delete, get, get_application, get_application_log, list, - list_applications, stop, submit) +from 
.helpers import (delete, get, get_application, get_application_log, list, list_applications, stop, submit) -class Client: +class JobOperations(SparkBaseOperations): def list(self): return list.list_jobs(self) From f1faed9a7471c26ecf22aadf34508baa549621dd Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Tue, 26 Jun 2018 17:26:08 -0700 Subject: [PATCH 06/52] refactor integration test secrets management --- .../spark/sdk/cluster/test_cluster.py | 139 ++++++------------ .../integration_tests/spark/sdk/get_client.py | 47 ++++++ .../spark/sdk/job/test_job.py | 36 +---- 3 files changed, 99 insertions(+), 123 deletions(-) create mode 100644 tests/integration_tests/spark/sdk/get_client.py diff --git a/tests/integration_tests/spark/sdk/cluster/test_cluster.py b/tests/integration_tests/spark/sdk/cluster/test_cluster.py index 13b455fc..adff6e8e 100644 --- a/tests/integration_tests/spark/sdk/cluster/test_cluster.py +++ b/tests/integration_tests/spark/sdk/cluster/test_cluster.py @@ -1,53 +1,22 @@ -import subprocess import os +import subprocess import time from datetime import datetime from zipfile import ZipFile import azure.batch.models as batch_models +import pytest from azure.batch.models import BatchErrorException import aztk.spark -import pytest -from aztk.utils import constants from aztk.error import AztkError +from aztk.utils import constants from aztk_cli import config +from tests.integration_tests.spark.sdk.get_client import get_spark_client, get_test_suffix -# base cluster name -dt = datetime.now() -current_time = dt.microsecond -base_cluster_id = "cluster-{}".format(current_time) - -# load secrets -# note: this assumes secrets are set up in .aztk/secrets -tenant_id = os.environ.get("TENANT_ID") -client_id = os.environ.get("CLIENT_ID") -credential = os.environ.get("CREDENTIAL") -batch_account_resource_id = os.environ.get("BATCH_ACCOUNT_RESOURCE_ID") -storage_account_resource_id = os.environ.get("STORAGE_ACCOUNT_RESOURCE_ID") -ssh_pub_key = os.environ.get("ID_RSA_PUB") -ssh_priv_key = os.environ.get("ID_RSA") -keys = [tenant_id, client_id, credential, batch_account_resource_id, - storage_account_resource_id, ssh_priv_key, ssh_pub_key] - -if all(keys): - spark_client = aztk.spark.Client( - aztk.spark.models.SecretsConfiguration( - service_principal=aztk.spark.models.ServicePrincipalConfiguration( - tenant_id=tenant_id, - client_id=client_id, - credential=credential, - batch_account_resource_id=batch_account_resource_id, - storage_account_resource_id=storage_account_resource_id - ), - ssh_pub_key=ssh_pub_key, - ssh_priv_key=ssh_priv_key - ) - ) -else: - # fallback to local secrets if environment variables don't exist - spark_client = aztk.spark.Client(config.load_aztk_secrets()) +base_cluster_id = get_test_suffix("cluster") +spark_client = get_spark_client() def clean_up_cluster(cluster_id): @@ -87,7 +56,7 @@ def ensure_spark_processes(cluster_id): def wait_for_all_nodes(cluster_id, nodes): while True: for node in nodes: - if node.state not in [batch_models.ComputeNodeState.idle, batch_models.ComputeNodeState.running]: + if node.state not in [batch_models.ComputeNodeState.idle, batch_models.ComputeNodeState.running]: break else: nodes = spark_client.get_cluster(cluster_id).nodes @@ -99,7 +68,7 @@ def test_create_cluster(): test_id = "test-create-" # TODO: make Cluster Configuration more robust, test each value cluster_configuration = aztk.spark.models.ClusterConfiguration( - cluster_id=test_id+base_cluster_id, + cluster_id=test_id + base_cluster_id, vm_count=2, vm_low_pri_count=0, 
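# note: vm_count / vm_low_pri_count are the dedicated and low-priority node counts;
# the debug-tool test further down already uses the newer size / size_low_priority names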
vm_size="standard_f2", @@ -107,8 +76,7 @@ def test_create_cluster(): custom_scripts=None, file_shares=None, toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), - spark_configuration=None - ) + spark_configuration=None) try: cluster = spark_client.create_cluster(cluster_configuration, wait=True) @@ -127,10 +95,11 @@ def test_create_cluster(): finally: clean_up_cluster(cluster_configuration.cluster_id) + def test_get_cluster(): test_id = "test-get-" cluster_configuration = aztk.spark.models.ClusterConfiguration( - cluster_id=test_id+base_cluster_id, + cluster_id=test_id + base_cluster_id, vm_count=2, vm_low_pri_count=0, vm_size="standard_f2", @@ -138,8 +107,7 @@ def test_get_cluster(): custom_scripts=None, file_shares=None, toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), - spark_configuration=None - ) + spark_configuration=None) try: spark_client.create_cluster(cluster_configuration, wait=True) cluster = spark_client.get_cluster(cluster_id=cluster_configuration.cluster_id) @@ -163,7 +131,7 @@ def test_get_cluster(): def test_list_clusters(): test_id = "test-list-" cluster_configuration = aztk.spark.models.ClusterConfiguration( - cluster_id=test_id+base_cluster_id, + cluster_id=test_id + base_cluster_id, vm_count=2, vm_low_pri_count=0, vm_size="standard_f2", @@ -171,8 +139,7 @@ def test_list_clusters(): custom_scripts=None, file_shares=None, toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), - spark_configuration=None - ) + spark_configuration=None) try: spark_client.create_cluster(cluster_configuration, wait=True) clusters = spark_client.list_clusters() @@ -189,7 +156,7 @@ def test_list_clusters(): def test_get_remote_login_settings(): test_id = "test-get-remote-login-" cluster_configuration = aztk.spark.models.ClusterConfiguration( - cluster_id=test_id+base_cluster_id, + cluster_id=test_id + base_cluster_id, vm_count=2, vm_low_pri_count=0, vm_size="standard_f2", @@ -197,8 +164,7 @@ def test_get_remote_login_settings(): custom_scripts=None, file_shares=None, toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), - spark_configuration=None - ) + spark_configuration=None) try: spark_client.create_cluster(cluster_configuration, wait=True) cluster = spark_client.get_cluster(cluster_id=cluster_configuration.cluster_id) @@ -218,7 +184,7 @@ def test_get_remote_login_settings(): def test_submit(): test_id = "test-submit-" cluster_configuration = aztk.spark.models.ClusterConfiguration( - cluster_id=test_id+base_cluster_id, + cluster_id=test_id + base_cluster_id, vm_count=2, vm_low_pri_count=0, vm_size="standard_f2", @@ -226,8 +192,7 @@ def test_submit(): custom_scripts=None, file_shares=None, toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), - spark_configuration=None - ) + spark_configuration=None) application_configuration = aztk.spark.models.ApplicationConfiguration( name="pipy100", application="./examples/src/main/python/pi.py", @@ -242,12 +207,12 @@ def test_submit(): driver_cores=None, executor_memory=None, executor_cores=None, - max_retry_count=None - ) + max_retry_count=None) try: spark_client.create_cluster(cluster_configuration, wait=True) - spark_client.submit(cluster_id=cluster_configuration.cluster_id, application=application_configuration, wait=True) + spark_client.submit( + cluster_id=cluster_configuration.cluster_id, application=application_configuration, wait=True) assert True except (AztkError, BatchErrorException): @@ -260,7 +225,7 @@ def test_submit(): def test_get_application_log(): test_id = "test-get-app-log-" cluster_configuration = 
aztk.spark.models.ClusterConfiguration( - cluster_id=test_id+base_cluster_id, + cluster_id=test_id + base_cluster_id, vm_count=2, vm_low_pri_count=0, vm_size="standard_f2", @@ -268,8 +233,7 @@ def test_get_application_log(): custom_scripts=None, file_shares=None, toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), - spark_configuration=None - ) + spark_configuration=None) application_configuration = aztk.spark.models.ApplicationConfiguration( name="pipy100", application="./examples/src/main/python/pi.py", @@ -284,16 +248,17 @@ def test_get_application_log(): driver_cores=None, executor_memory=None, executor_cores=None, - max_retry_count=None - ) + max_retry_count=None) try: spark_client.create_cluster(cluster_configuration, wait=True) - spark_client.submit(cluster_id=cluster_configuration.cluster_id, application=application_configuration, wait=True) - application_log = spark_client.get_application_log(cluster_id=cluster_configuration.cluster_id, - application_name=application_configuration.name, - tail=False, - current_bytes=0) + spark_client.submit( + cluster_id=cluster_configuration.cluster_id, application=application_configuration, wait=True) + application_log = spark_client.get_application_log( + cluster_id=cluster_configuration.cluster_id, + application_name=application_configuration.name, + tail=False, + current_bytes=0) assert application_log.exit_code == 0 assert application_log.name == application_configuration.name == "pipy100" @@ -321,7 +286,7 @@ def test_create_user_ssh_key(): def test_get_application_status_complete(): test_id = "test-app-status-complete-" cluster_configuration = aztk.spark.models.ClusterConfiguration( - cluster_id=test_id+base_cluster_id, + cluster_id=test_id + base_cluster_id, vm_count=2, vm_low_pri_count=0, vm_size="standard_f2", @@ -329,8 +294,7 @@ def test_get_application_status_complete(): custom_scripts=None, file_shares=None, toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), - spark_configuration=None - ) + spark_configuration=None) application_configuration = aztk.spark.models.ApplicationConfiguration( name="pipy100", application="./examples/src/main/python/pi.py", @@ -345,13 +309,14 @@ def test_get_application_status_complete(): driver_cores=None, executor_memory=None, executor_cores=None, - max_retry_count=None - ) + max_retry_count=None) try: spark_client.create_cluster(cluster_configuration, wait=True) - spark_client.submit(cluster_id=cluster_configuration.cluster_id, application=application_configuration, wait=True) - status = spark_client.get_application_status(cluster_id=cluster_configuration.cluster_id, app_name=application_configuration.name) + spark_client.submit( + cluster_id=cluster_configuration.cluster_id, application=application_configuration, wait=True) + status = spark_client.get_application_status( + cluster_id=cluster_configuration.cluster_id, app_name=application_configuration.name) assert status == "completed" @@ -365,7 +330,7 @@ def test_get_application_status_complete(): def test_delete_cluster(): test_id = "test-delete-" cluster_configuration = aztk.spark.models.ClusterConfiguration( - cluster_id=test_id+base_cluster_id, + cluster_id=test_id + base_cluster_id, vm_count=2, vm_low_pri_count=0, vm_size="standard_f2", @@ -373,8 +338,7 @@ def test_delete_cluster(): custom_scripts=None, file_shares=None, toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), - spark_configuration=None - ) + spark_configuration=None) try: spark_client.create_cluster(cluster_configuration, wait=True) @@ -388,10 +352,11 @@ def 
test_delete_cluster(): finally: clean_up_cluster(cluster_configuration.cluster_id) + def test_spark_processes_up(): test_id = "test-spark-processes-up-" cluster_configuration = aztk.spark.models.ClusterConfiguration( - cluster_id=test_id+base_cluster_id, + cluster_id=test_id + base_cluster_id, vm_count=2, vm_low_pri_count=0, vm_size="standard_f2", @@ -399,8 +364,7 @@ def test_spark_processes_up(): custom_scripts=None, file_shares=None, toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), - spark_configuration=None - ) + spark_configuration=None) try: cluster = spark_client.create_cluster(cluster_configuration, wait=True) @@ -419,7 +383,7 @@ def test_spark_processes_up(): def test_debug_tool(): test_id = "debug-tool-" cluster_configuration = aztk.spark.models.ClusterConfiguration( - cluster_id=test_id+base_cluster_id, + cluster_id=test_id + base_cluster_id, size=2, size_low_priority=0, vm_size="standard_f2", @@ -427,17 +391,10 @@ def test_debug_tool(): custom_scripts=None, file_shares=None, toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), - spark_configuration=None - ) + spark_configuration=None) expected_members = [ - "df.txt", - "hostname.txt", - "docker-images.txt", - "docker-containers.txt", - "spark/docker.log", - "spark/ps_aux.txt", - "spark/logs", - "spark/wd" + "df.txt", "hostname.txt", "docker-images.txt", "docker-containers.txt", "spark/docker.log", "spark/ps_aux.txt", + "spark/logs", "spark/wd" ] try: cluster = spark_client.create_cluster(cluster_configuration, wait=True) @@ -445,7 +402,7 @@ def test_debug_tool(): wait_for_all_nodes(cluster.id, nodes) cluster_output = spark_client.run_cluster_diagnostics(cluster_id=cluster.id) for node_output in cluster_output: - node_output.output.seek(0) # tempfile requires seek 0 before reading + node_output.output.seek(0) # tempfile requires seek 0 before reading debug_zip = ZipFile(node_output.output) assert node_output.id in [node.id for node in nodes] assert node_output.error is None diff --git a/tests/integration_tests/spark/sdk/get_client.py b/tests/integration_tests/spark/sdk/get_client.py new file mode 100644 index 00000000..1559e638 --- /dev/null +++ b/tests/integration_tests/spark/sdk/get_client.py @@ -0,0 +1,47 @@ +import os +from datetime import datetime + +import aztk.spark +from aztk_cli import config + + +def get_spark_client(): + # load secrets + # note: this assumes secrets are set up in .aztk/secrets + tenant_id = os.environ.get("TENANT_ID") + client_id = os.environ.get("CLIENT_ID") + credential = os.environ.get("CREDENTIAL") + batch_account_resource_id = os.environ.get("BATCH_ACCOUNT_RESOURCE_ID") + storage_account_resource_id = os.environ.get("STORAGE_ACCOUNT_RESOURCE_ID") + ssh_pub_key = os.environ.get("ID_RSA_PUB") + ssh_private_key = os.environ.get("ID_RSA") + keys = [ + tenant_id, client_id, credential, batch_account_resource_id, storage_account_resource_id, ssh_private_key, + ssh_pub_key + ] + + spark_client = None + if all(keys): + spark_client = aztk.spark.Client( + aztk.spark.models.SecretsConfiguration( + service_principal=aztk.spark.models.ServicePrincipalConfiguration( + tenant_id=tenant_id, + client_id=client_id, + credential=credential, + batch_account_resource_id=batch_account_resource_id, + storage_account_resource_id=storage_account_resource_id), + ssh_pub_key=ssh_pub_key, + ssh_priv_key=ssh_private_key)) + else: + # fallback to local secrets if environment variables don't exist + spark_client = aztk.spark.Client(config.load_aztk_secrets()) + + return spark_client + + +def get_test_suffix(prefix: 
str): + # base cluster name + dt = datetime.now() + current_time = dt.microsecond + base_cluster_id = "{0}-{1}".format(prefix, current_time) + return base_cluster_id diff --git a/tests/integration_tests/spark/sdk/job/test_job.py b/tests/integration_tests/spark/sdk/job/test_job.py index dba40771..1303c221 100644 --- a/tests/integration_tests/spark/sdk/job/test_job.py +++ b/tests/integration_tests/spark/sdk/job/test_job.py @@ -7,39 +7,11 @@ import aztk.spark from aztk.error import AztkError from aztk_cli import config +from tests.integration_tests.spark.sdk.get_client import get_spark_client, get_test_suffix -dt = datetime.now() -time = dt.microsecond -base_job_id = "job-{}".format(time) - - -# load secrets -# note: this assumes secrets are set up in .aztk/secrets -tenant_id = os.environ.get("TENANT_ID") -client_id = os.environ.get("CLIENT_ID") -credential = os.environ.get("CREDENTIAL") -batch_account_resource_id = os.environ.get("BATCH_ACCOUNT_RESOURCE_ID") -storage_account_resource_id = os.environ.get("STORAGE_ACCOUNT_RESOURCE_ID") -ssh_pub_key = os.environ.get("ID_RSA_PUB") -ssh_priv_key = os.environ.get("ID_RSA") -keys = [tenant_id, client_id, credential, batch_account_resource_id, - storage_account_resource_id, ssh_priv_key, ssh_pub_key] - -if all(keys): - spark_client = aztk.spark.Client( - aztk.spark.models.SecretsConfiguration( - service_principal=aztk.spark.models.ServicePrincipalConfiguration( - tenant_id=tenant_id, - client_id=client_id, - credential=credential, - batch_account_resource_id=batch_account_resource_id, - storage_account_resource_id=storage_account_resource_id - ) - ) - ) -else: - # fallback to local secrets if environment variables don't exist - spark_client = aztk.spark.Client(config.load_aztk_secrets()) + +base_job_id = get_test_suffix("job") +spark_client = get_spark_client() def test_submit_job(): From 202591d0552b1a1b8ac2fd252d3a774beec768ad Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Tue, 26 Jun 2018 17:48:28 -0700 Subject: [PATCH 07/52] fix cluster create, add new test --- .../helpers/generate_cluster_start_task.py | 6 +- aztk/spark/client/base/operations.py | 20 ++-- aztk/spark/client/cluster/helpers/create.py | 4 +- aztk/spark/utils/util.py | 2 +- .../spark/sdk/cluster/test_cluster_new.py | 95 +++++++++++++++++++ 5 files changed, 111 insertions(+), 16 deletions(-) create mode 100644 tests/integration_tests/spark/sdk/cluster/test_cluster_new.py diff --git a/aztk/spark/client/base/helpers/generate_cluster_start_task.py b/aztk/spark/client/base/helpers/generate_cluster_start_task.py index 3dc0c742..6c9e1a8f 100644 --- a/aztk/spark/client/base/helpers/generate_cluster_start_task.py +++ b/aztk/spark/client/base/helpers/generate_cluster_start_task.py @@ -25,7 +25,7 @@ def _get_aztk_environment(cluster_id, worker_on_master, mixed_mode): def __get_docker_credentials(spark_client): creds = [] - docker = spark_client.secrets_config.docker + docker = spark_client.secrets_configuration.docker if docker: if docker.endpoint: creds.append(batch_models.EnvironmentSetting(name="DOCKER_ENDPOINT", value=docker.endpoint)) @@ -38,8 +38,8 @@ def __get_docker_credentials(spark_client): def __get_secrets_env(spark_client): - shared_key = spark_client.secrets_config.shared_key - service_principal = spark_client.secrets_config.service_principal + shared_key = spark_client.secrets_configuration.shared_key + service_principal = spark_client.secrets_configuration.service_principal if shared_key: return [ batch_models.EnvironmentSetting(name="BATCH_SERVICE_URL", 
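# with shared-key auth, each connection value is handed to the node as a start-task environment variable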
value=shared_key.batch_service_url), diff --git a/aztk/spark/client/base/operations.py b/aztk/spark/client/base/operations.py index a31cc5e8..31894792 100644 --- a/aztk/spark/client/base/operations.py +++ b/aztk/spark/client/base/operations.py @@ -9,18 +9,18 @@ class SparkBaseOperations(CoreBaseOperations): - def __generate_cluster_start_task(self, - zip_resource_file: batch_models.ResourceFile, - cluster_id: str, - gpu_enabled: bool, - docker_repo: str = None, - file_shares: List[models.FileShare] = None, - plugins: List[models.PluginConfiguration] = None, - mixed_mode: bool = False, - worker_on_master: bool = True): + def generate_cluster_start_task(self, + zip_resource_file: batch_models.ResourceFile, + cluster_id: str, + gpu_enabled: bool, + docker_repo: str = None, + file_shares: List[models.FileShare] = None, + plugins: List[models.PluginConfiguration] = None, + mixed_mode: bool = False, + worker_on_master: bool = True): return generate_cluster_start_task.generate_cluster_start_task(self, zip_resource_file, cluster_id, gpu_enabled, docker_repo, file_shares, plugins, mixed_mode, worker_on_master) - def __generate_application_task(self, container_id, application, remote=False): + def generate_application_task(self, container_id, application, remote=False): return generate_application_task.generate_application_task(self, container_id, application, remote) diff --git a/aztk/spark/client/cluster/helpers/create.py b/aztk/spark/client/cluster/helpers/create.py index 8b785223..c0887035 100644 --- a/aztk/spark/client/cluster/helpers/create.py +++ b/aztk/spark/client/cluster/helpers/create.py @@ -42,13 +42,13 @@ def create_cluster(spark_cluster_client, cluster_conf: models.ClusterConfigurati cluster_conf = _apply_default_for_cluster_config(cluster_conf) cluster_conf.validate() - cluster_data = spark_cluster_client._get_cluster_data(cluster_conf.cluster_id) + cluster_data = spark_cluster_client.get_cluster_data(cluster_conf.cluster_id) try: zip_resource_files = None node_data = NodeData(cluster_conf).add_core().done() zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file() - start_task = spark_cluster_client.__generate_cluster_start_task(spark_cluster_client, zip_resource_files, cluster_conf.cluster_id, + start_task = spark_cluster_client.generate_cluster_start_task(zip_resource_files, cluster_conf.cluster_id, cluster_conf.gpu_enabled(), cluster_conf.get_docker_repo(), cluster_conf.file_shares, cluster_conf.plugins, cluster_conf.mixed_mode(), cluster_conf.worker_on_master) diff --git a/aztk/spark/utils/util.py b/aztk/spark/utils/util.py index 3ff722cb..17ef6173 100644 --- a/aztk/spark/utils/util.py +++ b/aztk/spark/utils/util.py @@ -23,7 +23,7 @@ def wait_for_master_to_be_ready(client, cluster_id: str): start_time = datetime.datetime.now() while True: if not master_node_id: - master_node_id = client.get_cluster(cluster_id).master_node_id + master_node_id = client.get(cluster_id).master_node_id if not master_node_id: time.sleep(5) continue diff --git a/tests/integration_tests/spark/sdk/cluster/test_cluster_new.py b/tests/integration_tests/spark/sdk/cluster/test_cluster_new.py new file mode 100644 index 00000000..7e2b6293 --- /dev/null +++ b/tests/integration_tests/spark/sdk/cluster/test_cluster_new.py @@ -0,0 +1,95 @@ +import os +import subprocess +import time +from datetime import datetime +from zipfile import ZipFile + +import azure.batch.models as batch_models +import pytest +from azure.batch.models import BatchErrorException + +import aztk.spark +from aztk.error 
import AztkError +from aztk.utils import constants +from aztk_cli import config +from tests.integration_tests.spark.sdk.get_client import get_spark_client, get_test_suffix + +base_cluster_id = get_test_suffix("cluster") +spark_client = get_spark_client() + + +def clean_up_cluster(cluster_id): + try: + spark_client.cluster.delete(cluster_id=cluster_id) + except (BatchErrorException, AztkError): + # pass in the event that the cluster does not exist + pass + + +def ensure_spark_master(cluster_id): + results = spark_client.cluster.run(cluster_id, + "if $AZTK_IS_MASTER ; then $SPARK_HOME/sbin/spark-daemon.sh status org.apache.spark.deploy.master.Master 1 ;" \ + " else echo AZTK_IS_MASTER is false ; fi") + for _, result in results: + if isinstance(result, Exception): + raise result + print(result[0]) + assert result[0] in ["org.apache.spark.deploy.master.Master is running.", "AZTK_IS_MASTER is false"] + + +def ensure_spark_worker(cluster_id): + results = spark_client.cluster.run(cluster_id, + "if $AZTK_IS_WORKER ; then $SPARK_HOME/sbin/spark-daemon.sh status org.apache.spark.deploy.worker.Worker 1 ;" \ + " else echo AZTK_IS_WORKER is false ; fi") + for _, result in results: + if isinstance(result, Exception): + raise result + assert result[0] in ["org.apache.spark.deploy.worker.Worker is running.", "AZTK_IS_WORKER is false"] + + +def ensure_spark_processes(cluster_id): + ensure_spark_master(cluster_id) + ensure_spark_worker(cluster_id) + + +def wait_for_all_nodes(cluster_id, nodes): + while True: + for node in nodes: + if node.state not in [batch_models.ComputeNodeState.idle, batch_models.ComputeNodeState.running]: + break + else: + nodes = spark_client.cluster.get(cluster_id).nodes + continue + break + + +def test_create_cluster(): + test_id = "test-create-" + # TODO: make Cluster Configuration more robust, test each value + cluster_configuration = aztk.spark.models.ClusterConfiguration( + cluster_id=test_id + base_cluster_id, + vm_count=2, + vm_low_pri_count=0, + vm_size="standard_f2", + subnet_id=None, + custom_scripts=None, + file_shares=None, + toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), + spark_configuration=None) + try: + cluster = spark_client.cluster.create(cluster_configuration, wait=True) + + assert cluster.pool is not None + assert cluster.nodes is not None + assert cluster.id == cluster_configuration.cluster_id + assert cluster.vm_size == "standard_f2" + assert cluster.current_dedicated_nodes == 2 + assert cluster.gpu_enabled is False + assert cluster.master_node_id is not None + assert cluster.current_low_pri_nodes == 0 + + except (AztkError, BatchErrorException) as e: + assert False + + finally: + clean_up_cluster(cluster_configuration.cluster_id) From 4fa150ea0d8d6d933b6663740d7a28565d5547d2 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Wed, 27 Jun 2018 12:01:31 -0700 Subject: [PATCH 08/52] add tests for new sdk api and fix bugs --- aztk/spark/client/cluster/helpers/delete.py | 5 +- .../client/cluster/helpers/diagnostics.py | 44 +++ aztk/spark/client/cluster/helpers/submit.py | 39 +-- aztk/spark/client/cluster/operations.py | 5 +- aztk/spark/helpers/__init__.py | 2 + .../spark/sdk/cluster/test_cluster_new.py | 286 ++++++++++++++++++ .../spark/sdk/job/test_job_new.py | 275 +++++++++++++++++ 7 files changed, 628 insertions(+), 28 deletions(-) create mode 100644 aztk/spark/client/cluster/helpers/diagnostics.py create mode 100644 tests/integration_tests/spark/sdk/job/test_job_new.py diff --git a/aztk/spark/client/cluster/helpers/delete.py 
b/aztk/spark/client/cluster/helpers/delete.py index fa2e1464..5daa0455 100644 --- a/aztk/spark/client/cluster/helpers/delete.py +++ b/aztk/spark/client/cluster/helpers/delete.py @@ -1,6 +1,7 @@ -from aztk import error import azure.batch.models.batch_error as batch_error -from aztk.spark import helpers + +from aztk import error +from aztk.utils import helpers def delete_cluster(spark_cluster_client, cluster_id: str, keep_logs: bool = False): diff --git a/aztk/spark/client/cluster/helpers/diagnostics.py b/aztk/spark/client/cluster/helpers/diagnostics.py new file mode 100644 index 00000000..de26b06d --- /dev/null +++ b/aztk/spark/client/cluster/helpers/diagnostics.py @@ -0,0 +1,44 @@ + + + +import os + +from azure.batch.models import batch_error + +from aztk import error +from aztk.utils import helpers + + +def _run(spark_cluster_operations, cluster_id, output_directory=None): + # copy debug program to each node + output = spark_cluster_operations.copy(cluster_id, os.path.abspath("./aztk/spark/utils/debug.py"), "/tmp/debug.py", host=True) + ssh_cmd = _build_diagnostic_ssh_command() + run_output = spark_cluster_operations.run(cluster_id, ssh_cmd, host=True) + remote_path = "/tmp/debug.zip" + if output_directory: + local_path = os.path.join(os.path.abspath(output_directory), "debug.zip") + output = spark_cluster_operations.download(cluster_id, remote_path, local_path, host=True) + + # write run output to debug/ directory + with open(os.path.join(os.path.dirname(local_path), "debug-output.txt"), 'w', encoding="UTF-8") as f: + [f.write(line + '\n') for node_output in run_output for line in node_output.output] + else: + output = spark_cluster_operations.download(cluster_id, remote_path, host=True) + + return output + + +def _build_diagnostic_ssh_command(): + return "sudo rm -rf /tmp/debug.zip; "\ + "sudo apt-get install -y python3-pip; "\ + "sudo -H pip3 install --upgrade pip; "\ + "sudo -H pip3 install docker; "\ + "sudo python3 /tmp/debug.py" + + +def run_cluster_diagnostics(spark_cluster_operations, cluster_id, output_directory=None): + try: + output = _run(spark_cluster_operations, cluster_id, output_directory) + return output + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/submit.py b/aztk/spark/client/cluster/helpers/submit.py index ac164f45..93d50af0 100644 --- a/aztk/spark/client/cluster/helpers/submit.py +++ b/aztk/spark/client/cluster/helpers/submit.py @@ -1,57 +1,46 @@ -import datetime -import os -from typing import List - import azure.batch.models as batch_models -import yaml +import azure.batch.models.batch_error as batch_error +from aztk import error from aztk.error import AztkError from aztk.spark import models from aztk.utils import helpers -from aztk.utils.command_builder import CommandBuilder -import azure.batch.models.batch_error as batch_error -from aztk import error - -''' -Submit helper methods -''' - -def __get_node(spark_client, node_id: str, cluster_id: str) -> batch_models.ComputeNode: - return spark_client.batch_client.compute_node.get(cluster_id, node_id) +def __get_node(spark_cluster_operations, node_id: str, cluster_id: str) -> batch_models.ComputeNode: + return spark_cluster_operations.batch_client.compute_node.get(cluster_id, node_id) -def affinitize_task_to_master(spark_client, cluster_id, task): - cluster = spark_client.get_cluster(cluster_id) +def affinitize_task_to_master(spark_cluster_operations, cluster_id, task): + cluster = 
spark_cluster_operations.get(cluster_id) if cluster.master_node_id is None: raise AztkError("Master has not yet been selected. Please wait until the cluster is finished provisioning.") - master_node = spark_client.batch_client.compute_node.get(pool_id=cluster_id, node_id=cluster.master_node_id) + master_node = spark_cluster_operations.batch_client.compute_node.get(pool_id=cluster_id, node_id=cluster.master_node_id) task.affinity_info = batch_models.AffinityInformation(affinity_id=master_node.affinity_id) return task -def submit_application(spark_client, cluster_id, application, remote: bool = False, wait: bool = False): +def submit_application(spark_cluster_operations, cluster_id, application, remote: bool = False, wait: bool = False): """ Submit a spark app """ - task = spark_client.generate_application_task(spark_client, cluster_id, application, remote) - task = affinitize_task_to_master(spark_client, cluster_id, task) + task = spark_cluster_operations.generate_application_task(cluster_id, application, remote) + task = affinitize_task_to_master(spark_cluster_operations, cluster_id, task) # Add task to batch job (which has the same name as cluster_id) job_id = cluster_id - spark_client.batch_client.task.add(job_id=job_id, task=task) + spark_cluster_operations.batch_client.task.add(job_id=job_id, task=task) if wait: - helpers.wait_for_task_to_complete(job_id=job_id, task_id=task.id, batch_client=spark_client.batch_client) + helpers.wait_for_task_to_complete(job_id=job_id, task_id=task.id, batch_client=spark_cluster_operations.batch_client) -def submit(spark_cluster_client, +def submit(spark_cluster_operations, cluster_id: str, application: models.ApplicationConfiguration, remote: bool = False, wait: bool = False): try: - submit_application(spark_cluster_client, cluster_id, application, remote, wait) + submit_application(spark_cluster_operations, cluster_id, application, remote, wait) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/operations.py b/aztk/spark/client/cluster/operations.py index b065393a..bf7e3018 100644 --- a/aztk/spark/client/cluster/operations.py +++ b/aztk/spark/client/cluster/operations.py @@ -3,7 +3,7 @@ from aztk.client.cluster import CoreClusterOperations from .helpers import (copy, create, create_user, delete, get, get_application_log, get_application_status, list, - node_run, run, submit) + node_run, run, submit, diagnostics) class ClusterOperations(CoreClusterOperations, SparkBaseOperations): @@ -49,3 +49,6 @@ def copy(self, internal: bool = False, timeout: int = None): return copy.cluster_copy(self, cluster_id, source_path, destination_path, host, internal, timeout) + + def diagnostics(self, cluster_id, output_directory=None): + return diagnostics.run_cluster_diagnostics(self, cluster_id, output_directory) diff --git a/aztk/spark/helpers/__init__.py b/aztk/spark/helpers/__init__.py index e69de29b..1880b509 100644 --- a/aztk/spark/helpers/__init__.py +++ b/aztk/spark/helpers/__init__.py @@ -0,0 +1,2 @@ +# ALL FILES IN THIS DIRECTORY ARE DEPRECATED, WILL BE REMOVED IN v0.9.0 + diff --git a/tests/integration_tests/spark/sdk/cluster/test_cluster_new.py b/tests/integration_tests/spark/sdk/cluster/test_cluster_new.py index 7e2b6293..4d20a448 100644 --- a/tests/integration_tests/spark/sdk/cluster/test_cluster_new.py +++ b/tests/integration_tests/spark/sdk/cluster/test_cluster_new.py @@ -93,3 +93,289 @@ def test_create_cluster(): 
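A note on the submit helper earlier in this patch: each application task is pinned to the elected master through Batch's affinity mechanism, so the Spark driver runs on the node hosting the master daemon. A condensed sketch of that step; pin_to_master is an illustrative name, but both SDK calls are taken verbatim from affinitize_task_to_master above:

import azure.batch.models as batch_models

def pin_to_master(batch_client, cluster_id, master_node_id, task):
    # look up the master compute node and copy its affinity id onto the task,
    # which tells Batch to schedule the task on that specific node
    master = batch_client.compute_node.get(pool_id=cluster_id, node_id=master_node_id)
    task.affinity_info = batch_models.AffinityInformation(affinity_id=master.affinity_id)
    return task

This is also why submit raises AztkError while master_node_id is still None: there is no affinity target until the master election finishes. The test additions for the new API resume below.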
clean_up_cluster(cluster_configuration.cluster_id) + + +def test_list_clusters(): + test_id = "test-list-" + cluster_configuration = aztk.spark.models.ClusterConfiguration( + cluster_id=test_id + base_cluster_id, + vm_count=2, + vm_low_pri_count=0, + vm_size="standard_f2", + subnet_id=None, + custom_scripts=None, + file_shares=None, + toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), + spark_configuration=None) + try: + spark_client.cluster.create(cluster_configuration, wait=True) + clusters = spark_client.cluster.list() + + assert cluster_configuration.cluster_id in [cluster.id for cluster in clusters] + + except (AztkError, BatchErrorException): + assert False + + finally: + clean_up_cluster(cluster_configuration.cluster_id) + + +def test_get_remote_login_settings(): + test_id = "test-get-remote-login-" + cluster_configuration = aztk.spark.models.ClusterConfiguration( + cluster_id=test_id + base_cluster_id, + vm_count=2, + vm_low_pri_count=0, + vm_size="standard_f2", + subnet_id=None, + custom_scripts=None, + file_shares=None, + toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), + spark_configuration=None) + try: + spark_client.cluster.create(cluster_configuration, wait=True) + cluster = spark_client.cluster.get(cluster_id=cluster_configuration.cluster_id) + rls = spark_client.cluster.get_remote_login_settings(cluster_id=cluster.id, node_id=cluster.master_node_id) + + assert rls.ip_address is not None + assert rls.port is not None + + except (AztkError, BatchErrorException) as e: + raise e + assert False + + finally: + clean_up_cluster(cluster_configuration.cluster_id) + + +def test_submit(): + test_id = "test-submit-" + cluster_configuration = aztk.spark.models.ClusterConfiguration( + cluster_id=test_id + base_cluster_id, + vm_count=2, + vm_low_pri_count=0, + vm_size="standard_f2", + subnet_id=None, + custom_scripts=None, + file_shares=None, + toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), + spark_configuration=None) + application_configuration = aztk.spark.models.ApplicationConfiguration( + name="pipy100", + application="./examples/src/main/python/pi.py", + application_args=[100], + main_class=None, + jars=[], + py_files=[], + files=[], + driver_java_options=None, + driver_class_path=None, + driver_memory=None, + driver_cores=None, + executor_memory=None, + executor_cores=None, + max_retry_count=None) + try: + spark_client.cluster.create(cluster_configuration, wait=True) + + spark_client.cluster.submit( + cluster_id=cluster_configuration.cluster_id, application=application_configuration, wait=True) + assert True + + except (AztkError, BatchErrorException): + assert False + + finally: + clean_up_cluster(cluster_configuration.cluster_id) + + +def test_get_application_log(): + test_id = "test-get-app-log-" + cluster_configuration = aztk.spark.models.ClusterConfiguration( + cluster_id=test_id + base_cluster_id, + vm_count=2, + vm_low_pri_count=0, + vm_size="standard_f2", + subnet_id=None, + custom_scripts=None, + file_shares=None, + toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), + spark_configuration=None) + application_configuration = aztk.spark.models.ApplicationConfiguration( + name="pipy100", + application="./examples/src/main/python/pi.py", + application_args=[100], + main_class=None, + jars=[], + py_files=[], + files=[], + driver_java_options=None, + driver_class_path=None, + driver_memory=None, + driver_cores=None, + executor_memory=None, + executor_cores=None, + max_retry_count=None) + try: + spark_client.cluster.create(cluster_configuration, 
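# wait=True blocks until the pool is provisioned and the Spark master is ready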
wait=True) + + spark_client.cluster.submit( + cluster_id=cluster_configuration.cluster_id, application=application_configuration, wait=True) + application_log = spark_client.cluster.get_application_log( + cluster_id=cluster_configuration.cluster_id, + application_name=application_configuration.name, + tail=False, + current_bytes=0) + + assert application_log.exit_code == 0 + assert application_log.name == application_configuration.name == "pipy100" + assert application_log.application_state == "completed" + assert application_log.log is not None + assert application_log.total_bytes is not None + + except (AztkError, BatchErrorException): + assert False + + finally: + clean_up_cluster(cluster_configuration.cluster_id) + + +def test_create_user_password(): + #TODO: test with paramiko + pass + + +def test_create_user_ssh_key(): + #TODO: test with paramiko + pass + + +def test_get_application_status_complete(): + test_id = "test-app-status-complete-" + cluster_configuration = aztk.spark.models.ClusterConfiguration( + cluster_id=test_id + base_cluster_id, + vm_count=2, + vm_low_pri_count=0, + vm_size="standard_f2", + subnet_id=None, + custom_scripts=None, + file_shares=None, + toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), + spark_configuration=None) + application_configuration = aztk.spark.models.ApplicationConfiguration( + name="pipy100", + application="./examples/src/main/python/pi.py", + application_args=[100], + main_class=None, + jars=[], + py_files=[], + files=[], + driver_java_options=None, + driver_class_path=None, + driver_memory=None, + driver_cores=None, + executor_memory=None, + executor_cores=None, + max_retry_count=None) + try: + spark_client.cluster.create(cluster_configuration, wait=True) + + spark_client.cluster.submit( + cluster_id=cluster_configuration.cluster_id, application=application_configuration, wait=True) + status = spark_client.cluster.get_application_status( + cluster_id=cluster_configuration.cluster_id, application_name=application_configuration.name) + + assert status == "completed" + + except (AztkError, BatchErrorException): + assert False + + finally: + clean_up_cluster(cluster_configuration.cluster_id) + + +def test_delete_cluster(): + test_id = "test-delete-" + cluster_configuration = aztk.spark.models.ClusterConfiguration( + cluster_id=test_id + base_cluster_id, + vm_count=2, + vm_low_pri_count=0, + vm_size="standard_f2", + subnet_id=None, + custom_scripts=None, + file_shares=None, + toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), + spark_configuration=None) + + try: + spark_client.cluster.create(cluster_configuration, wait=True) + success = spark_client.cluster.delete(cluster_id=cluster_configuration.cluster_id) + + assert success is True + + except (AztkError, BatchErrorException): + assert False + + finally: + clean_up_cluster(cluster_configuration.cluster_id) + + +def test_spark_processes_up(): + test_id = "test-spark-processes-up-" + cluster_configuration = aztk.spark.models.ClusterConfiguration( + cluster_id=test_id + base_cluster_id, + vm_count=2, + vm_low_pri_count=0, + vm_size="standard_f2", + subnet_id=None, + custom_scripts=None, + file_shares=None, + toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), + spark_configuration=None) + + try: + cluster = spark_client.cluster.create(cluster_configuration, wait=True) + wait_for_all_nodes(cluster.id, cluster.nodes) + success = spark_client.cluster.delete(cluster_id=cluster_configuration.cluster_id) + + assert success is True + + except (AztkError, BatchErrorException): + assert 
False + + finally: + clean_up_cluster(cluster_configuration.cluster_id) + + +def test_debug_tool(): + test_id = "debug-tool-" + cluster_configuration = aztk.spark.models.ClusterConfiguration( + cluster_id=test_id + base_cluster_id, + size=2, + size_low_priority=0, + vm_size="standard_f2", + subnet_id=None, + custom_scripts=None, + file_shares=None, + toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), + spark_configuration=None) + expected_members = [ + "df.txt", "hostname.txt", "docker-images.txt", "docker-containers.txt", "spark/docker.log", "spark/ps_aux.txt", + "spark/logs", "spark/wd" + ] + try: + cluster = spark_client.cluster.create(cluster_configuration, wait=True) + nodes = [node for node in cluster.nodes] + wait_for_all_nodes(cluster.id, nodes) + cluster_output = spark_client.cluster.diagnostics(cluster_id=cluster.id) + for node_output in cluster_output: + node_output.output.seek(0) # tempfile requires seek 0 before reading + debug_zip = ZipFile(node_output.output) + assert node_output.id in [node.id for node in nodes] + assert node_output.error is None + assert any(member in name for name in debug_zip.namelist() for member in expected_members) + except (AztkError, BatchErrorException): + assert False + + finally: + clean_up_cluster(cluster_configuration.cluster_id) diff --git a/tests/integration_tests/spark/sdk/job/test_job_new.py b/tests/integration_tests/spark/sdk/job/test_job_new.py new file mode 100644 index 00000000..561e58c8 --- /dev/null +++ b/tests/integration_tests/spark/sdk/job/test_job_new.py @@ -0,0 +1,275 @@ +import os +import subprocess +from datetime import datetime + +from azure.batch.models import BatchErrorException + +import aztk.spark +from aztk.error import AztkError +from aztk_cli import config +from tests.integration_tests.spark.sdk.get_client import get_spark_client, get_test_suffix + + +base_job_id = get_test_suffix("job") +spark_client = get_spark_client() + + +def test_submit_job(): + test_id = "submit-" + app1 = aztk.spark.models.ApplicationConfiguration( + name="pipy100", + application="./examples/src/main/python/pi.py", + application_args=[100] + ) + app2 = aztk.spark.models.ApplicationConfiguration( + name="pipy101", + application="./examples/src/main/python/pi.py", + application_args=[100] + ) + job_configuration = aztk.spark.models.JobConfiguration( + id=test_id+base_job_id, + applications=[app1, app2], + vm_size="standard_f1", + spark_configuration=None, + toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), + max_dedicated_nodes=2, + max_low_pri_nodes=0 + ) + try: + job = spark_client.job.submit(job_configuration=job_configuration) + spark_client.wait_until_job_finished(id=job_configuration.id) + + assert job.id == job_configuration.id + assert job.state is not None + + except (AztkError, BatchErrorException) as e: + raise e + + finally: + clean_up_job(job_configuration.id) + + +def test_list_jobs(): + test_id = "list-" + app1 = aztk.spark.models.ApplicationConfiguration( + name="pipy100", + application="./examples/src/main/python/pi.py", + application_args=[100] + ) + app2 = aztk.spark.models.ApplicationConfiguration( + name="pipy101", + application="./examples/src/main/python/pi.py", + application_args=[100] + ) + job_configuration = aztk.spark.models.JobConfiguration( + id=test_id+base_job_id, + applications=[app1, app2], + vm_size="standard_f1", + custom_scripts=None, + spark_configuration=None, + toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), + max_dedicated_nodes=1, + max_low_pri_nodes=0, + worker_on_master=True + ) + 
try: + spark_client.job.submit(job_configuration=job_configuration) + spark_client.wait_until_job_finished(job_configuration.id) + + jobs = spark_client.job.list() + + assert jobs is not None + assert job_configuration.id in [job.id for job in jobs] + + except (AztkError, BatchErrorException) as e: + raise e + + finally: + clean_up_job(job_configuration.id) + + +def test_list_applications(): + test_id = "list-apps-" + app1 = aztk.spark.models.ApplicationConfiguration( + name="pipy100", + application="./examples/src/main/python/pi.py", + application_args=[100] + ) + app2 = aztk.spark.models.ApplicationConfiguration( + name="pipy101", + application="./examples/src/main/python/pi.py", + application_args=[100] + ) + job_configuration = aztk.spark.models.JobConfiguration( + id=test_id+base_job_id, + applications=[app1, app2], + vm_size="standard_f1", + custom_scripts=None, + spark_configuration=None, + toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), + max_dedicated_nodes=2, + max_low_pri_nodes=0 + ) + try: + spark_client.job.submit(job_configuration=job_configuration) + spark_client.wait_until_job_finished(job_configuration.id) + + applications = spark_client.list_applications(id=job_configuration.id) + + assert applications not in (None, []) + assert len(applications) == 2 + for application in applications: + assert isinstance(application, (aztk.spark.models.Application, str)) + + except (AztkError, BatchErrorException) as e: + raise e + + finally: + clean_up_job(job_configuration.id) + + +def test_get_job(): + test_id = "get-" + app1 = aztk.spark.models.ApplicationConfiguration( + name="pipy100", + application="./examples/src/main/python/pi.py", + application_args=[100] + ) + app2 = aztk.spark.models.ApplicationConfiguration( + name="pipy101", + application="./examples/src/main/python/pi.py", + application_args=[100] + ) + job_configuration = aztk.spark.models.JobConfiguration( + id=test_id+base_job_id, + applications=[app1, app2], + vm_size="standard_f1", + custom_scripts=None, + spark_configuration=None, + toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), + max_dedicated_nodes=1, + max_low_pri_nodes=0, + worker_on_master=True + ) + try: + spark_client.job.submit(job_configuration=job_configuration) + spark_client.wait_until_job_finished(job_configuration.id) + + job = spark_client.job.get(id=job_configuration.id) + assert job.id == job_configuration.id + assert app1.name in [app.name for app in job.applications] + assert app2.name in [app.name for app in job.applications] + + except (AztkError, BatchErrorException) as e: + raise e + + finally: + clean_up_job(job_configuration.id) + + +def test_get_application(): + test_id = "get-app-" + app1 = aztk.spark.models.ApplicationConfiguration( + name="pipy100", + application="./examples/src/main/python/pi.py", + application_args=[100] + ) + job_configuration = aztk.spark.models.JobConfiguration( + id=test_id+base_job_id, + applications=[app1], + vm_size="standard_f1", + custom_scripts=None, + spark_configuration=None, + toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), + max_dedicated_nodes=2, + max_low_pri_nodes=0 + ) + try: + spark_client.job.submit(job_configuration=job_configuration) + spark_client.wait_until_job_finished(job_configuration.id) + application = spark_client.get_application(id=job_configuration.id, application_name=app1.name) + assert isinstance(application, aztk.spark.models.Application) + assert application.exit_code == 0 + assert application.state == "completed" + assert application.name == "pipy100" + 
except (AztkError, BatchErrorException) as e: + raise e + finally: + clean_up_job(job_configuration.id) + + +def test_get_application_log(): + test_id = "gal-" + app1 = aztk.spark.models.ApplicationConfiguration( + name="pipy100", + application="./examples/src/main/python/pi.py", + application_args=[100] + ) + job_configuration = aztk.spark.models.JobConfiguration( + id=test_id+base_job_id, + applications=[app1], + vm_size="standard_f1", + custom_scripts=None, + spark_configuration=None, + toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), + max_dedicated_nodes=2, + max_low_pri_nodes=0 + ) + try: + spark_client.job.submit(job_configuration=job_configuration) + spark_client.wait_until_job_finished(job_configuration.id) + + application_log = spark_client.job.get_application_log(id=job_configuration.id, application_name=app1.name) + + assert isinstance(application_log, aztk.spark.models.ApplicationLog) + assert application_log.log is not None + assert application_log.exit_code == 0 + assert application_log.name == "pipy100" + assert application_log.total_bytes != 0 + + except (AztkError, BatchErrorException) as e: + raise e + + finally: + clean_up_job(job_configuration.id) + + +def test_delete_job(): + test_id = "delete-" + app1 = aztk.spark.models.ApplicationConfiguration( + name="pipy100", + application="./examples/src/main/python/pi.py", + application_args=[100] + ) + job_configuration = aztk.spark.models.JobConfiguration( + id=test_id+base_job_id, + applications=[app1], + vm_size="standard_f1", + custom_scripts=None, + spark_configuration=None, + toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), + max_dedicated_nodes=1, + max_low_pri_nodes=0, + worker_on_master=True + ) + try: + spark_client.job.submit(job_configuration=job_configuration) + spark_client.wait_until_job_finished(job_configuration.id) + spark_client.job.delete(job_configuration.id) + assert job_configuration.id not in spark_client.job.list() + try: + spark_client.job.get(job_configuration.id) + assert False, "getting a deleted job should raise AztkError" + except AztkError: + pass + except (AztkError, BatchErrorException) as e: + raise e + finally: + clean_up_job(job_configuration.id) + + +def clean_up_job(job_id): + try: + spark_client.job.delete(job_id) + except (BatchErrorException, AztkError): + pass From a47ab0729059750e47265a739de71e3c11835ea1 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Wed, 27 Jun 2018 13:11:54 -0700 Subject: [PATCH 09/52] fix naming and bugs --- aztk/spark/client/cluster/helpers/copy.py | 4 ++-- aztk/spark/client/cluster/helpers/create_user.py | 6 +++--- aztk/spark/client/cluster/helpers/delete.py | 4 ++-- aztk/spark/client/cluster/helpers/download.py | 4 ++-- aztk/spark/client/cluster/helpers/get.py | 4 ++-- aztk/spark/client/cluster/helpers/get_application_log.py | 4 ++-- aztk/spark/client/cluster/helpers/get_application_status.py | 4 ++-- aztk/spark/client/cluster/helpers/list.py | 4 ++-- aztk/spark/client/cluster/helpers/node_run.py | 4 ++-- aztk/spark/client/cluster/helpers/run.py | 4 ++-- aztk/spark/client/cluster/helpers/ssh_into_master.py | 4 ++-- 11 files changed, 23 insertions(+), 23 deletions(-) diff --git a/aztk/spark/client/cluster/helpers/copy.py b/aztk/spark/client/cluster/helpers/copy.py index 2434438d..aafbacf1 100644 --- a/aztk/spark/client/cluster/helpers/copy.py +++ b/aztk/spark/client/cluster/helpers/copy.py @@ -4,9 +4,9 @@ from aztk.spark import helpers -def cluster_copy(spark_cluster_client, cluster_id: str, source_path: str, destination_path: str, host: bool = False, internal: bool = False,
timeout: int = None): +def cluster_copy(spark_cluster_operations, cluster_id: str, source_path: str, destination_path: str, host: bool = False, internal: bool = False, timeout: int = None): try: container_name = None if host else 'spark' - return spark_cluster_client.cluster_copy(cluster_id, source_path, destination_path=destination_path, container_name=container_name, get=False, internal=internal, timeout=timeout) + return spark_cluster_operations.copy(cluster_id, source_path, destination_path=destination_path, container_name=container_name, get=False, internal=internal, timeout=timeout) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/create_user.py b/aztk/spark/client/cluster/helpers/create_user.py index 9c078e8a..c4869325 100644 --- a/aztk/spark/client/cluster/helpers/create_user.py +++ b/aztk/spark/client/cluster/helpers/create_user.py @@ -4,12 +4,12 @@ from aztk.spark import helpers -def create_user(spark_cluster_client, cluster_id: str, username: str, password: str = None, ssh_key: str = None) -> str: +def create_user(spark_cluster_operations, cluster_id: str, username: str, password: str = None, ssh_key: str = None) -> str: try: - cluster = spark_cluster_client.get(cluster_id) + cluster = spark_cluster_operations.get(cluster_id) master_node_id = cluster.master_node_id if not master_node_id: raise error.ClusterNotReadyError("The master has not yet been picked, a user cannot be added.") - spark_cluster_client.create_user_on_pool(username, cluster.id, cluster.nodes, ssh_key, password) + spark_cluster_operations.create_user_on_pool(username, cluster.id, cluster.nodes, ssh_key, password) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/delete.py b/aztk/spark/client/cluster/helpers/delete.py index 5daa0455..446d77bb 100644 --- a/aztk/spark/client/cluster/helpers/delete.py +++ b/aztk/spark/client/cluster/helpers/delete.py @@ -4,8 +4,8 @@ from aztk.utils import helpers -def delete_cluster(spark_cluster_client, cluster_id: str, keep_logs: bool = False): +def delete_cluster(spark_cluster_operations, cluster_id: str, keep_logs: bool = False): try: - return super(type(spark_cluster_client), spark_cluster_client).delete(cluster_id, keep_logs) + return super(type(spark_cluster_operations), spark_cluster_operations).delete(cluster_id, keep_logs) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/download.py b/aztk/spark/client/cluster/helpers/download.py index d3782823..24b46ef4 100644 --- a/aztk/spark/client/cluster/helpers/download.py +++ b/aztk/spark/client/cluster/helpers/download.py @@ -5,10 +5,10 @@ from aztk.spark import helpers -def cluster_download(spark_cluster_client, cluster_id: str, source_path: str, destination_path: str = None, host: bool = False, internal: bool = False, timeout: int = None): +def cluster_download(spark_cluster_operations, cluster_id: str, source_path: str, destination_path: str = None, host: bool = False, internal: bool = False, timeout: int = None): try: container_name = None if host else 'spark' - return spark_cluster_client.cluster_copy(cluster_id, + return spark_cluster_operations.copy(cluster_id, source_path, destination_path=destination_path, container_name=container_name, diff --git a/aztk/spark/client/cluster/helpers/get.py 
b/aztk/spark/client/cluster/helpers/get.py index 53769f76..8e0ec5f6 100644 --- a/aztk/spark/client/cluster/helpers/get.py +++ b/aztk/spark/client/cluster/helpers/get.py @@ -5,9 +5,9 @@ from aztk.spark import helpers -def get_cluster(spark_cluster_client , cluster_id: str): +def get_cluster(spark_cluster_operations, cluster_id: str): try: - pool, nodes = super(type(spark_cluster_client), spark_cluster_client).get(cluster_id) + pool, nodes = super(type(spark_cluster_operations), spark_cluster_operations).get(cluster_id) return models.Cluster(pool, nodes) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/get_application_log.py b/aztk/spark/client/cluster/helpers/get_application_log.py index 1d9589e5..6e60ed24 100644 --- a/aztk/spark/client/cluster/helpers/get_application_log.py +++ b/aztk/spark/client/cluster/helpers/get_application_log.py @@ -102,13 +102,13 @@ def get_log(batch_client, blob_client, cluster_id: str, application_name: str, t exit_code=task.execution_info.exit_code) -def get_application_log(spark_cluster_client, +def get_application_log(spark_cluster_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): try: - return get_log(spark_cluster_client.batch_client, spark_cluster_client.blob_client, cluster_id, + return get_log(spark_cluster_operations.batch_client, spark_cluster_operations.blob_client, cluster_id, application_name, tail, current_bytes) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/get_application_status.py b/aztk/spark/client/cluster/helpers/get_application_status.py index fafe6521..425a8d1d 100644 --- a/aztk/spark/client/cluster/helpers/get_application_status.py +++ b/aztk/spark/client/cluster/helpers/get_application_status.py @@ -4,9 +4,9 @@ from aztk.spark import helpers -def get_application_status(spark_cluster_client, cluster_id: str, app_name: str): +def get_application_status(spark_cluster_operations, cluster_id: str, app_name: str): try: - task = spark_cluster_client.batch_client.task.get(cluster_id, app_name) + task = spark_cluster_operations.batch_client.task.get(cluster_id, app_name) return task.state._value_ except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/list.py b/aztk/spark/client/cluster/helpers/list.py index ba7a83c9..07fd414b 100644 --- a/aztk/spark/client/cluster/helpers/list.py +++ b/aztk/spark/client/cluster/helpers/list.py @@ -6,9 +6,9 @@ from aztk.spark import helpers -def list_clusters(spark_cluster_client): +def list_clusters(spark_cluster_operations): try: software_metadata_key = aztk.models.Software.spark - return [models.Cluster(pool) for pool in super(type(spark_cluster_client), spark_cluster_client).list(software_metadata_key)] + return [models.Cluster(pool) for pool in super(type(spark_cluster_operations), spark_cluster_operations).list(software_metadata_key)] except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/node_run.py b/aztk/spark/client/cluster/helpers/node_run.py index d92891f1..41b719a5 100644 --- a/aztk/spark/client/cluster/helpers/node_run.py +++ b/aztk/spark/client/cluster/helpers/node_run.py @@ -4,7 +4,7 @@ from aztk.spark import helpers -def node_run(spark_cluster_client, +def 
node_run(spark_cluster_operations, cluster_id: str, node_id: str, command: str, @@ -12,7 +12,7 @@ def node_run(spark_cluster_client, internal: bool = False, timeout=None): try: - return super(type(spark_cluster_client), spark_cluster_client).node_run( + return super(type(spark_cluster_operations), spark_cluster_operations).node_run( cluster_id, node_id, command, internal, container_name='spark' if not host else None, timeout=timeout) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/run.py b/aztk/spark/client/cluster/helpers/run.py index a89967ef..9c21817b 100644 --- a/aztk/spark/client/cluster/helpers/run.py +++ b/aztk/spark/client/cluster/helpers/run.py @@ -4,9 +4,9 @@ from aztk.spark import helpers -def cluster_run(spark_cluster_client, cluster_id: str, command: str, host=False, internal: bool = False, timeout=None): +def cluster_run(spark_cluster_operations, cluster_id: str, command: str, host=False, internal: bool = False, timeout=None): try: - return super(type(spark_cluster_client), spark_cluster_client).run( + return super(type(spark_cluster_operations), spark_cluster_operations).run( cluster_id, command, internal, container_name='spark' if not host else None, timeout=timeout) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/ssh_into_master.py b/aztk/spark/client/cluster/helpers/ssh_into_master.py index aec66517..1d3e8670 100644 --- a/aztk/spark/client/cluster/helpers/ssh_into_master.py +++ b/aztk/spark/client/cluster/helpers/ssh_into_master.py @@ -5,8 +5,8 @@ from aztk.spark import helpers -def cluster_ssh_into_master(spark_cluster_client, cluster_id, node_id, username, ssh_key=None, password=None, port_forward_list=None, internal=False): +def cluster_ssh_into_master(spark_cluster_operations, cluster_id, node_id, username, ssh_key=None, password=None, port_forward_list=None, internal=False): try: - spark_cluster_client.ssh_into_node(cluster_id, node_id, username, ssh_key, password, port_forward_list, internal) + spark_cluster_operations.ssh_into_node(cluster_id, node_id, username, ssh_key, password, port_forward_list, internal) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) From 457adfa46d14036684f93994794ca6c5b12a986f Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Wed, 27 Jun 2018 13:18:13 -0700 Subject: [PATCH 10/52] update job operations naming, bug fixes --- aztk/spark/client/job/helpers/delete.py | 14 +++++++------- aztk/spark/client/job/helpers/get.py | 16 ++++++++-------- aztk/spark/client/job/helpers/get_application.py | 10 +++++----- .../client/job/helpers/get_application_log.py | 14 +++++++------- aztk/spark/client/job/helpers/get_recent_job.py | 6 +++--- aztk/spark/client/job/helpers/list.py | 4 ++-- .../client/job/helpers/list_applications.py | 10 +++++----- aztk/spark/client/job/helpers/stop.py | 8 ++++---- aztk/spark/client/job/helpers/submit.py | 14 +++++++------- 9 files changed, 48 insertions(+), 48 deletions(-) diff --git a/aztk/spark/client/job/helpers/delete.py b/aztk/spark/client/job/helpers/delete.py index 2c388b4a..eb0f7658 100644 --- a/aztk/spark/client/job/helpers/delete.py +++ b/aztk/spark/client/job/helpers/delete.py @@ -8,32 +8,32 @@ from .get_recent_job import get_recent_job -def _delete(spark_client, job_id, keep_logs: bool = False): - recent_run_job = get_recent_job(spark_client, job_id) +def 
_delete(spark_job_operations, job_id, keep_logs: bool = False): + recent_run_job = get_recent_job(spark_job_operations, job_id) deleted_job_or_job_schedule = False # delete job try: - spark_client.batch_client.job.delete(recent_run_job.id) + spark_job_operations.batch_client.job.delete(recent_run_job.id) deleted_job_or_job_schedule = True except batch_models.batch_error.BatchErrorException: pass # delete job_schedule try: - spark_client.batch_client.job_schedule.delete(job_id) + spark_job_operations.batch_client.job_schedule.delete(job_id) deleted_job_or_job_schedule = True except batch_models.batch_error.BatchErrorException: pass # delete storage container if keep_logs: - cluster_data = spark_client.get_cluster_data(job_id) + cluster_data = spark_job_operations.get_cluster_data(job_id) cluster_data.delete_container(job_id) return deleted_job_or_job_schedule -def delete(spark_job_client, job_id: str, keep_logs: bool = False): +def delete(spark_job_operations, job_id: str, keep_logs: bool = False): try: - return _delete(spark_job_client, job_id, keep_logs) + return _delete(spark_job_operations, job_id, keep_logs) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/helpers/get.py b/aztk/spark/client/job/helpers/get.py index 0677684f..94fab362 100644 --- a/aztk/spark/client/job/helpers/get.py +++ b/aztk/spark/client/job/helpers/get.py @@ -7,26 +7,26 @@ from .get_recent_job import get_recent_job -def _get_job(spark_client, job_id): - job = spark_client.batch_client.job_schedule.get(job_id) +def _get_job(spark_job_operations, job_id): + job = spark_job_operations.batch_client.job_schedule.get(job_id) job_apps = [ - app for app in spark_client.batch_client.task.list(job_id=job.execution_info.recent_job.id) if app.id != job_id + app for app in spark_job_operations.batch_client.task.list(job_id=job.execution_info.recent_job.id) if app.id != job_id ] - recent_run_job = get_recent_job(spark_client, job_id) + recent_run_job = get_recent_job(spark_job_operations, job_id) pool_prefix = recent_run_job.pool_info.auto_pool_specification.auto_pool_id_prefix pool = nodes = None - for cloud_pool in spark_client.batch_client.pool.list(): + for cloud_pool in spark_job_operations.batch_client.pool.list(): if pool_prefix in cloud_pool.id: pool = cloud_pool break if pool: - nodes = spark_client.batch_client.compute_node.list(pool_id=pool.id) + nodes = spark_job_operations.batch_client.compute_node.list(pool_id=pool.id) return job, job_apps, pool, nodes -def get_job(spark_job_client, job_id): +def get_job(spark_job_operations, job_id): try: - job, apps, pool, nodes = _get_job(spark_job_client, job_id) + job, apps, pool, nodes = _get_job(spark_job_operations, job_id) return models.Job(job, apps, pool, nodes) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/helpers/get_application.py b/aztk/spark/client/job/helpers/get_application.py index b06b9891..4a514302 100644 --- a/aztk/spark/client/job/helpers/get_application.py +++ b/aztk/spark/client/job/helpers/get_application.py @@ -8,18 +8,18 @@ from .get_recent_job import get_recent_job -def _get_application(spark_client, job_id, application_name): +def _get_application(spark_job_operations, job_id, application_name): # info about the app - recent_run_job = get_recent_job(spark_client, job_id) + recent_run_job = get_recent_job(spark_job_operations, job_id) try: - return 
spark_client.batch_client.task.get(job_id=recent_run_job.id, task_id=application_name) + return spark_job_operations.batch_client.task.get(job_id=recent_run_job.id, task_id=application_name) except batch_models.batch_error.BatchErrorException: raise error.AztkError( "The Spark application {0} is still being provisioned or does not exist.".format(application_name)) -def get_application(spark_job_client, job_id, application_name): +def get_application(spark_job_operations, job_id, application_name): try: - return models.Application(_get_application(spark_job_client, job_id, application_name)) + return models.Application(_get_application(spark_job_operations, job_id, application_name)) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/helpers/get_application_log.py b/aztk/spark/client/job/helpers/get_application_log.py index 8eaa04bd..51dc63ac 100644 --- a/aztk/spark/client/job/helpers/get_application_log.py +++ b/aztk/spark/client/job/helpers/get_application_log.py @@ -9,16 +9,16 @@ from .get_recent_job import get_recent_job -def _get_application_log(spark_client, job_id, application_name): +def _get_application_log(spark_job_operations, job_id, application_name): # TODO: change where the logs are uploaded so they aren't overwritten on scheduled runs # current: job_id, application_name/output.log # new: job_id, recent_run_job.id/application_name/output.log - recent_run_job = get_recent_job(spark_client, job_id) + recent_run_job = get_recent_job(spark_job_operations, job_id) try: - task = spark_client.batch_client.task.get(job_id=recent_run_job.id, task_id=application_name) + task = spark_job_operations.batch_client.task.get(job_id=recent_run_job.id, task_id=application_name) except batch_models.batch_error.BatchErrorException as e: # see if the application is written to metadata of pool - applications = list_applications(spark_client, job_id) + applications = spark_job_operations.list_applications(job_id) for application in applications: if applications[application] is None and application == application_name: @@ -29,11 +29,11 @@ def _get_application_log(spark_client, job_id, application_name): batch_models.TaskState.preparing): raise error.AztkError("The application {0} has not yet finished executing.".format(application_name)) - return spark_client.cluster.get_application_log(job_id, application_name) + return spark_job_operations.get_application_log(job_id, application_name) -def get_job_application_log(spark_job_client, job_id, application_name): +def get_job_application_log(spark_job_operations, job_id, application_name): try: - return _get_application_log(spark_job_client, job_id, application_name) + return _get_application_log(spark_job_operations, job_id, application_name) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/helpers/get_recent_job.py b/aztk/spark/client/job/helpers/get_recent_job.py index 6b4c7d17..62a5082d 100644 --- a/aztk/spark/client/job/helpers/get_recent_job.py +++ b/aztk/spark/client/job/helpers/get_recent_job.py @@ -1,3 +1,3 @@ -def get_recent_job(spark_client, job_id): - job_schedule = spark_client.batch_client.job_schedule.get(job_id) - return spark_client.batch_client.job.get(job_schedule.execution_info.recent_job.id) +def get_recent_job(spark_job_operations, job_id): + job_schedule = spark_job_operations.batch_client.job_schedule.get(job_id) + return 
spark_job_operations.batch_client.job.get(job_schedule.execution_info.recent_job.id) diff --git a/aztk/spark/client/job/helpers/list.py b/aztk/spark/client/job/helpers/list.py index db253df4..907949e6 100644 --- a/aztk/spark/client/job/helpers/list.py +++ b/aztk/spark/client/job/helpers/list.py @@ -5,8 +5,8 @@ from aztk.spark import helpers, models -def _list_jobs(spark_client): - return [cloud_job_schedule for cloud_job_schedule in spark_client.batch_client.job_schedule.list()] +def _list_jobs(spark_job_operations): + return [cloud_job_schedule for cloud_job_schedule in spark_job_operations.batch_client.job_schedule.list()] def list_jobs(self): diff --git a/aztk/spark/client/job/helpers/list_applications.py b/aztk/spark/client/job/helpers/list_applications.py index 6bc12844..aca5cc16 100644 --- a/aztk/spark/client/job/helpers/list_applications.py +++ b/aztk/spark/client/job/helpers/list_applications.py @@ -7,8 +7,8 @@ from .get_recent_job import get_recent_job -def _list_applications(spark_client, job_id): - recent_run_job = get_recent_job(spark_client, job_id) +def _list_applications(spark_job_operations, job_id): + recent_run_job = get_recent_job(spark_job_operations, job_id) # get application names from Batch job metadata applications = {} for metadata_item in recent_run_job.metadata: @@ -17,16 +17,16 @@ def _list_applications(spark_client, job_id): applications[app_name] = None # get tasks from Batch job - for task in spark_client.batch_client.task.list(recent_run_job.id): + for task in spark_job_operations.batch_client.task.list(recent_run_job.id): if task.id != job_id: applications[task.id] = task return applications -def list_applications(spark_job_client, job_id): +def list_applications(spark_job_operations, job_id): try: - applications = _list_applications(spark_job_client, job_id) + applications = _list_applications(spark_job_operations, job_id) for item in applications: if applications[item]: applications[item] = models.Application(applications[item]) diff --git a/aztk/spark/client/job/helpers/stop.py b/aztk/spark/client/job/helpers/stop.py index e19bd696..8e21c416 100644 --- a/aztk/spark/client/job/helpers/stop.py +++ b/aztk/spark/client/job/helpers/stop.py @@ -7,12 +7,12 @@ from .get_recent_job import get_recent_job -def _stop(spark_client, job_id): +def _stop(spark_job_operations, job_id): # terminate currently running job and tasks - recent_run_job = get_recent_job(spark_client, job_id) - spark_client.batch_client.job.terminate(recent_run_job.id) + recent_run_job = get_recent_job(spark_job_operations, job_id) + spark_job_operations.batch_client.job.terminate(recent_run_job.id) # terminate job_schedule - spark_client.batch_client.job_schedule.terminate(job_id) + spark_job_operations.batch_client.job_schedule.terminate(job_id) def stop(self, job_id): diff --git a/aztk/spark/client/job/helpers/submit.py b/aztk/spark/client/job/helpers/submit.py index a7def446..15ffa667 100644 --- a/aztk/spark/client/job/helpers/submit.py +++ b/aztk/spark/client/job/helpers/submit.py @@ -63,16 +63,15 @@ def _apply_default_for_job_config(job_conf: models.JobConfiguration): return job_conf -def submit_job(spark_job_client, job_configuration: models.JobConfiguration): +def submit_job(spark_job_operations, job_configuration: models.JobConfiguration): try: job_configuration = _apply_default_for_job_config(job_configuration) job_configuration.validate() - cluster_data = spark_job_client._get_cluster_data(job_configuration.id) + cluster_data = 
spark_job_operations.get_cluster_data(job_configuration.id) node_data = NodeData(job_configuration.to_cluster_config()).add_core().done() zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file() - start_task = spark_job_client.__generate_cluster_start_task( - spark_job_client, + start_task = spark_job_operations.generate_cluster_start_task( zip_resource_files, job_configuration.id, job_configuration.gpu_enabled, @@ -83,9 +82,10 @@ def submit_job(spark_job_client, job_configuration: models.JobConfiguration): application_tasks = [] for application in job_configuration.applications: application_tasks.append((application, - spark_job_client.__generate_application_task(spark_job_client, job_configuration.id, application))) + spark_job_operations.generate_application_task(job_configuration.id, + application))) - job_manager_task = generate_job_manager_task(spark_job_client, job_configuration, application_tasks) + job_manager_task = generate_job_manager_task(spark_job_operations, job_configuration, application_tasks) software_metadata_key = "spark" @@ -96,7 +96,7 @@ def submit_job(spark_job_client, job_configuration: models.JobConfiguration): job_configuration.max_dedicated_nodes, job_configuration.max_low_pri_nodes) - job = spark_job_client.__submit( + job = super(type(spark_job_operations), spark_job_operations).submit( job_configuration=job_configuration, start_task=start_task, job_manager_task=job_manager_task, From 880e66d08c9b4cd9b33eb6ddb19447f97b64dd3e Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Wed, 27 Jun 2018 16:28:50 -0700 Subject: [PATCH 11/52] fix cluster tests --- aztk/client/cluster/helpers/copy.py | 10 +++++----- aztk/spark/client/cluster/helpers/copy.py | 2 +- aztk/spark/client/cluster/helpers/download.py | 2 +- aztk/spark/client/cluster/operations.py | 15 ++++++++++++--- 4 files changed, 19 insertions(+), 10 deletions(-) diff --git a/aztk/client/cluster/helpers/copy.py b/aztk/client/cluster/helpers/copy.py index 11e741fa..706ee27e 100644 --- a/aztk/client/cluster/helpers/copy.py +++ b/aztk/client/cluster/helpers/copy.py @@ -6,16 +6,16 @@ from aztk.utils import ssh as ssh_lib -def cluster_copy(cluster_client, cluster_id, source_path, destination_path=None, container_name=None, internal=False, get=False, timeout=None): - cluster = cluster_client.get(cluster_id) +def cluster_copy(cluster_operations, cluster_id, source_path, destination_path=None, container_name=None, internal=False, get=False, timeout=None): + cluster = cluster_operations.get(cluster_id) pool, nodes = cluster.pool, list(cluster.nodes) if internal: cluster_nodes = [(node, models.RemoteLogin(ip_address=node.ip_address, port="22")) for node in nodes] else: - cluster_nodes = [(node, cluster_client.__get_remote_login_settings(pool.id, node.id)) for node in nodes] + cluster_nodes = [(node, cluster_operations.get_remote_login_settings(pool.id, node.id)) for node in nodes] try: - generated_username, ssh_key = cluster_client.generate_user_on_pool(pool.id, nodes) + generated_username, ssh_key = cluster_operations.generate_user_on_pool(pool.id, nodes) output = asyncio.get_event_loop().run_until_complete( ssh_lib.clus_copy( container_name=container_name, @@ -32,4 +32,4 @@ def cluster_copy(cluster_client, cluster_id, source_path, destination_path=None, except (OSError, batch_error.BatchErrorException) as exc: raise exc finally: - cluster_client.__delete_user_on_pool(generated_username, pool.id, nodes) + cluster_operations.delete_user_on_pool(generated_username, pool.id, nodes) diff --git 
a/aztk/spark/client/cluster/helpers/copy.py b/aztk/spark/client/cluster/helpers/copy.py index aafbacf1..6051637d 100644 --- a/aztk/spark/client/cluster/helpers/copy.py +++ b/aztk/spark/client/cluster/helpers/copy.py @@ -7,6 +7,6 @@ def cluster_copy(spark_cluster_operations, cluster_id: str, source_path: str, destination_path: str, host: bool = False, internal: bool = False, timeout: int = None): try: container_name = None if host else 'spark' - return spark_cluster_operations.copy(cluster_id, source_path, destination_path=destination_path, container_name=container_name, get=False, internal=internal, timeout=timeout) + return super(type(spark_cluster_operations), spark_cluster_operations).copy(cluster_id, source_path, destination_path=destination_path, container_name=container_name, get=False, internal=internal, timeout=timeout) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/download.py b/aztk/spark/client/cluster/helpers/download.py index 24b46ef4..b58b62c6 100644 --- a/aztk/spark/client/cluster/helpers/download.py +++ b/aztk/spark/client/cluster/helpers/download.py @@ -8,7 +8,7 @@ def cluster_download(spark_cluster_operations, cluster_id: str, source_path: str, destination_path: str = None, host: bool = False, internal: bool = False, timeout: int = None): try: container_name = None if host else 'spark' - return spark_cluster_operations.copy(cluster_id, + return super(type(spark_cluster_operations), spark_cluster_operations).copy(cluster_id, source_path, destination_path=destination_path, container_name=container_name, diff --git a/aztk/spark/client/cluster/operations.py b/aztk/spark/client/cluster/operations.py index bf7e3018..ebc1b56c 100644 --- a/aztk/spark/client/cluster/operations.py +++ b/aztk/spark/client/cluster/operations.py @@ -1,9 +1,9 @@ +from aztk.client.cluster import CoreClusterOperations from aztk.spark import models from aztk.spark.client.base import SparkBaseOperations -from aztk.client.cluster import CoreClusterOperations -from .helpers import (copy, create, create_user, delete, get, get_application_log, get_application_status, list, - node_run, run, submit, diagnostics) +from .helpers import (copy, create, create_user, delete, diagnostics, download, get, get_application_log, + get_application_status, list, node_run, run, submit) class ClusterOperations(CoreClusterOperations, SparkBaseOperations): @@ -50,5 +50,14 @@ def copy(self, timeout: int = None): return copy.cluster_copy(self, cluster_id, source_path, destination_path, host, internal, timeout) + def download(self, + cluster_id: str, + source_path: str, + destination_path: str = None, + host: bool = False, + internal: bool = False, + timeout: int = None): + return download.cluster_download(self, cluster_id, source_path, destination_path, host, internal, timeout) + def diagnostics(self, cluster_id, output_directory=None): return diagnostics.run_cluster_diagnostics(self, cluster_id, output_directory) From 1d07be931aaa052f79020b59b6544532944af48c Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Thu, 28 Jun 2018 11:08:59 -0700 Subject: [PATCH 12/52] fix joboperations and tests --- aztk/client/base/base_operations.py | 10 +- .../base/helpers/get_application_log.py | 114 ++++++++++++++++++ aztk/spark/client.py | 4 +- .../cluster/helpers/get_application_log.py | 111 +---------------- .../client/job/helpers/get_application_log.py | 2 +- .../client/job/helpers/wait_until_complete.py | 22 ++++ 
aztk/spark/client/job/operations.py | 9 +- .../spark/sdk/job/test_job_new.py | 18 +-- 8 files changed, 165 insertions(+), 125 deletions(-) create mode 100644 aztk/client/base/helpers/get_application_log.py create mode 100644 aztk/spark/client/job/helpers/wait_until_complete.py diff --git a/aztk/client/base/base_operations.py b/aztk/client/base/base_operations.py index 859b84fd..cec5e30c 100644 --- a/aztk/client/base/base_operations.py +++ b/aztk/client/base/base_operations.py @@ -2,10 +2,9 @@ from aztk.internal import cluster_data from aztk.utils import ssh as ssh_lib -from .helpers import (run, create_user_on_node, create_user_on_pool, - delete_user_on_node, delete_user_on_pool, - generate_user_on_node, generate_user_on_pool, - get_remote_login_settings, node_run, ssh_into_node) +from .helpers import (create_user_on_node, create_user_on_pool, delete_user_on_node, delete_user_on_pool, + generate_user_on_node, generate_user_on_pool, get_application_log, get_remote_login_settings, + node_run, run, ssh_into_node) class BaseOperations: @@ -66,3 +65,6 @@ def get_remote_login_settings(self, cluster_id: str, node_id: str): def run(self, cluster_id, command, internal, container_name=None, timeout=None): return run.cluster_run(self, cluster_id, command, internal, container_name, timeout) + + def get_application_log(self, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): + return get_application_log.get_application_log(self, cluster_id, application_name, tail, current_bytes) diff --git a/aztk/client/base/helpers/get_application_log.py b/aztk/client/base/helpers/get_application_log.py new file mode 100644 index 00000000..2a2bd07a --- /dev/null +++ b/aztk/client/base/helpers/get_application_log.py @@ -0,0 +1,114 @@ +import time + +import azure +import azure.batch.models as batch_models +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.spark import helpers, models +from aztk.utils import constants, helpers + +output_file = constants.TASK_WORKING_DIR + \ + "/" + constants.SPARK_SUBMIT_LOGS_FILE + + +def __check_task_node_exist(batch_client, cluster_id: str, task: batch_models.CloudTask) -> bool: + try: + batch_client.compute_node.get(cluster_id, task.node_info.node_id) + return True + except batch_error.BatchErrorException: + return False + + +def __wait_for_app_to_be_running(batch_client, cluster_id: str, application_name: str) -> batch_models.CloudTask: + """ + Wait for the batch task to leave the waiting state into running(or completed if it was fast enough) + """ + while True: + task = batch_client.task.get(cluster_id, application_name) + + if task.state is batch_models.TaskState.active or task.state is batch_models.TaskState.preparing: + # TODO: log + time.sleep(5) + else: + return task + + +def __get_output_file_properties(batch_client, cluster_id: str, application_name: str): + while True: + try: + file = helpers.get_file_properties(cluster_id, application_name, output_file, batch_client) + return file + except batch_error.BatchErrorException as e: + if e.response.status_code == 404: + # TODO: log + time.sleep(5) + continue + else: + raise e + + +def get_log_from_storage(blob_client, container_name, application_name, task): + try: + blob = blob_client.get_blob_to_text(container_name, application_name + '/' + constants.SPARK_SUBMIT_LOGS_FILE) + except azure.common.AzureMissingResourceHttpError: + raise error.AztkError("Logs not found in your storage account. 
They were either deleted or never existed.") + + return models.ApplicationLog( + name=application_name, + cluster_id=container_name, + application_state=task.state._value_, + log=blob.content, + total_bytes=blob.properties.content_length, + exit_code=task.execution_info.exit_code) + + +def get_log(batch_client, blob_client, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): + job_id = cluster_id + task_id = application_name + + task = __wait_for_app_to_be_running(batch_client, cluster_id, application_name) + + if not __check_task_node_exist(batch_client, cluster_id, task): + return get_log_from_storage(blob_client, cluster_id, application_name, task) + + file = __get_output_file_properties(batch_client, cluster_id, application_name) + target_bytes = file.content_length + + if target_bytes != current_bytes: + ocp_range = None + + if tail: + ocp_range = "bytes={0}-{1}".format(current_bytes, target_bytes - 1) + + stream = batch_client.file.get_from_task( + job_id, task_id, output_file, batch_models.FileGetFromTaskOptions(ocp_range=ocp_range)) + content = helpers.read_stream_as_string(stream) + + return models.ApplicationLog( + name=application_name, + cluster_id=cluster_id, + application_state=task.state._value_, + log=content, + total_bytes=target_bytes, + exit_code=task.execution_info.exit_code) + else: + return models.ApplicationLog( + name=application_name, + cluster_id=cluster_id, + application_state=task.state._value_, + log='', + total_bytes=target_bytes, + exit_code=task.execution_info.exit_code) + + +def get_application_log(base_operations, + cluster_id: str, + application_name: str, + tail=False, + current_bytes: int = 0): + try: + return get_log(base_operations.batch_client, base_operations.blob_client, cluster_id, + application_name, tail, current_bytes) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client.py b/aztk/spark/client.py index cb706503..83701469 100644 --- a/aztk/spark/client.py +++ b/aztk/spark/client.py @@ -332,7 +332,7 @@ def stop_job_app(self, job_id, application_name): # NOT IMPLEMENTED except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) - def wait_until_job_finished(self, job_id): # NOT IMPLEMENTED + def wait_until_job_finished(self, job_id): try: job_submit_helper.wait_until_job_finished(self, job_id) except batch_error.BatchErrorException as e: @@ -342,7 +342,7 @@ def wait_until_all_jobs_finished(self, jobs): # NOT IMPLEMENTED for job in jobs: self.wait_until_job_finished(job) - def run_cluster_diagnostics(self, cluster_id, output_directory=None): # NOT IMPLEMENTED + def run_cluster_diagnostics(self, cluster_id, output_directory=None): try: output = cluster_diagnostic_helper.run(self, cluster_id, output_directory) return output diff --git a/aztk/spark/client/cluster/helpers/get_application_log.py b/aztk/spark/client/cluster/helpers/get_application_log.py index 6e60ed24..3cdbaaba 100644 --- a/aztk/spark/client/cluster/helpers/get_application_log.py +++ b/aztk/spark/client/cluster/helpers/get_application_log.py @@ -1,114 +1,11 @@ -import time - -import azure -import azure.batch.models as batch_models -import azure.batch.models.batch_error as batch_error +from azure.batch.models import batch_error from aztk import error -from aztk.spark import helpers, models -from aztk.utils import constants, helpers - -output_file = constants.TASK_WORKING_DIR + \ - "/" + constants.SPARK_SUBMIT_LOGS_FILE - - -def 
__check_task_node_exist(batch_client, cluster_id: str, task: batch_models.CloudTask) -> bool: - try: - batch_client.compute_node.get(cluster_id, task.node_info.node_id) - return True - except batch_error.BatchErrorException: - return False - - -def __wait_for_app_to_be_running(batch_client, cluster_id: str, application_name: str) -> batch_models.CloudTask: - """ - Wait for the batch task to leave the waiting state into running(or completed if it was fast enough) - """ - while True: - task = batch_client.task.get(cluster_id, application_name) - - if task.state is batch_models.TaskState.active or task.state is batch_models.TaskState.preparing: - # TODO: log - time.sleep(5) - else: - return task - - -def __get_output_file_properties(batch_client, cluster_id: str, application_name: str): - while True: - try: - file = helpers.get_file_properties(cluster_id, application_name, output_file, batch_client) - return file - except batch_error.BatchErrorException as e: - if e.response.status_code == 404: - # TODO: log - time.sleep(5) - continue - else: - raise e - - -def get_log_from_storage(blob_client, container_name, application_name, task): - try: - blob = blob_client.get_blob_to_text(container_name, application_name + '/' + constants.SPARK_SUBMIT_LOGS_FILE) - except azure.common.AzureMissingResourceHttpError: - raise error.AztkError("Logs not found in your storage account. They were either deleted or never existed.") - - return models.ApplicationLog( - name=application_name, - cluster_id=container_name, - application_state=task.state._value_, - log=blob.content, - total_bytes=blob.properties.content_length, - exit_code=task.execution_info.exit_code) - - -def get_log(batch_client, blob_client, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): - job_id = cluster_id - task_id = application_name - - task = __wait_for_app_to_be_running(batch_client, cluster_id, application_name) - - if not __check_task_node_exist(batch_client, cluster_id, task): - return get_log_from_storage(blob_client, cluster_id, application_name, task) - - file = __get_output_file_properties(batch_client, cluster_id, application_name) - target_bytes = file.content_length - - if target_bytes != current_bytes: - ocp_range = None - - if tail: - ocp_range = "bytes={0}-{1}".format(current_bytes, target_bytes - 1) - - stream = batch_client.file.get_from_task( - job_id, task_id, output_file, batch_models.FileGetFromTaskOptions(ocp_range=ocp_range)) - content = helpers.read_stream_as_string(stream) - - return models.ApplicationLog( - name=application_name, - cluster_id=cluster_id, - application_state=task.state._value_, - log=content, - total_bytes=target_bytes, - exit_code=task.execution_info.exit_code) - else: - return models.ApplicationLog( - name=application_name, - cluster_id=cluster_id, - application_state=task.state._value_, - log='', - total_bytes=target_bytes, - exit_code=task.execution_info.exit_code) +from aztk.spark import helpers -def get_application_log(spark_cluster_operations, - cluster_id: str, - application_name: str, - tail=False, - current_bytes: int = 0): +def get_application_log(spark_cluster_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): try: - return get_log(spark_cluster_operations.batch_client, spark_cluster_operations.blob_client, cluster_id, - application_name, tail, current_bytes) + return super(type(spark_cluster_operations), spark_cluster_operations).get_application_log(spark_cluster_operations, cluster_id, application_name, tail, 
current_bytes) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/helpers/get_application_log.py b/aztk/spark/client/job/helpers/get_application_log.py index 51dc63ac..af2406cc 100644 --- a/aztk/spark/client/job/helpers/get_application_log.py +++ b/aztk/spark/client/job/helpers/get_application_log.py @@ -29,7 +29,7 @@ def _get_application_log(spark_job_operations, job_id, application_name): batch_models.TaskState.preparing): raise error.AztkError("The application {0} has not yet finished executing.".format(application_name)) - return spark_job_operations.get_application_log(job_id, application_name) + return super(type(spark_job_operations), spark_job_operations).get_application_log(job_id, application_name) def get_job_application_log(spark_job_operations, job_id, application_name): diff --git a/aztk/spark/client/job/helpers/wait_until_complete.py b/aztk/spark/client/job/helpers/wait_until_complete.py new file mode 100644 index 00000000..0f58ea3b --- /dev/null +++ b/aztk/spark/client/job/helpers/wait_until_complete.py @@ -0,0 +1,22 @@ +import time + +import azure.batch.models as batch_models +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.utils import helpers + + +def _wait_until_job_finished(spark_job_operations, job_id): + job_state = spark_job_operations.batch_client.job_schedule.get(job_id).state + + while job_state != batch_models.JobScheduleState.completed: + time.sleep(3) + job_state = spark_job_operations.batch_client.job_schedule.get(job_id).state + + +def wait_until_job_finished(spark_job_operations, job_id): + try: + _wait_until_job_finished(spark_job_operations, job_id) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/operations.py b/aztk/spark/client/job/operations.py index e0841505..4e57267f 100644 --- a/aztk/spark/client/job/operations.py +++ b/aztk/spark/client/job/operations.py @@ -1,10 +1,12 @@ +from aztk.client.job import CoreJobOperations from aztk.spark import models from aztk.spark.client.base import SparkBaseOperations -from .helpers import (delete, get, get_application, get_application_log, list, list_applications, stop, submit) +from .helpers import (delete, get, get_application, get_application_log, list, list_applications, stop, submit, + wait_until_complete) -class JobOperations(SparkBaseOperations): +class JobOperations(CoreJobOperations, SparkBaseOperations): def list(self): return list.list_jobs(self) @@ -28,3 +30,6 @@ def stop(self, id): def submit(self, job_configuration: models.JobConfiguration): return submit.submit_job(self, job_configuration) + + def wait_until_job_finished(self, id): #TODO: rename to something better + wait_until_complete.wait_until_job_finished(self, id) diff --git a/tests/integration_tests/spark/sdk/job/test_job_new.py b/tests/integration_tests/spark/sdk/job/test_job_new.py index 561e58c8..0350c6cb 100644 --- a/tests/integration_tests/spark/sdk/job/test_job_new.py +++ b/tests/integration_tests/spark/sdk/job/test_job_new.py @@ -37,7 +37,7 @@ def test_submit_job(): ) try: job = spark_client.job.submit(job_configuration=job_configuration) - spark_client.wait_until_job_finished(id=job_configuration.id) + spark_client.job.wait_until_job_finished(id=job_configuration.id) assert job.id == job_configuration.id assert job.state is not None @@ -74,7 +74,7 @@ def test_list_jobs(): ) try: 
spark_client.job.submit(job_configuration=job_configuration) - spark_client.wait_until_job_finished(job_configuration.id) + spark_client.job.wait_until_job_finished(job_configuration.id) jobs = spark_client.job.list() @@ -112,9 +112,9 @@ def test_list_applications(): ) try: spark_client.job.submit(job_configuration=job_configuration) - spark_client.wait_until_job_finished(job_configuration.id) + spark_client.job.wait_until_job_finished(job_configuration.id) - applications = spark_client.list_applications(id=job_configuration.id) + applications = spark_client.job.list_applications(id=job_configuration.id) assert applications not in (None, []) assert len(applications) == 2 @@ -153,7 +153,7 @@ def test_get_job(): ) try: spark_client.job.submit(job_configuration=job_configuration) - spark_client.wait_until_job_finished(job_configuration.id) + spark_client.job.wait_until_job_finished(job_configuration.id) job = spark_client.job.get(id=job_configuration.id) assert job.id == job_configuration.id @@ -186,8 +186,8 @@ def test_get_application(): ) try: spark_client.job.submit(job_configuration=job_configuration) - spark_client.wait_until_job_finished(job_configuration.id) - application = spark_client.get_application(id=job_configuration.id, application_name=app1.name) + spark_client.job.wait_until_job_finished(job_configuration.id) + application = spark_client.job.get_application(id=job_configuration.id, application_name=app1.name) assert isinstance(application, aztk.spark.models.Application) assert application.exit_code == 0 assert application.state == "completed" @@ -217,7 +217,7 @@ def test_get_application_log(): ) try: spark_client.job.submit(job_configuration=job_configuration) - spark_client.wait_until_job_finished(job_configuration.id) + spark_client.job.wait_until_job_finished(job_configuration.id) application_log = spark_client.job.get_application_log(id=job_configuration.id, application_name=app1.name) @@ -254,7 +254,7 @@ def test_delete_job(): ) try: spark_client.job.submit(job_configuration=job_configuration) - spark_client.wait_until_job_finished(job_configuration.id) + spark_client.job.wait_until_job_finished(job_configuration.id) spark_client.job.delete(job_configuration.id) assert job_configuration.id not in spark_client.job.list() try: From 8c3b28973d5c04404db9dc0e9e1e5dfe9ba13308 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Mon, 2 Jul 2018 13:40:19 -0700 Subject: [PATCH 13/52] update cli and fix some bugs --- .../base/helpers/get_application_log.py | 2 +- aztk/client/base/helpers/run.py | 14 +- aztk/client/client.py | 426 +++++++++++++++++- aztk/client/cluster/helpers/copy.py | 6 + aztk/spark/client.py | 2 +- aztk/spark/client/client.py | 209 ++++++++- aztk/spark/client/cluster/helpers/copy.py | 9 +- aztk/spark/client/cluster/helpers/create.py | 4 +- .../client/cluster/helpers/create_user.py | 2 +- aztk/spark/client/cluster/helpers/download.py | 2 +- aztk/spark/client/cluster/helpers/get.py | 2 +- .../cluster/helpers/get_application_log.py | 4 +- .../cluster/helpers/get_application_status.py | 2 +- aztk/spark/client/cluster/helpers/list.py | 2 +- aztk/spark/client/cluster/helpers/node_run.py | 2 +- aztk/spark/client/cluster/helpers/run.py | 2 +- .../client/cluster/helpers/ssh_into_master.py | 2 +- aztk/spark/client/cluster/operations.py | 44 +- .../client/job/helpers/stop_application.py | 16 + aztk/spark/client/job/operations.py | 7 +- .../endpoints/cluster/cluster_add_user.py | 4 +- .../endpoints/cluster/cluster_app_logs.py | 2 +- .../spark/endpoints/cluster/cluster_copy.py | 4 
+- .../spark/endpoints/cluster/cluster_create.py | 4 +- .../spark/endpoints/cluster/cluster_debug.py | 2 +- .../spark/endpoints/cluster/cluster_delete.py | 2 +- .../spark/endpoints/cluster/cluster_get.py | 2 +- .../spark/endpoints/cluster/cluster_list.py | 2 +- .../spark/endpoints/cluster/cluster_run.py | 4 +- .../spark/endpoints/cluster/cluster_ssh.py | 2 +- .../spark/endpoints/cluster/cluster_submit.py | 8 +- aztk_cli/spark/endpoints/job/delete.py | 4 +- aztk_cli/spark/endpoints/job/get.py | 2 +- aztk_cli/spark/endpoints/job/get_app.py | 2 +- aztk_cli/spark/endpoints/job/get_app_logs.py | 2 +- aztk_cli/spark/endpoints/job/list.py | 2 +- aztk_cli/spark/endpoints/job/list_apps.py | 2 +- aztk_cli/spark/endpoints/job/stop.py | 2 +- aztk_cli/spark/endpoints/job/stop_app.py | 2 +- aztk_cli/spark/endpoints/job/submit.py | 2 +- aztk_cli/utils.py | 13 +- 41 files changed, 746 insertions(+), 82 deletions(-) create mode 100644 aztk/spark/client/job/helpers/stop_application.py diff --git a/aztk/client/base/helpers/get_application_log.py b/aztk/client/base/helpers/get_application_log.py index 2a2bd07a..b64a6ccc 100644 --- a/aztk/client/base/helpers/get_application_log.py +++ b/aztk/client/base/helpers/get_application_log.py @@ -5,7 +5,7 @@ import azure.batch.models.batch_error as batch_error from aztk import error -from aztk.spark import helpers, models +from aztk.spark import models from aztk.utils import constants, helpers output_file = constants.TASK_WORKING_DIR + \ diff --git a/aztk/client/base/helpers/run.py b/aztk/client/base/helpers/run.py index 9ab7c038..9f409994 100644 --- a/aztk/client/base/helpers/run.py +++ b/aztk/client/base/helpers/run.py @@ -1,7 +1,12 @@ import asyncio +from azure.batch.models import batch_error + import aztk.models as models +from aztk import error from aztk.utils import ssh as ssh_lib +from aztk.utils import helpers + def cluster_run(base_client, cluster_id, command, internal, container_name=None, timeout=None): cluster = base_client.get(cluster_id) @@ -10,9 +15,12 @@ def cluster_run(base_client, cluster_id, command, internal, container_name=None, cluster_nodes = [(node, models.RemoteLogin(ip_address=node.ip_address, port="22")) for node in nodes] else: cluster_nodes = [(node, base_client.get_remote_login_settings(pool.id, node.id)) for node in nodes] - try: generated_username, ssh_key = base_client.generate_user_on_pool(pool.id, nodes) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) + + try: output = asyncio.get_event_loop().run_until_complete( ssh_lib.clus_exec_command( command, @@ -20,9 +28,7 @@ def cluster_run(base_client, cluster_id, command, internal, container_name=None, cluster_nodes, ssh_key=ssh_key.exportKey().decode('utf-8'), container_name=container_name, - timeout=timeout - ) - ) + timeout=timeout)) return output except OSError as exc: raise exc diff --git a/aztk/client/client.py b/aztk/client/client.py index 55b9a11b..cd2cc9ff 100644 --- a/aztk/client/client.py +++ b/aztk/client/client.py @@ -1,11 +1,26 @@ +import asyncio +import concurrent.futures +from datetime import datetime, timedelta, timezone + +import azure.batch.models as batch_models +import azure.batch.models.batch_error as batch_error +from Cryptodome.PublicKey import RSA + +import aztk.error as error import aztk.models as models +import aztk.utils.azure_api as azure_api +import aztk.utils.constants as constants +import aztk.utils.get_ssh_key as get_ssh_key +import aztk.utils.helpers as helpers +import aztk.utils.ssh as ssh_lib from 
aztk.client.cluster import CoreClusterOperations from aztk.client.job import CoreJobOperations -from aztk.utils import azure_api +from aztk.internal import cluster_data +from aztk.utils import deprecated, secure_utils class CoreClient: - def __init__(self, secrets_configuration: models.SecretsConfiguration): + def __init__(self, secrets_configuration: models.SecretsConfiguration): # TODO: accept both secrets_config and secrets_configuration context = self.get_context(secrets_configuration) self.cluster = CoreClusterOperations(context) self.job = CoreJobOperations(context) @@ -22,3 +37,410 @@ def get_context(self, secrets_configuration: models.SecretsConfiguration): 'secrets_configuration': self.secrets_configuration, } return context + + # ALL THE FOLLOWING METHODS ARE DEPRECATED AND WILL BE REMOVED IN 0.10.0 + + def get_cluster_config(self, cluster_id: str) -> models.ClusterConfiguration: + return self._get_cluster_data(cluster_id).read_cluster_config() + + def _get_cluster_data(self, cluster_id: str) -> cluster_data.ClusterData: + """ + Returns ClusterData object to manage data related to the given cluster id + """ + return cluster_data.ClusterData(self.blob_client, cluster_id) + + ''' + General Batch Operations + ''' + + def __delete_pool_and_job(self, pool_id: str, keep_logs: bool = False): + """ + Delete a pool and its associated job + :param pool_id: the id of the pool (and its associated job) to delete + :return bool: True if the pool and/or the job existed and was deleted + """ + # job id is equal to pool id + job_id = pool_id + job_exists = True + + try: + self.batch_client.job.get(job_id) + except batch_models.batch_error.BatchErrorException: + job_exists = False + + pool_exists = self.batch_client.pool.exists(pool_id) + + if job_exists: + self.batch_client.job.delete(job_id) + + if pool_exists: + self.batch_client.pool.delete(pool_id) + + if not keep_logs: + cluster_data = self._get_cluster_data(pool_id) + cluster_data.delete_container(pool_id) + + return job_exists or pool_exists + + def __create_pool_and_job(self, cluster_conf: models.ClusterConfiguration, software_metadata_key: str, start_task, VmImageModel): + """ + Create a pool and job + :param cluster_conf: the configuration object used to create the cluster + :type cluster_conf: aztk.models.ClusterConfiguration + :param software_metadata_key: the id of the software being used on the cluster + :param start_task: the start task for the cluster + :param VmImageModel: the type of image to provision for the cluster + """ + self._get_cluster_data(cluster_conf.cluster_id).save_cluster_config(cluster_conf) + # reuse pool_id as job_id + pool_id = cluster_conf.cluster_id + job_id = cluster_conf.cluster_id + + # Get a verified node agent sku + sku_to_use, image_ref_to_use = \ + helpers.select_latest_verified_vm_image_with_node_agent_sku( + VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku, self.batch_client) + + network_conf = None + if cluster_conf.subnet_id is not None: + network_conf = batch_models.NetworkConfiguration( + subnet_id=cluster_conf.subnet_id) + auto_scale_formula = "$TargetDedicatedNodes={0}; $TargetLowPriorityNodes={1}".format( + cluster_conf.size, cluster_conf.size_low_priority)
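
# A quick illustration of what the format string above renders to (values
# made up; Batch re-evaluates the formula on the interval set just below
# via auto_scale_evaluation_interval):
formula = "$TargetDedicatedNodes={0}; $TargetLowPriorityNodes={1}".format(4, 6)
assert formula == "$TargetDedicatedNodes=4; $TargetLowPriorityNodes=6"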
+ + # Configure the pool + pool = batch_models.PoolAddParameter( + id=pool_id, + virtual_machine_configuration=batch_models.VirtualMachineConfiguration( + image_reference=image_ref_to_use, + node_agent_sku_id=sku_to_use), + vm_size=cluster_conf.vm_size, + enable_auto_scale=True, + auto_scale_formula=auto_scale_formula, + auto_scale_evaluation_interval=timedelta(minutes=5), + start_task=start_task, + enable_inter_node_communication=True if not cluster_conf.subnet_id else False, + max_tasks_per_node=4, + network_configuration=network_conf, + metadata=[ + batch_models.MetadataItem( + name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key), + batch_models.MetadataItem( + name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_CLUSTER_MODE_METADATA) + ]) + + # Create the pool + create user for the pool + helpers.create_pool_if_not_exist(pool, self.batch_client) + + # Create job + job = batch_models.JobAddParameter( + id=job_id, + pool_info=batch_models.PoolInformation(pool_id=pool_id)) + + # Add job to batch + self.batch_client.job.add(job) + + return helpers.get_cluster(cluster_conf.cluster_id, self.batch_client) + + def __get_pool_details(self, cluster_id: str): + """ + Get the pool and its compute nodes for the given cluster + :param cluster_id: Id of the cluster + :return pool: CloudPool, nodes: ComputeNodePaged + """ + pool = self.batch_client.pool.get(cluster_id) + nodes = self.batch_client.compute_node.list(pool_id=cluster_id) + return pool, nodes + + def __list_clusters(self, software_metadata_key): + """ + List all the clusters in your account. + """ + pools = self.batch_client.pool.list() + software_metadata = ( + constants.AZTK_SOFTWARE_METADATA_KEY, software_metadata_key) + cluster_metadata = ( + constants.AZTK_MODE_METADATA_KEY, constants.AZTK_CLUSTER_MODE_METADATA) + + aztk_pools = [] + for pool in [pool for pool in pools if pool.metadata]: + pool_metadata = [(metadata.name, metadata.value) for metadata in pool.metadata] + if all([metadata in pool_metadata for metadata in [software_metadata, cluster_metadata]]): + aztk_pools.append(pool) + return aztk_pools + + def __create_user(self, pool_id: str, node_id: str, username: str, password: str = None, ssh_key: str = None) -> str: + """ + Create a pool user + :param pool: the pool to add the user to + :param node: the node to add the user to + :param username: username of the user to add + :param password: password of the user to add + :param ssh_key: ssh_key of the user to add + """ + # Create new ssh user for the given node + self.batch_client.compute_node.add_user( + pool_id, + node_id, + batch_models.ComputeNodeUser( + name=username, + is_admin=True, + password=password, + ssh_public_key=get_ssh_key.get_user_public_key( + ssh_key, self.secrets_config), + expiry_time=datetime.now(timezone.utc) + timedelta(days=365))) + + def __delete_user(self, pool_id: str, node_id: str, username: str) -> str: + """ + Delete a pool user + :param pool: the pool to delete the user from + :param node: the node to delete the user from + :param username: username of the user to delete + """ + # Delete a user on the given node + self.batch_client.compute_node.delete_user(pool_id, node_id, username) + + def __get_remote_login_settings(self, pool_id: str, node_id: str): + """ + Get the remote_login_settings for node + :param pool_id + :param node_id + :returns aztk.models.RemoteLogin + """ + result = self.batch_client.compute_node.get_remote_login_settings( + pool_id, node_id) + return models.RemoteLogin(ip_address=result.remote_login_ip_address, port=str(result.remote_login_port)) + + def __create_user_on_node(self, username, pool_id, node_id, ssh_key=None, password=None): + try: + self.__create_user(pool_id=pool_id, node_id=node_id, username=username, ssh_key=ssh_key, password=password) + except batch_error.BatchErrorException as error: + try: + self.__delete_user(pool_id, 
node_id, username) + self.__create_user(pool_id=pool_id, node_id=node_id, username=username, ssh_key=ssh_key) + except batch_error.BatchErrorException as error: + raise error + + def __generate_user_on_node(self, pool_id, node_id): + generated_username = secure_utils.generate_random_string() + ssh_key = RSA.generate(2048) + ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8') + self.__create_user_on_node(generated_username, pool_id, node_id, ssh_pub_key) + return generated_username, ssh_key + + def __generate_user_on_pool(self, pool_id, nodes): + generated_username = secure_utils.generate_random_string() + ssh_key = RSA.generate(2048) + ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8') + with concurrent.futures.ThreadPoolExecutor() as executor: + futures = {executor.submit(self.__create_user_on_node, + generated_username, + pool_id, + node.id, + ssh_pub_key): node for node in nodes} + concurrent.futures.wait(futures) + + return generated_username, ssh_key + + def __create_user_on_pool(self, username, pool_id, nodes, ssh_pub_key=None, password=None): + with concurrent.futures.ThreadPoolExecutor() as executor: + futures = {executor.submit(self.__create_user_on_node, + username, + pool_id, + node.id, + ssh_pub_key, + password): node for node in nodes} + concurrent.futures.wait(futures) + + def __delete_user_on_pool(self, username, pool_id, nodes): + with concurrent.futures.ThreadPoolExecutor() as executor: + futures = [executor.submit(self.__delete_user, pool_id, node.id, username) for node in nodes] + concurrent.futures.wait(futures)
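
# A condensed, self-contained sketch of the fan-out pattern used by
# __generate_user_on_pool/__create_user_on_pool above: one throwaway
# username and RSA key pair, pushed to every node through a thread pool.
# (create_user_on_node stands in for the per-node call; the real username
# comes from secure_utils.generate_random_string().)
import concurrent.futures
from Cryptodome.PublicKey import RSA

def generate_temp_user(create_user_on_node, pool_id, nodes):
    ssh_key = RSA.generate(2048)
    ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8')
    username = "aztk-tmp-user"
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(create_user_on_node, username, pool_id, node.id, ssh_pub_key)
            for node in nodes
        ]
        concurrent.futures.wait(futures)
    return username, ssh_key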
+ + def __node_run(self, cluster_id, node_id, command, internal, container_name=None, timeout=None): + pool, nodes = self.__get_pool_details(cluster_id) + try: + node = next(node for node in nodes if node.id == node_id) + except StopIteration: + raise error.AztkError("Node with id {} not found".format(node_id)) + + if internal: + node_rls = models.RemoteLogin(ip_address=node.ip_address, port="22") + else: + node_rls = self.__get_remote_login_settings(pool.id, node.id) + + try: + generated_username, ssh_key = self.__generate_user_on_node(pool.id, node.id) + output = ssh_lib.node_exec_command( + node.id, + command, + generated_username, + node_rls.ip_address, + node_rls.port, + ssh_key=ssh_key.exportKey().decode('utf-8'), + container_name=container_name, + timeout=timeout + ) + return output + finally: + self.__delete_user(cluster_id, node.id, generated_username) + + def __cluster_run(self, cluster_id, command, internal, container_name=None, timeout=None): + pool, nodes = self.__get_pool_details(cluster_id) + nodes = list(nodes) + if internal: + cluster_nodes = [(node, models.RemoteLogin(ip_address=node.ip_address, port="22")) for node in nodes] + else: + cluster_nodes = [(node, self.__get_remote_login_settings(pool.id, node.id)) for node in nodes] + + try: + generated_username, ssh_key = self.__generate_user_on_pool(pool.id, nodes) + output = asyncio.get_event_loop().run_until_complete( + ssh_lib.clus_exec_command( + command, + generated_username, + cluster_nodes, + ssh_key=ssh_key.exportKey().decode('utf-8'), + container_name=container_name, + timeout=timeout + ) + ) + return output + except OSError as exc: + raise exc + finally: + self.__delete_user_on_pool(generated_username, pool.id, nodes) + + def __cluster_copy(self, cluster_id, source_path, destination_path=None, container_name=None, internal=False, get=False, timeout=None): + pool, nodes = self.__get_pool_details(cluster_id) + nodes = list(nodes) + if internal: + cluster_nodes = [(node, models.RemoteLogin(ip_address=node.ip_address, port="22")) for node in nodes] + else: + cluster_nodes = [(node, self.__get_remote_login_settings(pool.id, node.id)) for node in nodes] + + try: + generated_username, ssh_key = self.__generate_user_on_pool(pool.id, nodes) + output = asyncio.get_event_loop().run_until_complete( + ssh_lib.clus_copy( + container_name=container_name, + username=generated_username, + nodes=cluster_nodes, + source_path=source_path, + destination_path=destination_path, + ssh_key=ssh_key.exportKey().decode('utf-8'), + get=get, + timeout=timeout + ) + ) + return output + except (OSError, batch_error.BatchErrorException) as exc: + raise exc + finally: + self.__delete_user_on_pool(generated_username, pool.id, nodes) + + def __ssh_into_node(self, pool_id, node_id, username, ssh_key=None, password=None, port_forward_list=None, internal=False): + if internal: + result = self.batch_client.compute_node.get(pool_id=pool_id, node_id=node_id) + rls = models.RemoteLogin(ip_address=result.ip_address, port="22") + else: + result = self.batch_client.compute_node.get_remote_login_settings(pool_id, node_id) + rls = models.RemoteLogin(ip_address=result.remote_login_ip_address, port=str(result.remote_login_port)) + + ssh_lib.node_ssh( + username=username, + hostname=rls.ip_address, + port=rls.port, + ssh_key=ssh_key, + password=password, + port_forward_list=port_forward_list, + ) + + def __submit_job(self, + job_configuration, + start_task, + job_manager_task, + autoscale_formula, + software_metadata_key: str, + vm_image_model, + application_metadata): + """ + Job Submission + :param job_configuration -> aztk.spark.models.JobConfiguration + :param start_task -> batch_models.StartTask + :param job_manager_task -> batch_models.TaskAddParameter + :param autoscale_formula -> str + :param software_metadata_key -> str + :param vm_image_model -> aztk.models.VmImage + :returns: the created Batch job schedule + """ + self._get_cluster_data(job_configuration.id).save_cluster_config(job_configuration.to_cluster_config()) + + # get a verified node agent sku + sku_to_use, image_ref_to_use = \ + helpers.select_latest_verified_vm_image_with_node_agent_sku( + vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, self.batch_client) + + # set up subnet if necessary + network_conf = None + if job_configuration.subnet_id: + network_conf = batch_models.NetworkConfiguration( + subnet_id=job_configuration.subnet_id) + + # set up a schedule for a recurring job + auto_pool_specification = batch_models.AutoPoolSpecification( + pool_lifetime_option=batch_models.PoolLifetimeOption.job_schedule, + auto_pool_id_prefix=job_configuration.id, + keep_alive=False, + pool=batch_models.PoolSpecification( + display_name=job_configuration.id, + virtual_machine_configuration=batch_models.VirtualMachineConfiguration( + image_reference=image_ref_to_use, + node_agent_sku_id=sku_to_use), + vm_size=job_configuration.vm_size, + enable_auto_scale=True, + auto_scale_formula=autoscale_formula, + auto_scale_evaluation_interval=timedelta(minutes=5), + start_task=start_task, + enable_inter_node_communication=not job_configuration.mixed_mode(), + network_configuration=network_conf, + max_tasks_per_node=4, + metadata=[ + batch_models.MetadataItem( + name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key), + batch_models.MetadataItem( + name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_JOB_MODE_METADATA) + ] + ) + ) + + # define job specification + job_spec = batch_models.JobSpecification( + 
pool_info=batch_models.PoolInformation(auto_pool_specification=auto_pool_specification), + display_name=job_configuration.id, + on_all_tasks_complete=batch_models.OnAllTasksComplete.terminate_job, + job_manager_task=job_manager_task, + metadata=[ + batch_models.MetadataItem( + name='applications', value=application_metadata) + ] + ) + + # define schedule + schedule = batch_models.Schedule( + do_not_run_until=None, + do_not_run_after=None, + start_window=None, + recurrence_interval=None + ) + + # create job schedule and add task + setup = batch_models.JobScheduleAddParameter( + id=job_configuration.id, + schedule=schedule, + job_specification=job_spec) + + self.batch_client.job_schedule.add(setup) + + return self.batch_client.job_schedule.get(job_schedule_id=job_configuration.id) diff --git a/aztk/client/cluster/helpers/copy.py b/aztk/client/cluster/helpers/copy.py index 706ee27e..adf87d02 100644 --- a/aztk/client/cluster/helpers/copy.py +++ b/aztk/client/cluster/helpers/copy.py @@ -3,7 +3,9 @@ import azure.batch.models.batch_error as batch_error import aztk.models as models +from aztk import error from aztk.utils import ssh as ssh_lib +from aztk.utils import helpers def cluster_copy(cluster_operations, cluster_id, source_path, destination_path=None, container_name=None, internal=False, get=False, timeout=None): @@ -16,6 +18,10 @@ def cluster_copy(cluster_operations, cluster_id, source_path, destination_path=N try: generated_username, ssh_key = cluster_operations.generate_user_on_pool(pool.id, nodes) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) + + try: output = asyncio.get_event_loop().run_until_complete( ssh_lib.clus_copy( container_name=container_name, diff --git a/aztk/spark/client.py b/aztk/spark/client.py index 83701469..5693fa4a 100644 --- a/aztk/spark/client.py +++ b/aztk/spark/client.py @@ -4,7 +4,7 @@ import aztk from aztk import error -from aztk.client import Client as BaseClient +from aztk.client import CoreClient as BaseClient from aztk.internal.cluster_data import NodeData from aztk.spark import models from aztk.spark.helpers import create_cluster as create_cluster_helper diff --git a/aztk/spark/client/client.py b/aztk/spark/client/client.py index 7983c751..0e4cd862 100644 --- a/aztk/spark/client/client.py +++ b/aztk/spark/client/client.py @@ -1,9 +1,21 @@ +from typing import List + +import azure.batch.models.batch_error as batch_error + +import aztk +from aztk import error from aztk.client import CoreClient +from aztk.internal.cluster_data import NodeData from aztk.spark import models from aztk.spark.client.cluster import ClusterOperations from aztk.spark.client.job import JobOperations -from aztk.utils import azure_api - +from aztk.spark.helpers import create_cluster as create_cluster_helper +from aztk.spark.helpers import get_log as get_log_helper +from aztk.spark.helpers import job_submission as job_submit_helper +from aztk.spark.helpers import submit as cluster_submit_helper +from aztk.spark.helpers import cluster_diagnostic_helper +from aztk.spark.utils import util +from aztk.utils import azure_api, deprecated, helpers class Client(CoreClient): @@ -24,3 +36,196 @@ def get_context(self, secrets_configuration: models.SecretsConfiguration): 'secrets_configuration': secrets_configuration, } return context + + # ALL THE FOLLOWING METHODS ARE DEPRECATED AND WILL BE REMOVED IN 0.10.0
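
# Every legacy method below is a thin forwarding shim tagged with
# @deprecated("0.10.0"). The decorator itself lives in aztk.utils; its exact
# implementation is not shown in this patch, but a typical sketch of such a
# decorator looks like this:
import functools
import warnings

def deprecated(version):
    def decorator(function):
        @functools.wraps(function)
        def wrapper(*args, **kwargs):
            warnings.warn(
                "{} is deprecated and will be removed in version {}".format(function.__name__, version),
                category=DeprecationWarning,
                stacklevel=2)
            return function(*args, **kwargs)
        return wrapper
    return decorator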
+ + @deprecated("0.10.0") + def create_cluster(self, cluster_conf: models.ClusterConfiguration, wait: bool = False): + return self.cluster.create(cluster_configuration=cluster_conf, wait=wait) + + @deprecated("0.10.0") + def create_clusters_in_parallel(self, cluster_confs): # NOT IMPLEMENTED + for cluster_conf in cluster_confs: + self.cluster.create(cluster_conf) + + @deprecated("0.10.0") + def delete_cluster(self, cluster_id: str, keep_logs: bool = False): + return self.cluster.delete(id=cluster_id, keep_logs=keep_logs) + + @deprecated("0.10.0") + def get_cluster(self, cluster_id: str): + return self.cluster.get(id=cluster_id) + + @deprecated("0.10.0") + def list_clusters(self): + return self.cluster.list() + + @deprecated("0.10.0") + def get_remote_login_settings(self, cluster_id: str, node_id: str): + return self.cluster.get_remote_login_settings(cluster_id, node_id) + + @deprecated("0.10.0") + def submit(self, + cluster_id: str, + application: models.ApplicationConfiguration, + remote: bool = False, + wait: bool = False): + return self.cluster.submit(id=cluster_id, application=application, remote=remote, wait=wait) + + @deprecated("0.10.0") + def submit_all_applications(self, cluster_id: str, applications): # NOT IMPLEMENTED + for application in applications: + self.cluster.submit(cluster_id, application) + + @deprecated("0.10.0") + def wait_until_application_done(self, cluster_id: str, task_id: str): # NOT IMPLEMENTED + try: + helpers.wait_for_task_to_complete(job_id=cluster_id, task_id=task_id, batch_client=self.batch_client) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) + + @deprecated("0.10.0") + def wait_until_applications_done(self, cluster_id: str): # NOT IMPLEMENTED + try: + helpers.wait_for_tasks_to_complete(job_id=cluster_id, batch_client=self.batch_client) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) + + @deprecated("0.10.0") + def wait_until_cluster_is_ready(self, cluster_id: str): # NOT IMPLEMENTED + try: + util.wait_for_master_to_be_ready(self, cluster_id) + pool = self.batch_client.pool.get(cluster_id) + nodes = self.batch_client.compute_node.list(pool_id=cluster_id) + return models.Cluster(pool, nodes) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) + + @deprecated("0.10.0") + def wait_until_all_clusters_are_ready(self, clusters: List[str]): # NOT IMPLEMENTED + for cluster_id in clusters: + self.wait_until_cluster_is_ready(cluster_id) + + @deprecated("0.10.0") + def create_user(self, cluster_id: str, username: str, password: str = None, ssh_key: str = None) -> str: + return self.cluster.create_user(id=cluster_id, username=username, password=password, ssh_key=ssh_key) + + @deprecated("0.10.0") + def get_application_log(self, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): + return self.cluster.get_application_log( + id=cluster_id, application_name=application_name, tail=tail, current_bytes=current_bytes) + + @deprecated("0.10.0") + def get_application_status(self, cluster_id: str, app_name: str): + return self.cluster.get_application_status(id=cluster_id, application_name=app_name) + + @deprecated("0.10.0") + def cluster_run(self, cluster_id: str, command: str, host=False, internal: bool = False, timeout=None): + return self.cluster.run(id=cluster_id, command=command, host=host, internal=internal, timeout=timeout) + + @deprecated("0.10.0") + def node_run(self, cluster_id: str, node_id: str, command: str, host=False, internal: bool = False, timeout=None): + return self.cluster.node_run( + 
id=cluster_id, node_id=node_id, command=command, host=host, internal=internal, timeout=timeout) + + @deprecated("0.10.0") + def cluster_copy(self, + cluster_id: str, + source_path: str, + destination_path: str, + host: bool = False, + internal: bool = False, + timeout: int = None): + return self.cluster.copy( + id=cluster_id, + source_path=source_path, + destination_path=destination_path, + host=host, + internal=internal, + timeout=timeout) + + @deprecated("0.10.0") + def cluster_download(self, + cluster_id: str, + source_path: str, + destination_path: str = None, + host: bool = False, + internal: bool = False, + timeout: int = None): + return self.cluster.download( + id=cluster_id, + source_path=source_path, + destination_path=destination_path, + host=host, + internal=internal, + timeout=timeout) + + @deprecated("0.10.0") + def cluster_ssh_into_master(self, + cluster_id, + node_id, + username, + ssh_key=None, + password=None, + port_forward_list=None, + internal=False): + return self.cluster.ssh_into_node(cluster_id, node_id, username, ssh_key, password, port_forward_list, internal) + + ''' + job submission + ''' + + @deprecated("0.10.0") + def submit_job(self, job_configuration: models.JobConfiguration): + return self.job.submit(job_configuration) + + @deprecated("0.10.0") + def list_jobs(self): + return self.job.list() + + @deprecated("0.10.0") + def list_applications(self, job_id): + return self.job.list_applications(job_id) + + @deprecated("0.10.0") + def get_job(self, job_id): + return self.job.get(job_id) + + @deprecated("0.10.0") + def stop_job(self, job_id): + return self.job.stop(job_id) + + @deprecated("0.10.0") + def delete_job(self, job_id: str, keep_logs: bool = False): + return self.job.delete(job_id, keep_logs) + + @deprecated("0.10.0") + def get_application(self, job_id, application_name): + return self.job.get_application(job_id, application_name) + + @deprecated("0.10.0") + def get_job_application_log(self, job_id, application_name): + return self.job.get_application_log(job_id, application_name) + + @deprecated("0.10.0") + def stop_job_app(self, job_id, application_name): # NOT IMPLEMENTED + try: + return job_submit_helper.stop_app(self, job_id, application_name) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) + + @deprecated("0.10.0") + def wait_until_job_finished(self, job_id): + try: + job_submit_helper.wait_until_job_finished(self, job_id) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) + + @deprecated("0.10.0") + def wait_until_all_jobs_finished(self, jobs): # NOT IMPLEMENTED + for job in jobs: + self.wait_until_job_finished(job) + + @deprecated("0.10.0") + def run_cluster_diagnostics(self, cluster_id, output_directory=None): + return self.cluster.diagnostics(cluster_id, output_directory) diff --git a/aztk/spark/client/cluster/helpers/copy.py b/aztk/spark/client/cluster/helpers/copy.py index 6051637d..1326d933 100644 --- a/aztk/spark/client/cluster/helpers/copy.py +++ b/aztk/spark/client/cluster/helpers/copy.py @@ -7,6 +7,13 @@ def cluster_copy(spark_cluster_operations, cluster_id: str, source_path: str, destination_path: str, host: bool = False, internal: bool = False, timeout: int = None): try: container_name = None if host else 'spark' - return super(type(spark_cluster_operations), spark_cluster_operations).copy(cluster_id, source_path, destination_path=destination_path, container_name=container_name, get=False, internal=internal, timeout=timeout) 
+ return super(type(spark_cluster_operations), spark_cluster_operations).copy( + cluster_id, + source_path, + destination_path=destination_path, + container_name=container_name, + get=False, + internal=internal, + timeout=timeout) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/create.py b/aztk/spark/client/cluster/helpers/create.py index c0887035..16f6c66d 100644 --- a/aztk/spark/client/cluster/helpers/create.py +++ b/aztk/spark/client/cluster/helpers/create.py @@ -1,5 +1,3 @@ -from typing import List - import azure.batch.models as batch_models import azure.batch.models.batch_error as batch_error @@ -7,7 +5,7 @@ from aztk.internal.cluster_data import NodeData from aztk.spark import models from aztk.spark.utils import util -from aztk.utils import constants, helpers +from aztk.utils import helpers POOL_ADMIN_USER_IDENTITY = batch_models.UserIdentity( auto_user=batch_models.AutoUserSpecification( diff --git a/aztk/spark/client/cluster/helpers/create_user.py b/aztk/spark/client/cluster/helpers/create_user.py index c4869325..801b3375 100644 --- a/aztk/spark/client/cluster/helpers/create_user.py +++ b/aztk/spark/client/cluster/helpers/create_user.py @@ -1,7 +1,7 @@ import azure.batch.models.batch_error as batch_error from aztk import error -from aztk.spark import helpers +from aztk.utils import helpers def create_user(spark_cluster_operations, cluster_id: str, username: str, password: str = None, ssh_key: str = None) -> str: diff --git a/aztk/spark/client/cluster/helpers/download.py b/aztk/spark/client/cluster/helpers/download.py index b58b62c6..82bd12c2 100644 --- a/aztk/spark/client/cluster/helpers/download.py +++ b/aztk/spark/client/cluster/helpers/download.py @@ -2,7 +2,7 @@ import azure.batch.models.batch_error as batch_error from aztk import error -from aztk.spark import helpers +from aztk.utils import helpers def cluster_download(spark_cluster_operations, cluster_id: str, source_path: str, destination_path: str = None, host: bool = False, internal: bool = False, timeout: int = None): diff --git a/aztk/spark/client/cluster/helpers/get.py b/aztk/spark/client/cluster/helpers/get.py index 8e0ec5f6..4eae4ba6 100644 --- a/aztk/spark/client/cluster/helpers/get.py +++ b/aztk/spark/client/cluster/helpers/get.py @@ -2,7 +2,7 @@ from aztk import error from aztk.spark import models -from aztk.spark import helpers +from aztk.utils import helpers def get_cluster(spark_cluster_operations, cluster_id: str): diff --git a/aztk/spark/client/cluster/helpers/get_application_log.py b/aztk/spark/client/cluster/helpers/get_application_log.py index 3cdbaaba..9ecaed5c 100644 --- a/aztk/spark/client/cluster/helpers/get_application_log.py +++ b/aztk/spark/client/cluster/helpers/get_application_log.py @@ -1,11 +1,11 @@ from azure.batch.models import batch_error from aztk import error -from aztk.spark import helpers +from aztk.utils import helpers def get_application_log(spark_cluster_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): try: - return super(type(spark_cluster_operations), spark_cluster_operations).get_application_log(spark_cluster_operations, cluster_id, application_name, tail, current_bytes) + return super(type(spark_cluster_operations), spark_cluster_operations).get_application_log(cluster_id, application_name, tail, current_bytes) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e))
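
The copy and get_application_log helpers above both delegate through super(type(spark_cluster_operations), spark_cluster_operations) to reach the core implementation that the spark-level operations class overrides; the helper only has the instance in hand, not a class body to call a bare super() from. (The get_application_log hunk also drops a duplicated first argument: the super() proxy already binds the instance.) A standalone sketch of the pattern, with illustrative names rather than the real aztk classes:

    class CoreOperations:
        def copy(self, cluster_id):
            return "core copy for {}".format(cluster_id)

    class SparkClusterOperations(CoreOperations):
        def copy(self, cluster_id):
            # Spark-level override that delegates to a module-level helper.
            return cluster_copy_helper(self, cluster_id)

    def cluster_copy_helper(spark_cluster_operations, cluster_id):
        # Bypass the override and call the core implementation. Note that
        # type(obj) here would recurse if SparkClusterOperations were ever
        # subclassed further, which is what makes the pattern fragile.
        return super(type(spark_cluster_operations), spark_cluster_operations).copy(cluster_id)

    print(SparkClusterOperations().copy("cluster1"))  # -> core copy for cluster1

diff --git 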
a/aztk/spark/client/cluster/helpers/get_application_status.py b/aztk/spark/client/cluster/helpers/get_application_status.py index 425a8d1d..d8b4ac75 100644 --- a/aztk/spark/client/cluster/helpers/get_application_status.py +++ b/aztk/spark/client/cluster/helpers/get_application_status.py @@ -1,7 +1,7 @@ import azure.batch.models.batch_error as batch_error from aztk import error -from aztk.spark import helpers +from aztk.utils import helpers def get_application_status(spark_cluster_operations, cluster_id: str, app_name: str): diff --git a/aztk/spark/client/cluster/helpers/list.py b/aztk/spark/client/cluster/helpers/list.py index 07fd414b..9e6d3231 100644 --- a/aztk/spark/client/cluster/helpers/list.py +++ b/aztk/spark/client/cluster/helpers/list.py @@ -3,7 +3,7 @@ import aztk.models # TODO: get rid of this import and use aztk.spark.models from aztk import error from aztk.spark import models -from aztk.spark import helpers +from aztk.utils import helpers def list_clusters(spark_cluster_operations): diff --git a/aztk/spark/client/cluster/helpers/node_run.py b/aztk/spark/client/cluster/helpers/node_run.py index 41b719a5..3060f3e1 100644 --- a/aztk/spark/client/cluster/helpers/node_run.py +++ b/aztk/spark/client/cluster/helpers/node_run.py @@ -1,7 +1,7 @@ import azure.batch.models.batch_error as batch_error from aztk import error -from aztk.spark import helpers +from aztk.utils import helpers def node_run(spark_cluster_operations, diff --git a/aztk/spark/client/cluster/helpers/run.py b/aztk/spark/client/cluster/helpers/run.py index 9c21817b..385a6c90 100644 --- a/aztk/spark/client/cluster/helpers/run.py +++ b/aztk/spark/client/cluster/helpers/run.py @@ -1,7 +1,7 @@ import azure.batch.models.batch_error as batch_error from aztk import error -from aztk.spark import helpers +from aztk.utils import helpers def cluster_run(spark_cluster_operations, cluster_id: str, command: str, host=False, internal: bool = False, timeout=None): diff --git a/aztk/spark/client/cluster/helpers/ssh_into_master.py b/aztk/spark/client/cluster/helpers/ssh_into_master.py index 1d3e8670..e0b64d65 100644 --- a/aztk/spark/client/cluster/helpers/ssh_into_master.py +++ b/aztk/spark/client/cluster/helpers/ssh_into_master.py @@ -2,7 +2,7 @@ import azure.batch.models.batch_error as batch_error from aztk import error -from aztk.spark import helpers +from aztk.utils import helpers def cluster_ssh_into_master(spark_cluster_operations, cluster_id, node_id, username, ssh_key=None, password=None, port_forward_list=None, internal=False): diff --git a/aztk/spark/client/cluster/operations.py b/aztk/spark/client/cluster/operations.py index ebc1b56c..efa3da9d 100644 --- a/aztk/spark/client/cluster/operations.py +++ b/aztk/spark/client/cluster/operations.py @@ -10,54 +10,54 @@ class ClusterOperations(CoreClusterOperations, SparkBaseOperations): def create(self, cluster_configuration: models.ClusterConfiguration, wait: bool = False): return create.create_cluster(self, cluster_configuration, wait) - def delete(self, cluster_id: str, keep_logs: bool = False): - return delete.delete_cluster(self, cluster_id, keep_logs) + def delete(self, id: str, keep_logs: bool = False): + return delete.delete_cluster(self, id, keep_logs) - def get(self, cluster_id: str): - return get.get_cluster(self, cluster_id) + def get(self, id: str): + return get.get_cluster(self, id) def list(self): return list.list_clusters(self) def submit(self, - cluster_id: str, + id: str, application: models.ApplicationConfiguration, remote: bool = False, wait: bool = False): - return 
submit.submit(self, cluster_id, application, remote, wait) + return submit.submit(self, id, application, remote, wait) - def create_user(self, cluster_id: str, username: str, password: str = None, ssh_key: str = None): - return create_user.create_user(self, cluster_id, username, ssh_key, password) + def create_user(self, id: str, username: str, password: str = None, ssh_key: str = None): + return create_user.create_user(self, id, username, password, ssh_key) - def get_application_log(self, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): - return get_application_log.get_application_log(self, cluster_id, application_name, tail, current_bytes) + def get_application_log(self, id: str, application_name: str, tail=False, current_bytes: int = 0): + return get_application_log.get_application_log(self, id, application_name, tail, current_bytes) - def get_application_status(self, cluster_id: str, application_name: str): - return get_application_status.get_application_status(self, cluster_id, application_name) + def get_application_status(self, id: str, application_name: str): + return get_application_status.get_application_status(self, id, application_name) - def run(self, cluster_id: str, command: str, host=False, internal: bool = False, timeout=None): - return run.cluster_run(self, cluster_id, command, host, internal, timeout) + def run(self, id: str, command: str, host=False, internal: bool = False, timeout=None): + return run.cluster_run(self, id, command, host, internal, timeout) - def node_run(self, cluster_id: str, node_id: str, command: str, host=False, internal: bool = False, timeout=None): - return node_run.node_run(self, cluster_id, node_id, command, host, internal, timeout) + def node_run(self, id: str, node_id: str, command: str, host=False, internal: bool = False, timeout=None): + return node_run.node_run(self, id, node_id, command, host, internal, timeout) def copy(self, - cluster_id: str, + id: str, source_path: str, destination_path: str, host: bool = False, internal: bool = False, timeout: int = None): - return copy.cluster_copy(self, cluster_id, source_path, destination_path, host, internal, timeout) + return copy.cluster_copy(self, id, source_path, destination_path, host, internal, timeout) def download(self, - cluster_id: str, + id: str, source_path: str, destination_path: str = None, host: bool = False, internal: bool = False, timeout: int = None): - return download.cluster_download(self, cluster_id, source_path, destination_path, host, internal, timeout) + return download.cluster_download(self, id, source_path, destination_path, host, internal, timeout) - def diagnostics(self, cluster_id, output_directory=None): - return diagnostics.run_cluster_diagnostics(self, cluster_id, output_directory) + def diagnostics(self, id, output_directory=None): + return diagnostics.run_cluster_diagnostics(self, id, output_directory) diff --git a/aztk/spark/client/job/helpers/stop_application.py b/aztk/spark/client/job/helpers/stop_application.py new file mode 100644 index 00000000..a55269ac --- /dev/null +++ b/aztk/spark/client/job/helpers/stop_application.py @@ -0,0 +1,16 @@ +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.spark import models +from aztk.utils import helpers +from .get_recent_job import get_recent_job + +def stop_app(spark_job_operations, job_id, application_name): + recent_run_job = get_recent_job(spark_job_operations, job_id) + + # stop batch task + try: + 
spark_job_operations.batch_client.task.terminate(job_id=recent_run_job.id, task_id=application_name) + return True + except batch_error.BatchErrorException: + return False diff --git a/aztk/spark/client/job/operations.py b/aztk/spark/client/job/operations.py index 4e57267f..76e6393b 100644 --- a/aztk/spark/client/job/operations.py +++ b/aztk/spark/client/job/operations.py @@ -2,8 +2,8 @@ from aztk.spark import models from aztk.spark.client.base import SparkBaseOperations -from .helpers import (delete, get, get_application, get_application_log, list, list_applications, stop, submit, - wait_until_complete) +from .helpers import (delete, get, get_application, get_application_log, list, list_applications, stop, + stop_application, submit, wait_until_complete) class JobOperations(CoreJobOperations, SparkBaseOperations): @@ -28,6 +28,9 @@ def list_applications(self, id): def stop(self, id): return stop.stop(self, id) + def stop_application(self, id, application_name): + return stop_application.stop_app(self, id, application_name) + def submit(self, job_configuration: models.JobConfiguration): return submit.submit_job(self, job_configuration) diff --git a/aztk_cli/spark/endpoints/cluster/cluster_add_user.py b/aztk_cli/spark/endpoints/cluster/cluster_add_user.py index 04dedf7a..f0fdb855 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_add_user.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_add_user.py @@ -34,8 +34,8 @@ def execute(args: typing.NamedTuple): ssh_key, password = utils.get_ssh_key_or_prompt(ssh_key, args.username, args.password, spark_client.secrets_config) - spark_client.create_user( - cluster_id=args.cluster_id, + spark_client.cluster.create_user( + id=args.cluster_id, username=args.username, password=password, ssh_key=ssh_key diff --git a/aztk_cli/spark/endpoints/cluster/cluster_app_logs.py b/aztk_cli/spark/endpoints/cluster/cluster_app_logs.py index 27ffa748..fb25cbd4 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_app_logs.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_app_logs.py @@ -30,7 +30,7 @@ def execute(args: typing.NamedTuple): if args.tail: utils.stream_logs(client=spark_client, cluster_id=args.cluster_id, application_name=args.app_name) else: - app_log = spark_client.get_application_log(cluster_id=args.cluster_id, application_name=args.app_name) + app_log = spark_client.cluster.get_application_log(id=args.cluster_id, application_name=args.app_name) if args.output: with utils.Spinner(): with open(os.path.abspath(os.path.expanduser(args.output)), "w", encoding="UTF-8") as f: diff --git a/aztk_cli/spark/endpoints/cluster/cluster_copy.py b/aztk_cli/spark/endpoints/cluster/cluster_copy.py index f5a8fcc3..455ae49e 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_copy.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_copy.py @@ -24,8 +24,8 @@ def setup_parser(parser: argparse.ArgumentParser): def execute(args: typing.NamedTuple): spark_client = aztk.spark.Client(config.load_aztk_secrets()) with utils.Spinner(): - copy_output = spark_client.cluster_copy( - cluster_id=args.cluster_id, + copy_output = spark_client.cluster.copy( + id=args.cluster_id, source_path=args.source_path, destination_path=args.dest_path, internal=args.internal
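
The new stop_application helper above reduces to one Batch call: each application submitted to an aztk job runs as a Batch task whose task id is the application name, inside the job schedule's most recent run, so stopping an application is a task terminate. A standalone sketch of the same logic:

    import azure.batch.models.batch_error as batch_error

    def stop_app(batch_client, recent_job_id, application_name):
        # The task id equals the application name, so terminating the task
        # stops the application.
        try:
            batch_client.task.terminate(job_id=recent_job_id, task_id=application_name)
            return True
        except batch_error.BatchErrorException:
            # Unknown application name, or the task already completed.
            return False

diff --git a/aztk_cli/spark/endpoints/cluster/cluster_create.py b/aztk_cli/spark/endpoints/cluster/cluster_create.py index 63df76a2..eb272bcf 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_create.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_create.py @@ -82,8 +82,8 @@ def execute(args: typing.NamedTuple):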
utils.print_cluster_conf(cluster_conf, wait) with utils.Spinner(): # create spark cluster - cluster = spark_client.create_cluster( - cluster_conf, + cluster = spark_client.cluster.create( + cluster_configuration=cluster_conf, wait=wait ) diff --git a/aztk_cli/spark/endpoints/cluster/cluster_debug.py b/aztk_cli/spark/endpoints/cluster/cluster_debug.py index 7fe3d5d2..21a16c16 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_debug.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_debug.py @@ -22,5 +22,5 @@ def execute(args: typing.NamedTuple): if not args.output: args.output = os.path.join(os.getcwd(), "debug-{0}-{1}".format(args.cluster_id, timestr)) with utils.Spinner(): - spark_client.run_cluster_diagnostics(cluster_id=args.cluster_id, output_directory=args.output) + spark_client.cluster.diagnostics(id=args.cluster_id, output_directory=args.output) # TODO: analyze results, display some info about status diff --git a/aztk_cli/spark/endpoints/cluster/cluster_delete.py b/aztk_cli/spark/endpoints/cluster/cluster_delete.py index 54d40007..48c9b0f5 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_delete.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_delete.py @@ -40,7 +40,7 @@ def execute(args: typing.NamedTuple): log.error("Confirmation cluster id does not match. Please try again.") return - if spark_client.delete_cluster(cluster_id, args.keep_logs): + if spark_client.cluster.delete(id=cluster_id, keep_logs=args.keep_logs): log.info("Deleting cluster %s", cluster_id) else: log.error("Cluster with id '%s' doesn't exist or was already deleted.", cluster_id) diff --git a/aztk_cli/spark/endpoints/cluster/cluster_get.py b/aztk_cli/spark/endpoints/cluster/cluster_get.py index 01393b16..0f2e3cf2 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_get.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_get.py @@ -23,7 +23,7 @@ def setup_parser(parser: argparse.ArgumentParser): def execute(args: typing.NamedTuple): spark_client = aztk.spark.Client(config.load_aztk_secrets()) cluster_id = args.cluster_id - cluster = spark_client.get_cluster(cluster_id) + cluster = spark_client.cluster.get(cluster_id) utils.print_cluster(spark_client, cluster, args.internal) configuration = spark_client.get_cluster_config(cluster_id) diff --git a/aztk_cli/spark/endpoints/cluster/cluster_list.py b/aztk_cli/spark/endpoints/cluster/cluster_list.py index e0965d77..85b42139 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_list.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_list.py @@ -16,7 +16,7 @@ def setup_parser(parser: argparse.ArgumentParser): def execute(args: typing.NamedTuple): spark_client = aztk.spark.Client(config.load_aztk_secrets()) - clusters = spark_client.list_clusters() + clusters = spark_client.cluster.list() if args.quiet: utils.print_clusters_quiet(clusters) else: diff --git a/aztk_cli/spark/endpoints/cluster/cluster_run.py b/aztk_cli/spark/endpoints/cluster/cluster_run.py index 1b066e56..7306e0fd 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_run.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_run.py @@ -27,8 +27,8 @@ def execute(args: typing.NamedTuple): spark_client = aztk.spark.Client(config.load_aztk_secrets()) with utils.Spinner(): if args.node_id: - results = [spark_client.node_run(args.cluster_id, args.node_id, args.command, args.host, args.internal)] + results = [spark_client.cluster.node_run(args.cluster_id, args.node_id, args.command, args.host, args.internal)] else: - results = spark_client.cluster_run(args.cluster_id, args.command, args.host, args.internal) + 
results = spark_client.cluster.run(args.cluster_id, args.command, args.host, args.internal) [utils.log_node_run_output(node_output) for node_output in results] diff --git a/aztk_cli/spark/endpoints/cluster/cluster_ssh.py b/aztk_cli/spark/endpoints/cluster/cluster_ssh.py index ae191805..70b047aa 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_ssh.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_ssh.py @@ -104,7 +104,7 @@ def native_python_ssh_into_master(spark_client, cluster, ssh_conf, password): plugin_ports.extend(ports) print("Press ctrl+c to exit...") - spark_client.cluster_ssh_into_master( + spark_client.cluster.ssh_into_master( cluster.id, cluster.master_node_id, ssh_conf.username, diff --git a/aztk_cli/spark/endpoints/cluster/cluster_submit.py b/aztk_cli/spark/endpoints/cluster/cluster_submit.py index b69ec1e4..388467ba 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_submit.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_submit.py @@ -134,8 +134,8 @@ def execute(args: typing.NamedTuple): log.info("-------------------------------------------") - spark_client.submit( - cluster_id=args.cluster_id, + spark_client.cluster.submit( + id=args.cluster_id, application = aztk.spark.models.ApplicationConfiguration( name=args.name, application=args.app, @@ -162,8 +162,8 @@ def execute(args: typing.NamedTuple): exit_code = utils.stream_logs(client=spark_client, cluster_id=args.cluster_id, application_name=args.name) else: with utils.Spinner(): - spark_client.wait_until_application_done(cluster_id=args.cluster_id, task_id=args.name) - application_log = spark_client.get_application_log(cluster_id=args.cluster_id, application_name=args.name) + spark_client.cluster.wait_until_application_done(cluster_id=args.cluster_id, task_id=args.name) + application_log = spark_client.cluster.get_application_log(id=args.cluster_id, application_name=args.name) with open(os.path.abspath(os.path.expanduser(args.output)), "w", encoding="UTF-8") as f: f.write(application_log.log) exit_code = application_log.exit_code diff --git a/aztk_cli/spark/endpoints/job/delete.py b/aztk_cli/spark/endpoints/job/delete.py index 445b0ad4..8e5bf232 100644 --- a/aztk_cli/spark/endpoints/job/delete.py +++ b/aztk_cli/spark/endpoints/job/delete.py @@ -29,7 +29,7 @@ def execute(args: typing.NamedTuple): if not args.force: # check if job exists before prompting for confirmation - spark_client.get_job(job_id) + spark_client.job.get(id=job_id) if not args.keep_logs: log.warning("All logs persisted for this job will be deleted.") @@ -40,7 +40,7 @@ def execute(args: typing.NamedTuple): log.error("Confirmation cluster id does not match. 
Please try again.") return - if spark_client.delete_job(job_id, args.keep_logs): + if spark_client.job.delete(id=job_id, keep_logs=args.keep_logs): log.info("Deleting Job %s", job_id) else: log.error("Job with id '%s' doesn't exist or was already deleted.", job_id) diff --git a/aztk_cli/spark/endpoints/job/get.py b/aztk_cli/spark/endpoints/job/get.py index 026a3cc7..1d5a0a90 100644 --- a/aztk_cli/spark/endpoints/job/get.py +++ b/aztk_cli/spark/endpoints/job/get.py @@ -16,4 +16,4 @@ def setup_parser(parser: argparse.ArgumentParser): def execute(args: typing.NamedTuple): spark_client = aztk.spark.Client(config.load_aztk_secrets()) - utils.print_job(spark_client, spark_client.get_job(args.job_id)) + utils.print_job(spark_client, spark_client.job.get(id=args.job_id)) diff --git a/aztk_cli/spark/endpoints/job/get_app.py b/aztk_cli/spark/endpoints/job/get_app.py index 1405432c..47b4faf1 100644 --- a/aztk_cli/spark/endpoints/job/get_app.py +++ b/aztk_cli/spark/endpoints/job/get_app.py @@ -20,4 +20,4 @@ def setup_parser(parser: argparse.ArgumentParser): def execute(args: typing.NamedTuple): spark_client = aztk.spark.Client(config.load_aztk_secrets()) - utils.print_application(spark_client.get_application(args.job_id, args.app_name)) + utils.print_application(spark_client.job.get_application(args.job_id, args.app_name)) diff --git a/aztk_cli/spark/endpoints/job/get_app_logs.py b/aztk_cli/spark/endpoints/job/get_app_logs.py index 3981f4f2..06700943 100644 --- a/aztk_cli/spark/endpoints/job/get_app_logs.py +++ b/aztk_cli/spark/endpoints/job/get_app_logs.py @@ -22,7 +22,7 @@ def setup_parser(parser: argparse.ArgumentParser): def execute(args: typing.NamedTuple): spark_client = aztk.spark.Client(config.load_aztk_secrets()) - app_log = spark_client.get_job_application_log(args.job_id, args.app_name) + app_log = spark_client.job.get_application_log(args.job_id, args.app_name) if args.output: with utils.Spinner(): with open(os.path.abspath(os.path.expanduser(args.output)), "w", encoding="UTF-8") as f: diff --git a/aztk_cli/spark/endpoints/job/list.py b/aztk_cli/spark/endpoints/job/list.py index 0be7541b..0c169705 100644 --- a/aztk_cli/spark/endpoints/job/list.py +++ b/aztk_cli/spark/endpoints/job/list.py @@ -13,4 +13,4 @@ def setup_parser(_: argparse.ArgumentParser): def execute(args: typing.NamedTuple): spark_client = aztk.spark.Client(config.load_aztk_secrets()) - utils.print_jobs(spark_client.list_jobs()) + utils.print_jobs(spark_client.job.list()) diff --git a/aztk_cli/spark/endpoints/job/list_apps.py b/aztk_cli/spark/endpoints/job/list_apps.py index 6db5af97..d7dfdd78 100644 --- a/aztk_cli/spark/endpoints/job/list_apps.py +++ b/aztk_cli/spark/endpoints/job/list_apps.py @@ -14,4 +14,4 @@ def setup_parser(parser: argparse.ArgumentParser): def execute(args: typing.NamedTuple): spark_client = aztk.spark.Client(config.load_aztk_secrets()) - utils.print_applications(spark_client.list_applications(args.job_id)) + utils.print_applications(spark_client.job.list_applications(args.job_id)) diff --git a/aztk_cli/spark/endpoints/job/stop.py b/aztk_cli/spark/endpoints/job/stop.py index 9232d106..afdbc644 100644 --- a/aztk_cli/spark/endpoints/job/stop.py +++ b/aztk_cli/spark/endpoints/job/stop.py @@ -15,5 +15,5 @@ def setup_parser(parser: argparse.ArgumentParser): def execute(args: typing.NamedTuple): spark_client = aztk.spark.Client(config.load_aztk_secrets()) - spark_client.stop_job(args.job_id) + spark_client.job.stop(args.job_id) log.print("Stopped Job {0}".format(args.job_id)) diff --git 
a/aztk_cli/spark/endpoints/job/stop_app.py b/aztk_cli/spark/endpoints/job/stop_app.py index da3e297c..4fc316d2 100644 --- a/aztk_cli/spark/endpoints/job/stop_app.py +++ b/aztk_cli/spark/endpoints/job/stop_app.py @@ -20,7 +20,7 @@ def setup_parser(parser: argparse.ArgumentParser): def execute(args: typing.NamedTuple): spark_client = aztk.spark.Client(config.load_aztk_secrets()) - if spark_client.stop_job_app(args.job_id, args.app_name): + if spark_client.job.stop_application(args.job_id, args.app_name): log.info("Stopped app {0}".format(args.app_name)) else: log.error("App with name {0} does not exist or was already deleted") diff --git a/aztk_cli/spark/endpoints/job/submit.py b/aztk_cli/spark/endpoints/job/submit.py index 91c5b768..bc519346 100644 --- a/aztk_cli/spark/endpoints/job/submit.py +++ b/aztk_cli/spark/endpoints/job/submit.py @@ -48,4 +48,4 @@ def execute(args: typing.NamedTuple): ) #TODO: utils.print_job_conf(job_configuration) - spark_client.submit_job(job_configuration) + spark_client.job.submit(job_configuration) diff --git a/aztk_cli/utils.py b/aztk_cli/utils.py index 7841223d..4f27b194 100644 --- a/aztk_cli/utils.py +++ b/aztk_cli/utils.py @@ -61,7 +61,7 @@ def print_cluster(client, cluster: models.Cluster, internal: bool = False): if not cluster.nodes: return for node in cluster.nodes: - remote_login_settings = client.get_remote_login_settings(cluster.id, node.id) + remote_login_settings = client.cluster.get_remote_login_settings(cluster.id, node.id) if internal: ip = node.ip_address else: @@ -130,8 +130,8 @@ def print_clusters_quiet(clusters: List[models.Cluster]): def stream_logs(client, cluster_id, application_name): current_bytes = 0 while True: - app_logs = client.get_application_log( - cluster_id=cluster_id, + app_logs = client.cluster.get_application_log( + id=cluster_id, application_name=application_name, tail=True, current_bytes=current_bytes) @@ -141,6 +141,7 @@ def stream_logs(client, cluster_id, application_name): current_bytes = app_logs.total_bytes time.sleep(3) + def ssh_in_master( client, cluster_id: str, @@ -165,7 +166,7 @@ def ssh_in_master( subprocess.call(["ssh"], stdout=subprocess.PIPE, stderr=subprocess.PIPE) # Get master node id from task (job and task are both named pool_id) - cluster = client.get_cluster(cluster_id) + cluster = client.cluster.get(cluster_id) configuration = client.get_cluster_config(cluster_id) master_node_id = cluster.master_node_id @@ -174,7 +175,7 @@ def ssh_in_master( raise error.ClusterNotReadyError("Master node has not yet been picked!") # get remote login settings for the user - remote_login_settings = client.get_remote_login_settings(cluster.id, master_node_id) + remote_login_settings = client.cluster.get_remote_login_settings(cluster.id, master_node_id) master_internal_node_ip = [node.ip_address for node in cluster.nodes if node.id == master_node_id][0] master_node_ip = remote_login_settings.ip_address master_node_port = remote_login_settings.port @@ -288,7 +289,7 @@ def print_job(client, job: models.Job): if job.applications: application_summary(job.applications) else: - application_summary(client.list_applications(job.id)) + application_summary(client.job.list_applications(job.id)) log.info("") From bd1665386f41747d597a23a59db517173c31db59 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Tue, 3 Jul 2018 15:42:04 -0700 Subject: [PATCH 14/52] start fixes --- aztk/client.py | 472 ------------------ aztk/client/base/base_operations.py | 10 +- .../base/helpers/get_application_log.py | 2 +- aztk/client/client.py | 3 +- 
aztk/client/cluster/operations.py | 1 + aztk/models/__init__.py | 1 + aztk/models/application_log.py | 12 + aztk/spark/client.py | 370 -------------- .../base/helpers/get_application_log.py | 9 + aztk/spark/client/base/operations.py | 5 +- aztk/spark/client/client.py | 2 +- aztk/spark/client/cluster/helpers/copy.py | 2 +- aztk/spark/client/job/helpers/list.py | 5 +- aztk/spark/models/models.py | 10 +- .../endpoints/cluster/cluster_add_user.py | 4 +- .../spark/sdk/cluster/test_cluster_new.py | 22 +- 16 files changed, 56 insertions(+), 874 deletions(-) delete mode 100644 aztk/client.py create mode 100644 aztk/models/application_log.py delete mode 100644 aztk/spark/client.py create mode 100644 aztk/spark/client/base/helpers/get_application_log.py diff --git a/aztk/client.py b/aztk/client.py deleted file mode 100644 index 789a3e28..00000000 --- a/aztk/client.py +++ /dev/null @@ -1,472 +0,0 @@ -import asyncio -import concurrent.futures -from datetime import datetime, timedelta, timezone - -import azure.batch.models as batch_models -import azure.batch.models.batch_error as batch_error -from Cryptodome.PublicKey import RSA - -import aztk.error as error -import aztk.models as models -import aztk.utils.azure_api as azure_api -import aztk.utils.constants as constants -import aztk.utils.get_ssh_key as get_ssh_key -import aztk.utils.helpers as helpers -import aztk.utils.ssh as ssh_lib -from aztk.internal import cluster_data -from aztk.utils import secure_utils - - -class Client: - def __init__(self, secrets_config: models.SecretsConfiguration): - self.secrets_config = secrets_config - - azure_api.validate_secrets(secrets_config) - self.batch_client = azure_api.make_batch_client(secrets_config) - self.blob_client = azure_api.make_blob_client(secrets_config) - - def get_cluster_config(self, cluster_id: str) -> models.ClusterConfiguration: - return self._get_cluster_data(cluster_id).read_cluster_config() - - def _get_cluster_data(self, cluster_id: str) -> cluster_data.ClusterData: - """ - Returns ClusterData object to manage data related to the given cluster id - """ - return cluster_data.ClusterData(self.blob_client, cluster_id) - - ''' - General Batch Operations - ''' - - def __delete_pool_and_job(self, pool_id: str, keep_logs: bool = False): - """ - Delete a pool and it's associated job - :param cluster_id: the pool to add the user to - :return bool: deleted the pool if exists and job if exists - """ - # job id is equal to pool id - job_id = pool_id - job_exists = True - - try: - self.batch_client.job.get(job_id) - except batch_models.batch_error.BatchErrorException: - job_exists = False - - pool_exists = self.batch_client.pool.exists(pool_id) - - if job_exists: - self.batch_client.job.delete(job_id) - - if pool_exists: - self.batch_client.pool.delete(pool_id) - - if not keep_logs: - cluster_data = self._get_cluster_data(pool_id) - cluster_data.delete_container(pool_id) - - return job_exists or pool_exists - - def __create_pool_and_job(self, cluster_conf: models.ClusterConfiguration, software_metadata_key: str, start_task, VmImageModel): - """ - Create a pool and job - :param cluster_conf: the configuration object used to create the cluster - :type cluster_conf: aztk.models.ClusterConfiguration - :parm software_metadata_key: the id of the software being used on the cluster - :param start_task: the start task for the cluster - :param VmImageModel: the type of image to provision for the cluster - :param wait: wait until the cluster is ready - """ - 
self._get_cluster_data(cluster_conf.cluster_id).save_cluster_config(cluster_conf) - # reuse pool_id as job_id - pool_id = cluster_conf.cluster_id - job_id = cluster_conf.cluster_id - - # Get a verified node agent sku - sku_to_use, image_ref_to_use = \ - helpers.select_latest_verified_vm_image_with_node_agent_sku( - VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku, self.batch_client) - - network_conf = None - if cluster_conf.subnet_id is not None: - network_conf = batch_models.NetworkConfiguration( - subnet_id=cluster_conf.subnet_id) - auto_scale_formula = "$TargetDedicatedNodes={0}; $TargetLowPriorityNodes={1}".format( - cluster_conf.size, cluster_conf.size_low_priority) - - # Configure the pool - pool = batch_models.PoolAddParameter( - id=pool_id, - virtual_machine_configuration=batch_models.VirtualMachineConfiguration( - image_reference=image_ref_to_use, - node_agent_sku_id=sku_to_use), - vm_size=cluster_conf.vm_size, - enable_auto_scale=True, - auto_scale_formula=auto_scale_formula, - auto_scale_evaluation_interval=timedelta(minutes=5), - start_task=start_task, - enable_inter_node_communication=True if not cluster_conf.subnet_id else False, - max_tasks_per_node=4, - network_configuration=network_conf, - metadata=[ - batch_models.MetadataItem( - name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key), - batch_models.MetadataItem( - name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_CLUSTER_MODE_METADATA) - ]) - - # Create the pool + create user for the pool - helpers.create_pool_if_not_exist(pool, self.batch_client) - - # Create job - job = batch_models.JobAddParameter( - id=job_id, - pool_info=batch_models.PoolInformation(pool_id=pool_id)) - - # Add job to batch - self.batch_client.job.add(job) - - return helpers.get_cluster(cluster_conf.cluster_id, self.batch_client) - - def __get_pool_details(self, cluster_id: str): - """ - Get the pool and node information for the given cluster - :param cluster_id: Id of the cluster - :return pool: CloudPool, nodes: ComputeNodePaged - """ - pool = self.batch_client.pool.get(cluster_id) - nodes = self.batch_client.compute_node.list(pool_id=cluster_id) - return pool, nodes - - def __list_clusters(self, software_metadata_key): - """ - List all the clusters on your account. 
- """ - pools = self.batch_client.pool.list() - software_metadata = ( - constants.AZTK_SOFTWARE_METADATA_KEY, software_metadata_key) - cluster_metadata = ( - constants.AZTK_MODE_METADATA_KEY, constants.AZTK_CLUSTER_MODE_METADATA) - - aztk_pools = [] - for pool in [pool for pool in pools if pool.metadata]: - pool_metadata = [(metadata.name, metadata.value) for metadata in pool.metadata] - if all([metadata in pool_metadata for metadata in [software_metadata, cluster_metadata]]): - aztk_pools.append(pool) - return aztk_pools - - def __create_user(self, pool_id: str, node_id: str, username: str, password: str = None, ssh_key: str = None) -> str: - """ - Create a pool user - :param pool: the pool to add the user to - :param node: the node to add the user to - :param username: username of the user to add - :param password: password of the user to add - :param ssh_key: ssh_key of the user to add - """ - # Create new ssh user for the given node - self.batch_client.compute_node.add_user( - pool_id, - node_id, - batch_models.ComputeNodeUser( - name=username, - is_admin=True, - password=password, - ssh_public_key=get_ssh_key.get_user_public_key( - ssh_key, self.secrets_config), - expiry_time=datetime.now(timezone.utc) + timedelta(days=365))) - - def __delete_user(self, pool_id: str, node_id: str, username: str) -> str: - """ - Create a pool user - :param pool: the pool to add the user to - :param node: the node to add the user to - :param username: username of the user to add - """ - # Delete a user on the given node - self.batch_client.compute_node.delete_user(pool_id, node_id, username) - - def __get_remote_login_settings(self, pool_id: str, node_id: str): - """ - Get the remote_login_settings for node - :param pool_id - :param node_id - :returns aztk.models.RemoteLogin - """ - result = self.batch_client.compute_node.get_remote_login_settings( - pool_id, node_id) - return models.RemoteLogin(ip_address=result.remote_login_ip_address, port=str(result.remote_login_port)) - - def __create_user_on_node(self, username, pool_id, node_id, ssh_key=None, password=None): - try: - self.__create_user(pool_id=pool_id, node_id=node_id, username=username, ssh_key=ssh_key, password=password) - except batch_error.BatchErrorException as error: - try: - self.__delete_user(pool_id, node_id, username) - self.__create_user(pool_id=pool_id, node_id=node_id, username=username, ssh_key=ssh_key) - except batch_error.BatchErrorException as error: - raise error - - def __generate_user_on_node(self, pool_id, node_id): - generated_username = secure_utils.generate_random_string() - ssh_key = RSA.generate(2048) - ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8') - self.__create_user_on_node(generated_username, pool_id, node_id, ssh_pub_key) - return generated_username, ssh_key - - def __generate_user_on_pool(self, pool_id, nodes): - generated_username = secure_utils.generate_random_string() - ssh_key = RSA.generate(2048) - ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8') - with concurrent.futures.ThreadPoolExecutor() as executor: - futures = {executor.submit(self.__create_user_on_node, - generated_username, - pool_id, - node.id, - ssh_pub_key): node for node in nodes} - concurrent.futures.wait(futures) - - return generated_username, ssh_key - - def __create_user_on_pool(self, username, pool_id, nodes, ssh_pub_key=None, password=None): - with concurrent.futures.ThreadPoolExecutor() as executor: - futures = {executor.submit(self.__create_user_on_node, - username, - pool_id, - node.id, - 
ssh_pub_key, - password): node for node in nodes} - concurrent.futures.wait(futures) - - def __delete_user_on_pool(self, username, pool_id, nodes): - with concurrent.futures.ThreadPoolExecutor() as executor: - futures = [executor.submit(self.__delete_user, pool_id, node.id, username) for node in nodes] - concurrent.futures.wait(futures) - - def __node_run(self, cluster_id, node_id, command, internal, container_name=None, timeout=None): - pool, nodes = self.__get_pool_details(cluster_id) - try: - node = next(node for node in nodes if node.id == node_id) - except StopIteration: - raise error.AztkError("Node with id {} not found".format(node_id)) - - if internal: - node_rls = models.RemoteLogin(ip_address=node.ip_address, port="22") - else: - node_rls = self.__get_remote_login_settings(pool.id, node.id) - - try: - generated_username, ssh_key = self.__generate_user_on_node(pool.id, node.id) - output = ssh_lib.node_exec_command( - node.id, - command, - generated_username, - node_rls.ip_address, - node_rls.port, - ssh_key=ssh_key.exportKey().decode('utf-8'), - container_name=container_name, - timeout=timeout - ) - return output - finally: - self.__delete_user(cluster_id, node.id, generated_username) - - def __cluster_run(self, cluster_id, command, internal, container_name=None, timeout=None): - pool, nodes = self.__get_pool_details(cluster_id) - nodes = list(nodes) - if internal: - cluster_nodes = [(node, models.RemoteLogin(ip_address=node.ip_address, port="22")) for node in nodes] - else: - cluster_nodes = [(node, self.__get_remote_login_settings(pool.id, node.id)) for node in nodes] - - try: - generated_username, ssh_key = self.__generate_user_on_pool(pool.id, nodes) - output = asyncio.get_event_loop().run_until_complete( - ssh_lib.clus_exec_command( - command, - generated_username, - cluster_nodes, - ssh_key=ssh_key.exportKey().decode('utf-8'), - container_name=container_name, - timeout=timeout - ) - ) - return output - except OSError as exc: - raise exc - finally: - self.__delete_user_on_pool(generated_username, pool.id, nodes) - - def __cluster_copy(self, cluster_id, source_path, destination_path=None, container_name=None, internal=False, get=False, timeout=None): - pool, nodes = self.__get_pool_details(cluster_id) - nodes = list(nodes) - if internal: - cluster_nodes = [(node, models.RemoteLogin(ip_address=node.ip_address, port="22")) for node in nodes] - else: - cluster_nodes = [(node, self.__get_remote_login_settings(pool.id, node.id)) for node in nodes] - - try: - generated_username, ssh_key = self.__generate_user_on_pool(pool.id, nodes) - output = asyncio.get_event_loop().run_until_complete( - ssh_lib.clus_copy( - container_name=container_name, - username=generated_username, - nodes=cluster_nodes, - source_path=source_path, - destination_path=destination_path, - ssh_key=ssh_key.exportKey().decode('utf-8'), - get=get, - timeout=timeout - ) - ) - return output - except (OSError, batch_error.BatchErrorException) as exc: - raise exc - finally: - self.__delete_user_on_pool(generated_username, pool.id, nodes) - - def __ssh_into_node(self, pool_id, node_id, username, ssh_key=None, password=None, port_forward_list=None, internal=False): - if internal: - result = self.batch_client.compute_node.get(pool_id=pool_id, node_id=node_id) - rls = models.RemoteLogin(ip_address=result.ip_address, port="22") - else: - result = self.batch_client.compute_node.get_remote_login_settings(pool_id, node_id) - rls = models.RemoteLogin(ip_address=result.remote_login_ip_address, port=str(result.remote_login_port)) - 
- ssh_lib.node_ssh( - username=username, - hostname=rls.ip_address, - port=rls.port, - ssh_key=ssh_key, - password=password, - port_forward_list=port_forward_list, - ) - - def __submit_job(self, - job_configuration, - start_task, - job_manager_task, - autoscale_formula, - software_metadata_key: str, - vm_image_model, - application_metadata): - """ - Job Submission - :param job_configuration -> aztk.spark.models.JobConfiguration - :param start_task -> batch_models.StartTask - :param job_manager_task -> batch_models.TaskAddParameter - :param autoscale_formula -> str - :param software_metadata_key -> str - :param vm_image_model -> aztk.models.VmImage - :returns: the newly created Batch job schedule - """ - self._get_cluster_data(job_configuration.id).save_cluster_config(job_configuration.to_cluster_config()) - - # get a verified node agent sku - sku_to_use, image_ref_to_use = \ - helpers.select_latest_verified_vm_image_with_node_agent_sku( - vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, self.batch_client) - - # set up subnet if necessary - network_conf = None - if job_configuration.subnet_id: - network_conf = batch_models.NetworkConfiguration( - subnet_id=job_configuration.subnet_id) - - # set up a schedule for a recurring job - auto_pool_specification = batch_models.AutoPoolSpecification( - pool_lifetime_option=batch_models.PoolLifetimeOption.job_schedule, - auto_pool_id_prefix=job_configuration.id, - keep_alive=False, - pool=batch_models.PoolSpecification( - display_name=job_configuration.id, - virtual_machine_configuration=batch_models.VirtualMachineConfiguration( - image_reference=image_ref_to_use, - node_agent_sku_id=sku_to_use), - vm_size=job_configuration.vm_size, - enable_auto_scale=True, - auto_scale_formula=autoscale_formula, - auto_scale_evaluation_interval=timedelta(minutes=5), - start_task=start_task, - enable_inter_node_communication=not job_configuration.mixed_mode(), - network_configuration=network_conf, - max_tasks_per_node=4, - metadata=[ - batch_models.MetadataItem( - name=constants.AZTK_SOFTWARE_METADATA_KEY, value=software_metadata_key), - batch_models.MetadataItem( - name=constants.AZTK_MODE_METADATA_KEY, value=constants.AZTK_JOB_MODE_METADATA) - ] - ) - ) - - # define job specification - job_spec = batch_models.JobSpecification( - pool_info=batch_models.PoolInformation(auto_pool_specification=auto_pool_specification), - display_name=job_configuration.id, - on_all_tasks_complete=batch_models.OnAllTasksComplete.terminate_job, - job_manager_task=job_manager_task, - metadata=[ - batch_models.MetadataItem( - name='applications', value=application_metadata) - ] - ) - - # define schedule - schedule = batch_models.Schedule( - do_not_run_until=None, - do_not_run_after=None, - start_window=None, - recurrence_interval=None - ) - - # create job schedule and add task - setup = batch_models.JobScheduleAddParameter( - id=job_configuration.id, - schedule=schedule, - job_specification=job_spec) - - self.batch_client.job_schedule.add(setup) - - return self.batch_client.job_schedule.get(job_schedule_id=job_configuration.id) - - - ''' - Define Public Interface - ''' - - def create_cluster(self, cluster_conf, wait: bool = False): - raise NotImplementedError() - - def create_clusters_in_parallel(self, cluster_confs): - raise NotImplementedError() - - def delete_cluster(self, cluster_id: str): - raise NotImplementedError() - - def get_cluster(self, cluster_id: str): - raise NotImplementedError() - - def list_clusters(self): - raise NotImplementedError() - - def wait_until_cluster_is_ready(self, 
cluster_id): - raise NotImplementedError() - - def create_user(self, cluster_id: str, username: str, password: str = None, ssh_key: str = None) -> str: - raise NotImplementedError() - - def get_remote_login_settings(self, cluster_id, node_id): - raise NotImplementedError() - - def cluster_run(self, cluster_id, command): - raise NotImplementedError() - - def cluster_copy(self, cluster_id, source_path, destination_path): - raise NotImplementedError() - - def cluster_download(self, cluster_id, source_path, destination_path): - raise NotImplementedError() - - def submit_job(self, job): - raise NotImplementedError() diff --git a/aztk/client/base/base_operations.py b/aztk/client/base/base_operations.py index cec5e30c..c09e1c88 100644 --- a/aztk/client/base/base_operations.py +++ b/aztk/client/base/base_operations.py @@ -1,10 +1,12 @@ -import aztk.models as models +from aztk import models from aztk.internal import cluster_data from aztk.utils import ssh as ssh_lib -from .helpers import (create_user_on_node, create_user_on_pool, delete_user_on_node, delete_user_on_pool, - generate_user_on_node, generate_user_on_pool, get_application_log, get_remote_login_settings, - node_run, run, ssh_into_node) +from .helpers import (create_user_on_node, create_user_on_pool, + delete_user_on_node, delete_user_on_pool, + generate_user_on_node, generate_user_on_pool, + get_application_log, get_remote_login_settings, node_run, + run, ssh_into_node) class BaseOperations: diff --git a/aztk/client/base/helpers/get_application_log.py b/aztk/client/base/helpers/get_application_log.py index b64a6ccc..46a70b5b 100644 --- a/aztk/client/base/helpers/get_application_log.py +++ b/aztk/client/base/helpers/get_application_log.py @@ -5,7 +5,7 @@ import azure.batch.models.batch_error as batch_error from aztk import error -from aztk.spark import models +from aztk import models from aztk.utils import constants, helpers output_file = constants.TASK_WORKING_DIR + \ diff --git a/aztk/client/client.py b/aztk/client/client.py index cd2cc9ff..6b4985ff 100644 --- a/aztk/client/client.py +++ b/aztk/client/client.py @@ -21,6 +21,7 @@ class CoreClient: def __init__(self, secrets_configuration: models.SecretsConfiguration): # make accept secrets_config and secrets_configuration + self.secrets_configuration = secrets_configuration context = self.get_context(secrets_configuration) self.cluster = CoreClusterOperations(context) self.job = CoreJobOperations(context) @@ -188,7 +189,7 @@ def __create_user(self, pool_id: str, node_id: str, username: str, password: str is_admin=True, password=password, ssh_public_key=get_ssh_key.get_user_public_key( - ssh_key, self.secrets_config), + ssh_key, self.secrets_configuration), expiry_time=datetime.now(timezone.utc) + timedelta(days=365))) def __delete_user(self, pool_id: str, node_id: str, username: str) -> str: diff --git a/aztk/client/cluster/operations.py b/aztk/client/cluster/operations.py index 87a2b9d6..afaaeec2 100644 --- a/aztk/client/cluster/operations.py +++ b/aztk/client/cluster/operations.py @@ -1,4 +1,5 @@ from aztk.client.base import BaseOperations +from aztk.client.base.base_operations import BaseOperations from aztk.models import ClusterConfiguration from .helpers import copy, create, delete, get, list diff --git a/aztk/models/__init__.py b/aztk/models/__init__.py index 77c06f62..a77fd027 100644 --- a/aztk/models/__init__.py +++ b/aztk/models/__init__.py @@ -18,4 +18,5 @@ from .cluster import Cluster from .scheduling_target import SchedulingTarget from .port_forward_specification import 
PortForwardingSpecification +from .application_log import ApplicationLog from .plugins import * diff --git a/aztk/models/application_log.py b/aztk/models/application_log.py new file mode 100644 index 00000000..58c215ec --- /dev/null +++ b/aztk/models/application_log.py @@ -0,0 +1,12 @@ +import azure.batch.models as batch_models + + +class ApplicationLog(): + def __init__(self, name: str, cluster_id: str, log: str, total_bytes: int, + application_state: batch_models.TaskState, exit_code: int): + self.name = name + self.cluster_id = cluster_id # TODO: change to something cluster/job agnostic + self.log = log + self.total_bytes = total_bytes + self.application_state = application_state + self.exit_code = exit_code diff --git a/aztk/spark/client.py b/aztk/spark/client.py deleted file mode 100644 index 5693fa4a..00000000 --- a/aztk/spark/client.py +++ /dev/null @@ -1,370 +0,0 @@ -from typing import List - -import azure.batch.models.batch_error as batch_error - -import aztk -from aztk import error -from aztk.client import CoreClient as BaseClient -from aztk.internal.cluster_data import NodeData -from aztk.spark import models -from aztk.spark.helpers import create_cluster as create_cluster_helper -from aztk.spark.helpers import get_log as get_log_helper -from aztk.spark.helpers import job_submission as job_submit_helper -from aztk.spark.helpers import submit as cluster_submit_helper -from aztk.spark.helpers import cluster_diagnostic_helper -from aztk.spark.utils import util -from aztk.utils import helpers - - -class Client(BaseClient): - """ - Aztk Spark Client - This is the main entry point for using aztk for spark - - Args: - secrets_config(aztk.spark.models.models.SecretsConfiguration): Configuration with all the needed credentials - """ - - def create_cluster(self, cluster_conf: models.ClusterConfiguration, wait: bool = False): - """ - Create a new aztk spark cluster - - Args: - cluster_conf(aztk.spark.models.models.ClusterConfiguration): Configuration for the cluster to be created - wait(bool): If you should wait for the cluster to be ready before returning - - Returns: - aztk.spark.models.Cluster - """ - cluster_conf = _apply_default_for_cluster_config(cluster_conf) - cluster_conf.validate() - - cluster_data = self._get_cluster_data(cluster_conf.cluster_id) - try: - zip_resource_files = None - node_data = NodeData(cluster_conf).add_core().done() - zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file() - - start_task = create_cluster_helper.generate_cluster_start_task(self, - zip_resource_files, - cluster_conf.cluster_id, - cluster_conf.gpu_enabled(), - cluster_conf.get_docker_repo(), - cluster_conf.file_shares, - cluster_conf.plugins, - cluster_conf.mixed_mode(), - cluster_conf.worker_on_master) - - software_metadata_key = "spark" - - vm_image = models.VmImage( - publisher='Canonical', - offer='UbuntuServer', - sku='16.04') - - cluster = self.__create_pool_and_job( - cluster_conf, software_metadata_key, start_task, vm_image) - - # Wait for the master to be ready - if wait: - util.wait_for_master_to_be_ready(self, cluster.id) - cluster = self.get_cluster(cluster.id) - - return cluster - - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - def create_clusters_in_parallel(self, cluster_confs): # NOT IMPLEMENTED - for cluster_conf in cluster_confs: - self.create_cluster(cluster_conf) - - def delete_cluster(self, cluster_id: str, keep_logs: bool = False): - try: - return self.__delete_pool_and_job(cluster_id, 
keep_logs) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - def get_cluster(self, cluster_id: str): - try: - pool, nodes = self.__get_pool_details(cluster_id) - return models.Cluster(pool, nodes) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - def list_clusters(self): - try: - return [models.Cluster(pool) for pool in self.__list_clusters(aztk.models.Software.spark)] - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - def get_remote_login_settings(self, cluster_id: str, node_id: str): - try: - return self.__get_remote_login_settings(cluster_id, node_id) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - def submit(self, cluster_id: str, application: models.ApplicationConfiguration, remote: bool = False, wait: bool = False): - try: - cluster_submit_helper.submit_application(self, cluster_id, application, remote, wait) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - - def submit_all_applications(self, cluster_id: str, applications): # NOT IMPLEMENTED - for application in applications: - self.submit(cluster_id, application) - - - def wait_until_application_done(self, cluster_id: str, task_id: str): # NOT IMPLEMENTED - try: - helpers.wait_for_task_to_complete(job_id=cluster_id, task_id=task_id, batch_client=self.batch_client) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - def wait_until_applications_done(self, cluster_id: str): # NOT IMPLEMENTED - try: - helpers.wait_for_tasks_to_complete(job_id=cluster_id, batch_client=self.batch_client) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - def wait_until_cluster_is_ready(self, cluster_id: str): # NOT IMPLEMENTED - try: - util.wait_for_master_to_be_ready(self, cluster_id) - pool = self.batch_client.pool.get(cluster_id) - nodes = self.batch_client.compute_node.list(pool_id=cluster_id) - return models.Cluster(pool, nodes) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - def wait_until_all_clusters_are_ready(self, clusters: List[str]): # NOT IMPLEMENTED - for cluster_id in clusters: - self.wait_until_cluster_is_ready(cluster_id) - - - def create_user(self, cluster_id: str, username: str, password: str = None, ssh_key: str = None) -> str: - try: - cluster = self.get_cluster(cluster_id) - master_node_id = cluster.master_node_id - if not master_node_id: - raise error.ClusterNotReadyError("The master has not yet been picked, a user cannot be added.") - self.__create_user_on_pool(username, cluster.id, cluster.nodes, ssh_key, password) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - - def get_application_log(self, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): - try: - return get_log_helper.get_log(self.batch_client, self.blob_client, - cluster_id, application_name, tail, current_bytes) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - - def get_application_status(self, cluster_id: str, app_name: str): - try: - task = self.batch_client.task.get(cluster_id, app_name) - return task.state._value_ - except 
batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - - def cluster_run(self, cluster_id: str, command: str, host=False, internal: bool = False, timeout=None): - try: - return self.__cluster_run(cluster_id, - command, - internal, - container_name='spark' if not host else None, - timeout=timeout) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - def node_run(self, cluster_id: str, node_id: str, command: str, host=False, internal: bool = False, timeout=None): - try: - return self.__node_run(cluster_id, - node_id, - command, - internal, - container_name='spark' if not host else None, - timeout=timeout) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - -########################################### CURRENT PROGRESS ##################################################### - - - def cluster_copy(self, cluster_id: str, source_path: str, destination_path: str, host: bool = False, internal: bool = False, timeout: int = None): - try: - container_name = None if host else 'spark' - return self.__cluster_copy(cluster_id, - source_path, - destination_path=destination_path, - container_name=container_name, - get=False, - internal=internal, - timeout=timeout) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - def cluster_download(self, cluster_id: str, source_path: str, destination_path: str = None, host: bool = False, internal: bool = False, timeout: int = None): - try: - container_name = None if host else 'spark' - return self.__cluster_copy(cluster_id, - source_path, - destination_path=destination_path, - container_name=container_name, - get=True, - internal=internal, - timeout=timeout) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - def cluster_ssh_into_master(self, cluster_id, node_id, username, ssh_key=None, password=None, port_forward_list=None, internal=False): - try: - self.__ssh_into_node(cluster_id, node_id, username, ssh_key, password, port_forward_list, internal) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - ''' - job submission - ''' - def submit_job(self, job_configuration: models.JobConfiguration): - try: - job_configuration = _apply_default_for_job_config(job_configuration) - job_configuration.validate() - cluster_data = self._get_cluster_data(job_configuration.id) - node_data = NodeData(job_configuration.to_cluster_config()).add_core().done() - zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file() - - start_task = create_cluster_helper.generate_cluster_start_task(self, - zip_resource_files, - job_configuration.id, - job_configuration.gpu_enabled, - job_configuration.get_docker_repo(), - mixed_mode=job_configuration.mixed_mode(), - worker_on_master=job_configuration.worker_on_master) - - application_tasks = [] - for application in job_configuration.applications: - application_tasks.append( - (application, cluster_submit_helper.generate_task(self, job_configuration.id, application)) - ) - - job_manager_task = job_submit_helper.generate_task(self, job_configuration, application_tasks) - - - software_metadata_key = "spark" - - vm_image = models.VmImage( - publisher='Canonical', - offer='UbuntuServer', - sku='16.04') - - autoscale_formula = "$TargetDedicatedNodes = {0}; " \ - "$TargetLowPriorityNodes = {1}".format( - 
job_configuration.max_dedicated_nodes, - job_configuration.max_low_pri_nodes) - - job = self.__submit_job( - job_configuration=job_configuration, - start_task=start_task, - job_manager_task=job_manager_task, - autoscale_formula=autoscale_formula, - software_metadata_key=software_metadata_key, - vm_image_model=vm_image, - application_metadata='\n'.join(application.name for application in (job_configuration.applications or []))) - - return models.Job(job) - - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - def list_jobs(self): - try: - return [models.Job(cloud_job_schedule) for cloud_job_schedule in job_submit_helper.list_jobs(self)] - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - def list_applications(self, job_id): - try: - applications = job_submit_helper.list_applications(self, job_id) - for item in applications: - if applications[item]: - applications[item] = models.Application(applications[item]) - return applications - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - def get_job(self, job_id): - try: - job, apps, pool, nodes = job_submit_helper.get_job(self, job_id) - return models.Job(job, apps, pool, nodes) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - def stop_job(self, job_id): - try: - return job_submit_helper.stop(self, job_id) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - def delete_job(self, job_id: str, keep_logs: bool = False): - try: - return job_submit_helper.delete(self, job_id, keep_logs) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - def get_application(self, job_id, application_name): - try: - return models.Application(job_submit_helper.get_application(self, job_id, application_name)) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - def get_job_application_log(self, job_id, application_name): - try: - return job_submit_helper.get_application_log(self, job_id, application_name) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - def stop_job_app(self, job_id, application_name): # NOT IMPLEMENTED - try: - return job_submit_helper.stop_app(self, job_id, application_name) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - def wait_until_job_finished(self, job_id): - try: - job_submit_helper.wait_until_job_finished(self, job_id) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - def wait_until_all_jobs_finished(self, jobs): # NOT IMPLEMENTED - for job in jobs: - self.wait_until_job_finished(job) - - def run_cluster_diagnostics(self, cluster_id, output_directory=None): - try: - output = cluster_diagnostic_helper.run(self, cluster_id, output_directory) - return output - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) - - -def _default_scheduling_target(vm_count: int): - if vm_count == 0: - return models.SchedulingTarget.Any - else: - return models.SchedulingTarget.Dedicated - -def _apply_default_for_cluster_config(configuration: models.ClusterConfiguration): - cluster_conf = models.ClusterConfiguration() - 
cluster_conf.merge(configuration) - if cluster_conf.scheduling_target is None: - cluster_conf.scheduling_target = _default_scheduling_target(cluster_conf.size) - return cluster_conf - -def _apply_default_for_job_config(job_conf: models.JobConfiguration): - if job_conf.scheduling_target is None: - job_conf.scheduling_target = _default_scheduling_target(job_conf.max_dedicated_nodes) - - return job_conf diff --git a/aztk/spark/client/base/helpers/get_application_log.py b/aztk/spark/client/base/helpers/get_application_log.py new file mode 100644 index 00000000..f17fcc75 --- /dev/null +++ b/aztk/spark/client/base/helpers/get_application_log.py @@ -0,0 +1,9 @@ +from aztk.spark import models + + +def get_application_log(base_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): + base_application_log = super(type(base_operations), base_operations).cluster.get_application_log( + cluster_id, application_name, tail, current_bytes) + return models.ApplicationLog(base_application_log.name, base_application_log.cluster_id, base_application_log.log, + base_application_log.total_bytes, base_application_log.application_state, + base_application_log.exit_code) diff --git a/aztk/spark/client/base/operations.py b/aztk/spark/client/base/operations.py index 31894792..64b3bcea 100644 --- a/aztk/spark/client/base/operations.py +++ b/aztk/spark/client/base/operations.py @@ -5,7 +5,7 @@ from aztk.client.base import BaseOperations as CoreBaseOperations from aztk.spark import models -from .helpers import generate_cluster_start_task, generate_application_task +from .helpers import generate_cluster_start_task, generate_application_task, get_application_log class SparkBaseOperations(CoreBaseOperations): @@ -24,3 +24,6 @@ def generate_cluster_start_task(self, def generate_application_task(self, container_id, application, remote=False): return generate_application_task.generate_application_task(self, container_id, application, remote) + + def get_application_log(self, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): + return get_application_log.get_application_log(self, cluster_id, application_name, tail, current_bytes) diff --git a/aztk/spark/client/client.py b/aztk/spark/client/client.py index 0e4cd862..0cc16d9b 100644 --- a/aztk/spark/client/client.py +++ b/aztk/spark/client/client.py @@ -46,7 +46,7 @@ def create_cluster(self, cluster_conf: models.ClusterConfiguration, wait: bool = @deprecated("0.10.0") def create_clusters_in_parallel(self, cluster_confs): # NOT IMPLEMENTED for cluster_conf in cluster_confs: - self.cluster.create_cluster(cluster_conf) + self.cluster.create(cluster_conf) @deprecated("0.10.0") def delete_cluster(self, cluster_id: str, keep_logs: bool = False): diff --git a/aztk/spark/client/cluster/helpers/copy.py b/aztk/spark/client/cluster/helpers/copy.py index 1326d933..e2658b6f 100644 --- a/aztk/spark/client/cluster/helpers/copy.py +++ b/aztk/spark/client/cluster/helpers/copy.py @@ -1,7 +1,7 @@ import azure.batch.models.batch_error as batch_error from aztk import error -from aztk.spark import helpers +from aztk.utils import helpers def cluster_copy(spark_cluster_operations, cluster_id: str, source_path: str, destination_path: str, host: bool = False, internal: bool = False, timeout: int = None): diff --git a/aztk/spark/client/job/helpers/list.py b/aztk/spark/client/job/helpers/list.py index 907949e6..068e4057 100644 --- a/aztk/spark/client/job/helpers/list.py +++ b/aztk/spark/client/job/helpers/list.py @@ -1,8 +1,9 @@ import 
azure.batch.models.batch_error as batch_error -import aztk.models # TODO: get rid of this import and use aztk.spark.models +import aztk.models # TODO: get rid of this import and use aztk.spark.models from aztk import error -from aztk.spark import helpers, models +from aztk.spark import models +from aztk.utils import helpers def _list_jobs(spark_job_operations): diff --git a/aztk/spark/models/models.py b/aztk/spark/models/models.py index a7e18233..c1ae1105 100644 --- a/aztk/spark/models/models.py +++ b/aztk/spark/models/models.py @@ -291,11 +291,5 @@ def __init__(self, cloud_job_schedule: batch_models.CloudJobSchedule, self.cluster = None -class ApplicationLog(): - def __init__(self, name: str, cluster_id: str, log: str, total_bytes: int, application_state: batch_models.TaskState, exit_code: int): - self.name = name - self.cluster_id = cluster_id # TODO: change to something cluster/job agnostic - self.log = log - self.total_bytes = total_bytes - self.application_state = application_state - self.exit_code = exit_code +class ApplicationLog(aztk.models.ApplicationLog): + pass \ No newline at end of file diff --git a/aztk_cli/spark/endpoints/cluster/cluster_add_user.py b/aztk_cli/spark/endpoints/cluster/cluster_add_user.py index f0fdb855..c59bdd5a 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_add_user.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_add_user.py @@ -30,9 +30,9 @@ def execute(args: typing.NamedTuple): if args.ssh_key: ssh_key = args.ssh_key else: - ssh_key = spark_client.secrets_config.ssh_pub_key + ssh_key = spark_client.secrets_configuration.ssh_pub_key - ssh_key, password = utils.get_ssh_key_or_prompt(ssh_key, args.username, args.password, spark_client.secrets_config) + ssh_key, password = utils.get_ssh_key_or_prompt(ssh_key, args.username, args.password, spark_client.secrets_configuration) spark_client.cluster.create_user( id=args.cluster_id, diff --git a/tests/integration_tests/spark/sdk/cluster/test_cluster_new.py b/tests/integration_tests/spark/sdk/cluster/test_cluster_new.py index 4d20a448..47f09277 100644 --- a/tests/integration_tests/spark/sdk/cluster/test_cluster_new.py +++ b/tests/integration_tests/spark/sdk/cluster/test_cluster_new.py @@ -20,7 +20,7 @@ def clean_up_cluster(cluster_id): try: - spark_client.cluster.delete(cluster_id=cluster_id) + spark_client.cluster.delete(id=cluster_id) except (BatchErrorException, AztkError): # pass in the event that the cluster does not exist pass @@ -134,8 +134,8 @@ def test_get_remote_login_settings(): spark_configuration=None) try: spark_client.cluster.create(cluster_configuration, wait=True) - cluster = spark_client.cluster.get(cluster_id=cluster_configuration.cluster_id) - rls = spark_client.cluster.get_remote_login_settings(cluster_id=cluster.id, node_id=cluster.master_node_id) + cluster = spark_client.cluster.get(id=cluster_configuration.cluster_id) + rls = spark_client.cluster.get_remote_login_settings(id=cluster.id, node_id=cluster.master_node_id) assert rls.ip_address is not None assert rls.port is not None @@ -179,7 +179,7 @@ def test_submit(): spark_client.cluster.create(cluster_configuration, wait=True) spark_client.cluster.submit( - cluster_id=cluster_configuration.cluster_id, application=application_configuration, wait=True) + id=cluster_configuration.cluster_id, application=application_configuration, wait=True) assert True except (AztkError, BatchErrorException): @@ -220,9 +220,9 @@ def test_get_application_log(): spark_client.cluster.create(cluster_configuration, wait=True) spark_client.cluster.submit( - 
cluster_id=cluster_configuration.cluster_id, application=application_configuration, wait=True) + id=cluster_configuration.cluster_id, application=application_configuration, wait=True) application_log = spark_client.cluster.get_application_log( - cluster_id=cluster_configuration.cluster_id, + id=cluster_configuration.cluster_id, application_name=application_configuration.name, tail=False, current_bytes=0) @@ -281,9 +281,9 @@ def test_get_application_status_complete(): spark_client.cluster.create(cluster_configuration, wait=True) spark_client.cluster.submit( - cluster_id=cluster_configuration.cluster_id, application=application_configuration, wait=True) + id=cluster_configuration.cluster_id, application=application_configuration, wait=True) status = spark_client.cluster.get_application_status( - cluster_id=cluster_configuration.cluster_id, application_name=application_configuration.name) + id=cluster_configuration.cluster_id, application_name=application_configuration.name) assert status == "completed" @@ -309,7 +309,7 @@ def test_delete_cluster(): try: spark_client.cluster.create(cluster_configuration, wait=True) - success = spark_client.cluster.delete(cluster_id=cluster_configuration.cluster_id) + success = spark_client.cluster.delete(id=cluster_configuration.cluster_id) assert success is True @@ -336,7 +336,7 @@ def test_spark_processes_up(): try: cluster = spark_client.cluster.create(cluster_configuration, wait=True) wait_for_all_nodes(cluster.id, cluster.nodes) - success = spark_client.cluster.delete(cluster_id=cluster_configuration.cluster_id) + success = spark_client.cluster.delete(id=cluster_configuration.cluster_id) assert success is True @@ -367,7 +367,7 @@ def test_debug_tool(): cluster = spark_client.cluster.create(cluster_configuration, wait=True) nodes = [node for node in cluster.nodes] wait_for_all_nodes(cluster.id, nodes) - cluster_output = spark_client.cluster.diagnostics(cluster_id=cluster.id) + cluster_output = spark_client.cluster.diagnostics(id=cluster.id) for node_output in cluster_output: node_output.output.seek(0) # tempfile requires seek 0 before reading debug_zip = ZipFile(node_output.output) From adb0e0a1ae55a207f714544e7221ff8f1f845133 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Mon, 9 Jul 2018 14:16:29 -0700 Subject: [PATCH 15/52] fix pylint errors, bugs --- aztk/client/base/base_operations.py | 4 ++-- .../client/base/helpers/get_application_log.py | 4 ++-- aztk/spark/client/base/operations.py | 4 ++-- aztk/spark/client/cluster/helpers/create.py | 12 ++++++------ .../client/cluster/helpers/get_application_log.py | 11 ----------- aztk/spark/client/cluster/operations.py | 13 +++---------- .../spark/sdk/cluster/test_cluster_new.py | 2 ++ 7 files changed, 17 insertions(+), 33 deletions(-) delete mode 100644 aztk/spark/client/cluster/helpers/get_application_log.py diff --git a/aztk/client/base/base_operations.py b/aztk/client/base/base_operations.py index c09e1c88..827d852b 100644 --- a/aztk/client/base/base_operations.py +++ b/aztk/client/base/base_operations.py @@ -62,8 +62,8 @@ def delete_user_on_pool(self, username, pool_id, nodes): #TODO: change from p def node_run(self, cluster_id, node_id, command, internal, container_name=None, timeout=None): return node_run.node_run(self, cluster_id, node_id, command, internal, container_name, timeout) - def get_remote_login_settings(self, cluster_id: str, node_id: str): - return get_remote_login_settings.get_remote_login_settings(self, cluster_id, node_id) + def get_remote_login_settings(self, id: str, node_id: str): + 
return get_remote_login_settings.get_remote_login_settings(self, id, node_id) def run(self, cluster_id, command, internal, container_name=None, timeout=None): return run.cluster_run(self, cluster_id, command, internal, container_name, timeout) diff --git a/aztk/spark/client/base/helpers/get_application_log.py b/aztk/spark/client/base/helpers/get_application_log.py index f17fcc75..9065a292 100644 --- a/aztk/spark/client/base/helpers/get_application_log.py +++ b/aztk/spark/client/base/helpers/get_application_log.py @@ -1,8 +1,8 @@ from aztk.spark import models -def get_application_log(base_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): - base_application_log = super(type(base_operations), base_operations).cluster.get_application_log( +def get_application_log(super_type, spark_base_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): + base_application_log = super(super_type, spark_base_operations).get_application_log( cluster_id, application_name, tail, current_bytes) return models.ApplicationLog(base_application_log.name, base_application_log.cluster_id, base_application_log.log, base_application_log.total_bytes, base_application_log.application_state, diff --git a/aztk/spark/client/base/operations.py b/aztk/spark/client/base/operations.py index 64b3bcea..47c72cff 100644 --- a/aztk/spark/client/base/operations.py +++ b/aztk/spark/client/base/operations.py @@ -25,5 +25,5 @@ def generate_cluster_start_task(self, def generate_application_task(self, container_id, application, remote=False): return generate_application_task.generate_application_task(self, container_id, application, remote) - def get_application_log(self, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): - return get_application_log.get_application_log(self, cluster_id, application_name, tail, current_bytes) + def get_application_log(self, id: str, application_name: str, tail=False, current_bytes: int = 0): + return get_application_log.get_application_log(SparkBaseOperations, self, id, application_name, tail, current_bytes) diff --git a/aztk/spark/client/cluster/helpers/create.py b/aztk/spark/client/cluster/helpers/create.py index 16f6c66d..e941f97d 100644 --- a/aztk/spark/client/cluster/helpers/create.py +++ b/aztk/spark/client/cluster/helpers/create.py @@ -26,7 +26,7 @@ def _apply_default_for_cluster_config(configuration: models.ClusterConfiguration return cluster_conf -def create_cluster(spark_cluster_client, cluster_conf: models.ClusterConfiguration, wait: bool = False): +def create_cluster(spark_cluster_operations, cluster_conf: models.ClusterConfiguration, wait: bool = False): """ Create a new aztk spark cluster @@ -40,13 +40,13 @@ def create_cluster(spark_cluster_client, cluster_conf: models.ClusterConfigurati cluster_conf = _apply_default_for_cluster_config(cluster_conf) cluster_conf.validate() - cluster_data = spark_cluster_client.get_cluster_data(cluster_conf.cluster_id) + cluster_data = spark_cluster_operations.get_cluster_data(cluster_conf.cluster_id) try: zip_resource_files = None node_data = NodeData(cluster_conf).add_core().done() zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file() - start_task = spark_cluster_client.generate_cluster_start_task(zip_resource_files, cluster_conf.cluster_id, + start_task = spark_cluster_operations.generate_cluster_start_task(zip_resource_files, cluster_conf.cluster_id, cluster_conf.gpu_enabled(), cluster_conf.get_docker_repo(), cluster_conf.file_shares, 
cluster_conf.plugins, cluster_conf.mixed_mode(), cluster_conf.worker_on_master) @@ -55,12 +55,12 @@ def create_cluster(spark_cluster_client, cluster_conf: models.ClusterConfigurati vm_image = models.VmImage(publisher='Canonical', offer='UbuntuServer', sku='16.04') - cluster = super(type(spark_cluster_client), spark_cluster_client).create(cluster_conf, software_metadata_key, start_task, vm_image) + cluster = super(type(spark_cluster_operations), spark_cluster_operations).create(cluster_conf, software_metadata_key, start_task, vm_image) # Wait for the master to be ready if wait: - util.wait_for_master_to_be_ready(spark_cluster_client, cluster.id) - cluster = spark_cluster_client.get(cluster.id) + util.wait_for_master_to_be_ready(spark_cluster_operations, cluster.id) + cluster = spark_cluster_operations.get(cluster.id) return cluster diff --git a/aztk/spark/client/cluster/helpers/get_application_log.py b/aztk/spark/client/cluster/helpers/get_application_log.py deleted file mode 100644 index 9ecaed5c..00000000 --- a/aztk/spark/client/cluster/helpers/get_application_log.py +++ /dev/null @@ -1,11 +0,0 @@ -from azure.batch.models import batch_error - -from aztk import error -from aztk.utils import helpers - - -def get_application_log(spark_cluster_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): - try: - return super(type(spark_cluster_operations), spark_cluster_operations).get_application_log(cluster_id, application_name, tail, current_bytes) - except batch_error.BatchErrorException as e: - raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/operations.py b/aztk/spark/client/cluster/operations.py index efa3da9d..b4fe168c 100644 --- a/aztk/spark/client/cluster/operations.py +++ b/aztk/spark/client/cluster/operations.py @@ -2,8 +2,8 @@ from aztk.spark import models from aztk.spark.client.base import SparkBaseOperations -from .helpers import (copy, create, create_user, delete, diagnostics, download, get, get_application_log, - get_application_status, list, node_run, run, submit) +from .helpers import (copy, create, create_user, delete, diagnostics, download, + get, get_application_status, list, node_run, run, submit) class ClusterOperations(CoreClusterOperations, SparkBaseOperations): @@ -19,19 +19,12 @@ def get(self, id: str): def list(self): return list.list_clusters(self) - def submit(self, - id: str, - application: models.ApplicationConfiguration, - remote: bool = False, - wait: bool = False): + def submit(self, id: str, application: models.ApplicationConfiguration, remote: bool = False, wait: bool = False): return submit.submit(self, id, application, remote, wait) def create_user(self, id: str, username: str, password: str = None, ssh_key: str = None): return create_user.create_user(self, id, username, ssh_key, password) - def get_application_log(self, id: str, application_name: str, tail=False, current_bytes: int = 0): - return get_application_log.get_application_log(self, id, application_name, tail, current_bytes) - def get_application_status(self, id: str, application_name: str): return get_application_status.get_application_status(self, id, application_name) diff --git a/tests/integration_tests/spark/sdk/cluster/test_cluster_new.py b/tests/integration_tests/spark/sdk/cluster/test_cluster_new.py index 47f09277..1f329dd9 100644 --- a/tests/integration_tests/spark/sdk/cluster/test_cluster_new.py +++ b/tests/integration_tests/spark/sdk/cluster/test_cluster_new.py @@ -55,6 +55,8 @@ def 
ensure_spark_processes(cluster_id): def wait_for_all_nodes(cluster_id, nodes): while True: for node in nodes: + if node.state in [batch_models.ComputeNodeState.unusable, batch_models.ComputeNodeState.start_task_failed]: + raise AztkError("Node {} in failed state.".format(node.id)) if node.state not in [batch_models.ComputeNodeState.idle, batch_models.ComputeNodeState.running]: break else: From 6830614353990a0fd9a76e8ddce0cb52d0e1cea0 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Mon, 9 Jul 2018 15:24:41 -0700 Subject: [PATCH 16/52] add deprecated warning checks, rename tests --- .../spark/sdk/cluster/test_cluster.py | 93 +++++-------- ...ster_new.py => test_cluster_deprecated.py} | 130 +++++++++++++----- .../spark/sdk/job/test_job.py | 46 +++---- ...test_job_new.py => test_job_deprecated.py} | 71 ++++++---- 4 files changed, 195 insertions(+), 145 deletions(-) rename tests/integration_tests/spark/sdk/cluster/{test_cluster_new.py => test_cluster_deprecated.py} (70%) rename tests/integration_tests/spark/sdk/job/{test_job_new.py => test_job_deprecated.py} (73%) diff --git a/tests/integration_tests/spark/sdk/cluster/test_cluster.py b/tests/integration_tests/spark/sdk/cluster/test_cluster.py index adff6e8e..1f329dd9 100644 --- a/tests/integration_tests/spark/sdk/cluster/test_cluster.py +++ b/tests/integration_tests/spark/sdk/cluster/test_cluster.py @@ -14,21 +14,20 @@ from aztk_cli import config from tests.integration_tests.spark.sdk.get_client import get_spark_client, get_test_suffix - base_cluster_id = get_test_suffix("cluster") spark_client = get_spark_client() def clean_up_cluster(cluster_id): try: - spark_client.delete_cluster(cluster_id=cluster_id) + spark_client.cluster.delete(id=cluster_id) except (BatchErrorException, AztkError): # pass in the event that the cluster does not exist pass def ensure_spark_master(cluster_id): - results = spark_client.cluster_run(cluster_id, + results = spark_client.cluster.run(cluster_id, "if $AZTK_IS_MASTER ; then $SPARK_HOME/sbin/spark-daemon.sh status org.apache.spark.deploy.master.Master 1 ;" \ " else echo AZTK_IS_MASTER is false ; fi") for _, result in results: @@ -39,7 +38,7 @@ def ensure_spark_master(cluster_id): def ensure_spark_worker(cluster_id): - results = spark_client.cluster_run(cluster_id, + results = spark_client.cluster.run(cluster_id, "if $AZTK_IS_WORKER ; then $SPARK_HOME/sbin/spark-daemon.sh status org.apache.spark.deploy.worker.Worker 1 ;" \ " else echo AZTK_IS_WORKER is false ; fi") for _, result in results: @@ -56,10 +55,12 @@ def ensure_spark_processes(cluster_id): def wait_for_all_nodes(cluster_id, nodes): while True: for node in nodes: + if node.state in [batch_models.ComputeNodeState.unusable, batch_models.ComputeNodeState.start_task_failed]: + raise AztkError("Node {} in failed state.".format(node.id)) if node.state not in [batch_models.ComputeNodeState.idle, batch_models.ComputeNodeState.running]: break else: - nodes = spark_client.get_cluster(cluster_id).nodes + nodes = spark_client.cluster.get(cluster_id).nodes continue break @@ -78,39 +79,7 @@ def test_create_cluster(): toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), spark_configuration=None) try: - cluster = spark_client.create_cluster(cluster_configuration, wait=True) - - assert cluster.pool is not None - assert cluster.nodes is not None - assert cluster.id == cluster_configuration.cluster_id - assert cluster.vm_size == "standard_f2" - assert cluster.current_dedicated_nodes == 2 - assert cluster.gpu_enabled is False - assert cluster.master_node_id is not None - 
assert cluster.current_low_pri_nodes == 0 - - except (AztkError, BatchErrorException) as e: - assert False - - finally: - clean_up_cluster(cluster_configuration.cluster_id) - - -def test_get_cluster(): - test_id = "test-get-" - cluster_configuration = aztk.spark.models.ClusterConfiguration( - cluster_id=test_id + base_cluster_id, - vm_count=2, - vm_low_pri_count=0, - vm_size="standard_f2", - subnet_id=None, - custom_scripts=None, - file_shares=None, - toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), - spark_configuration=None) - try: - spark_client.create_cluster(cluster_configuration, wait=True) - cluster = spark_client.get_cluster(cluster_id=cluster_configuration.cluster_id) + cluster = spark_client.cluster.create(cluster_configuration, wait=True) assert cluster.pool is not None assert cluster.nodes is not None @@ -141,8 +110,8 @@ def test_list_clusters(): toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), spark_configuration=None) try: - spark_client.create_cluster(cluster_configuration, wait=True) - clusters = spark_client.list_clusters() + spark_client.cluster.create(cluster_configuration, wait=True) + clusters = spark_client.cluster.list() assert cluster_configuration.cluster_id in [cluster.id for cluster in clusters] @@ -166,9 +135,9 @@ def test_get_remote_login_settings(): toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), spark_configuration=None) try: - spark_client.create_cluster(cluster_configuration, wait=True) - cluster = spark_client.get_cluster(cluster_id=cluster_configuration.cluster_id) - rls = spark_client.get_remote_login_settings(cluster_id=cluster.id, node_id=cluster.master_node_id) + spark_client.cluster.create(cluster_configuration, wait=True) + cluster = spark_client.cluster.get(id=cluster_configuration.cluster_id) + rls = spark_client.cluster.get_remote_login_settings(id=cluster.id, node_id=cluster.master_node_id) assert rls.ip_address is not None assert rls.port is not None @@ -209,10 +178,10 @@ def test_submit(): executor_cores=None, max_retry_count=None) try: - spark_client.create_cluster(cluster_configuration, wait=True) + spark_client.cluster.create(cluster_configuration, wait=True) - spark_client.submit( - cluster_id=cluster_configuration.cluster_id, application=application_configuration, wait=True) + spark_client.cluster.submit( + id=cluster_configuration.cluster_id, application=application_configuration, wait=True) assert True except (AztkError, BatchErrorException): @@ -250,12 +219,12 @@ def test_get_application_log(): executor_cores=None, max_retry_count=None) try: - spark_client.create_cluster(cluster_configuration, wait=True) + spark_client.cluster.create(cluster_configuration, wait=True) - spark_client.submit( - cluster_id=cluster_configuration.cluster_id, application=application_configuration, wait=True) - application_log = spark_client.get_application_log( - cluster_id=cluster_configuration.cluster_id, + spark_client.cluster.submit( + id=cluster_configuration.cluster_id, application=application_configuration, wait=True) + application_log = spark_client.cluster.get_application_log( + id=cluster_configuration.cluster_id, application_name=application_configuration.name, tail=False, current_bytes=0) @@ -311,12 +280,12 @@ def test_get_application_status_complete(): executor_cores=None, max_retry_count=None) try: - spark_client.create_cluster(cluster_configuration, wait=True) + spark_client.cluster.create(cluster_configuration, wait=True) - spark_client.submit( - cluster_id=cluster_configuration.cluster_id, 
application=application_configuration, wait=True) - status = spark_client.get_application_status( - cluster_id=cluster_configuration.cluster_id, app_name=application_configuration.name) + spark_client.cluster.submit( + id=cluster_configuration.cluster_id, application=application_configuration, wait=True) + status = spark_client.cluster.get_application_status( + id=cluster_configuration.cluster_id, application_name=application_configuration.name) assert status == "completed" @@ -341,8 +310,8 @@ def test_delete_cluster(): spark_configuration=None) try: - spark_client.create_cluster(cluster_configuration, wait=True) - success = spark_client.delete_cluster(cluster_id=cluster_configuration.cluster_id) + spark_client.cluster.create(cluster_configuration, wait=True) + success = spark_client.cluster.delete(id=cluster_configuration.cluster_id) assert success is True @@ -367,9 +336,9 @@ def test_spark_processes_up(): spark_configuration=None) try: - cluster = spark_client.create_cluster(cluster_configuration, wait=True) + cluster = spark_client.cluster.create(cluster_configuration, wait=True) wait_for_all_nodes(cluster.id, cluster.nodes) - success = spark_client.delete_cluster(cluster_id=cluster_configuration.cluster_id) + success = spark_client.cluster.delete(id=cluster_configuration.cluster_id) assert success is True @@ -397,10 +366,10 @@ def test_debug_tool(): "spark/logs", "spark/wd" ] try: - cluster = spark_client.create_cluster(cluster_configuration, wait=True) + cluster = spark_client.cluster.create(cluster_configuration, wait=True) nodes = [node for node in cluster.nodes] wait_for_all_nodes(cluster.id, nodes) - cluster_output = spark_client.run_cluster_diagnostics(cluster_id=cluster.id) + cluster_output = spark_client.cluster.diagnostics(id=cluster.id) for node_output in cluster_output: node_output.output.seek(0) # tempfile requires seek 0 before reading debug_zip = ZipFile(node_output.output) diff --git a/tests/integration_tests/spark/sdk/cluster/test_cluster_new.py b/tests/integration_tests/spark/sdk/cluster/test_cluster_deprecated.py similarity index 70% rename from tests/integration_tests/spark/sdk/cluster/test_cluster_new.py rename to tests/integration_tests/spark/sdk/cluster/test_cluster_deprecated.py index 1f329dd9..d1ad53ea 100644 --- a/tests/integration_tests/spark/sdk/cluster/test_cluster_new.py +++ b/tests/integration_tests/spark/sdk/cluster/test_cluster_deprecated.py @@ -14,20 +14,21 @@ from aztk_cli import config from tests.integration_tests.spark.sdk.get_client import get_spark_client, get_test_suffix + base_cluster_id = get_test_suffix("cluster") spark_client = get_spark_client() def clean_up_cluster(cluster_id): try: - spark_client.cluster.delete(id=cluster_id) + spark_client.delete_cluster(cluster_id=cluster_id) except (BatchErrorException, AztkError): # pass in the event that the cluster does not exist pass def ensure_spark_master(cluster_id): - results = spark_client.cluster.run(cluster_id, + results = spark_client.cluster_run(cluster_id, "if $AZTK_IS_MASTER ; then $SPARK_HOME/sbin/spark-daemon.sh status org.apache.spark.deploy.master.Master 1 ;" \ " else echo AZTK_IS_MASTER is false ; fi") for _, result in results: @@ -38,7 +39,7 @@ def ensure_spark_master(cluster_id): def ensure_spark_worker(cluster_id): - results = spark_client.cluster.run(cluster_id, + results = spark_client.cluster_run(cluster_id, "if $AZTK_IS_WORKER ; then $SPARK_HOME/sbin/spark-daemon.sh status org.apache.spark.deploy.worker.Worker 1 ;" \ " else echo AZTK_IS_WORKER is false ; fi") for _, result in 
results: @@ -55,12 +56,10 @@ def ensure_spark_processes(cluster_id): def wait_for_all_nodes(cluster_id, nodes): while True: for node in nodes: - if node.state in [batch_models.ComputeNodeState.unusable, batch_models.ComputeNodeState.start_task_failed]: - raise AztkError("Node {} in failed state.".format(node.id)) if node.state not in [batch_models.ComputeNodeState.idle, batch_models.ComputeNodeState.running]: break else: - nodes = spark_client.cluster.get(cluster_id).nodes + nodes = spark_client.get_cluster(cluster_id).nodes continue break @@ -79,7 +78,42 @@ def test_create_cluster(): toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), spark_configuration=None) try: - cluster = spark_client.cluster.create(cluster_configuration, wait=True) + with pytest.warns(DeprecationWarning): + cluster = spark_client.create_cluster(cluster_configuration, wait=True) + + assert cluster.pool is not None + assert cluster.nodes is not None + assert cluster.id == cluster_configuration.cluster_id + assert cluster.vm_size == "standard_f2" + assert cluster.current_dedicated_nodes == 2 + assert cluster.gpu_enabled is False + assert cluster.master_node_id is not None + assert cluster.current_low_pri_nodes == 0 + + except (AztkError, BatchErrorException) as e: + assert False + + finally: + clean_up_cluster(cluster_configuration.cluster_id) + + +def test_get_cluster(): + test_id = "test-get-" + cluster_configuration = aztk.spark.models.ClusterConfiguration( + cluster_id=test_id + base_cluster_id, + vm_count=2, + vm_low_pri_count=0, + vm_size="standard_f2", + subnet_id=None, + custom_scripts=None, + file_shares=None, + toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), + spark_configuration=None) + try: + with pytest.warns(DeprecationWarning): + spark_client.create_cluster(cluster_configuration, wait=True) + with pytest.warns(DeprecationWarning): + cluster = spark_client.get_cluster(cluster_id=cluster_configuration.cluster_id) assert cluster.pool is not None assert cluster.nodes is not None @@ -110,8 +144,10 @@ def test_list_clusters(): toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), spark_configuration=None) try: - spark_client.cluster.create(cluster_configuration, wait=True) - clusters = spark_client.cluster.list() + with pytest.warns(DeprecationWarning): + spark_client.create_cluster(cluster_configuration, wait=True) + with pytest.warns(DeprecationWarning): + clusters = spark_client.list_clusters() assert cluster_configuration.cluster_id in [cluster.id for cluster in clusters] @@ -135,9 +171,12 @@ def test_get_remote_login_settings(): toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), spark_configuration=None) try: - spark_client.cluster.create(cluster_configuration, wait=True) - cluster = spark_client.cluster.get(id=cluster_configuration.cluster_id) - rls = spark_client.cluster.get_remote_login_settings(id=cluster.id, node_id=cluster.master_node_id) + with pytest.warns(DeprecationWarning): + spark_client.create_cluster(cluster_configuration, wait=True) + with pytest.warns(DeprecationWarning): + cluster = spark_client.get_cluster(cluster_id=cluster_configuration.cluster_id) + with pytest.warns(DeprecationWarning): + rls = spark_client.get_remote_login_settings(cluster_id=cluster.id, node_id=cluster.master_node_id) assert rls.ip_address is not None assert rls.port is not None @@ -178,10 +217,13 @@ def test_submit(): executor_cores=None, max_retry_count=None) try: - spark_client.cluster.create(cluster_configuration, wait=True) + with pytest.warns(DeprecationWarning): + 
spark_client.create_cluster(cluster_configuration, wait=True) - spark_client.cluster.submit( - id=cluster_configuration.cluster_id, application=application_configuration, wait=True) + with pytest.warns(DeprecationWarning): + spark_client.submit( + cluster_id=cluster_configuration.cluster_id, application=application_configuration, wait=True) + assert True except (AztkError, BatchErrorException): @@ -219,15 +261,18 @@ def test_get_application_log(): executor_cores=None, max_retry_count=None) try: - spark_client.cluster.create(cluster_configuration, wait=True) - - spark_client.cluster.submit( - id=cluster_configuration.cluster_id, application=application_configuration, wait=True) - application_log = spark_client.cluster.get_application_log( - id=cluster_configuration.cluster_id, - application_name=application_configuration.name, - tail=False, - current_bytes=0) + with pytest.warns(DeprecationWarning): + spark_client.create_cluster(cluster_configuration, wait=True) + + with pytest.warns(DeprecationWarning): + spark_client.submit( + cluster_id=cluster_configuration.cluster_id, application=application_configuration, wait=True) + with pytest.warns(DeprecationWarning): + application_log = spark_client.get_application_log( + cluster_id=cluster_configuration.cluster_id, + application_name=application_configuration.name, + tail=False, + current_bytes=0) assert application_log.exit_code == 0 assert application_log.name == application_configuration.name == "pipy100" @@ -280,12 +325,14 @@ def test_get_application_status_complete(): executor_cores=None, max_retry_count=None) try: - spark_client.cluster.create(cluster_configuration, wait=True) - - spark_client.cluster.submit( - id=cluster_configuration.cluster_id, application=application_configuration, wait=True) - status = spark_client.cluster.get_application_status( - id=cluster_configuration.cluster_id, application_name=application_configuration.name) + with pytest.warns(DeprecationWarning): + spark_client.create_cluster(cluster_configuration, wait=True) + with pytest.warns(DeprecationWarning): + spark_client.submit( + cluster_id=cluster_configuration.cluster_id, application=application_configuration, wait=True) + with pytest.warns(DeprecationWarning): + status = spark_client.get_application_status( + cluster_id=cluster_configuration.cluster_id, app_name=application_configuration.name) assert status == "completed" @@ -310,8 +357,9 @@ def test_delete_cluster(): spark_configuration=None) try: - spark_client.cluster.create(cluster_configuration, wait=True) - success = spark_client.cluster.delete(id=cluster_configuration.cluster_id) + with pytest.warns(DeprecationWarning): + spark_client.create_cluster(cluster_configuration, wait=True) + success = spark_client.delete_cluster(cluster_id=cluster_configuration.cluster_id) assert success is True @@ -336,9 +384,12 @@ def test_spark_processes_up(): spark_configuration=None) try: - cluster = spark_client.cluster.create(cluster_configuration, wait=True) - wait_for_all_nodes(cluster.id, cluster.nodes) - success = spark_client.cluster.delete(id=cluster_configuration.cluster_id) + with pytest.warns(DeprecationWarning): + cluster = spark_client.create_cluster(cluster_configuration, wait=True) + wait_for_all_nodes(cluster.id, cluster.nodes) + + with pytest.warns(DeprecationWarning): + success = spark_client.delete_cluster(cluster_id=cluster_configuration.cluster_id) assert success is True @@ -366,10 +417,15 @@ def test_debug_tool(): "spark/logs", "spark/wd" ] try: - cluster = 
spark_client.cluster.create(cluster_configuration, wait=True) + with pytest.warns(DeprecationWarning): + cluster = spark_client.create_cluster(cluster_configuration, wait=True) + nodes = [node for node in cluster.nodes] wait_for_all_nodes(cluster.id, nodes) - cluster_output = spark_client.cluster.diagnostics(id=cluster.id) + + with pytest.warns(DeprecationWarning): + cluster_output = spark_client.run_cluster_diagnostics(cluster_id=cluster.id) + for node_output in cluster_output: node_output.output.seek(0) # tempfile requires seek 0 before reading debug_zip = ZipFile(node_output.output) diff --git a/tests/integration_tests/spark/sdk/job/test_job.py b/tests/integration_tests/spark/sdk/job/test_job.py index 1303c221..0350c6cb 100644 --- a/tests/integration_tests/spark/sdk/job/test_job.py +++ b/tests/integration_tests/spark/sdk/job/test_job.py @@ -36,8 +36,8 @@ def test_submit_job(): max_low_pri_nodes=0 ) try: - job = spark_client.submit_job(job_configuration=job_configuration) - spark_client.wait_until_job_finished(job_id=job_configuration.id) + job = spark_client.job.submit(job_configuration=job_configuration) + spark_client.job.wait_until_job_finished(id=job_configuration.id) assert job.id == job_configuration.id assert job.state is not None @@ -73,10 +73,10 @@ def test_list_jobs(): worker_on_master=True ) try: - spark_client.submit_job(job_configuration=job_configuration) - spark_client.wait_until_job_finished(job_configuration.id) + spark_client.job.submit(job_configuration=job_configuration) + spark_client.job.wait_until_job_finished(job_configuration.id) - jobs = spark_client.list_jobs() + jobs = spark_client.job.list() assert jobs is not None assert job_configuration.id in [job.id for job in jobs] @@ -111,10 +111,10 @@ def test_list_applications(): max_low_pri_nodes=0 ) try: - spark_client.submit_job(job_configuration=job_configuration) - spark_client.wait_until_job_finished(job_configuration.id) + spark_client.job.submit(job_configuration=job_configuration) + spark_client.job.wait_until_job_finished(job_configuration.id) - applications = spark_client.list_applications(job_id=job_configuration.id) + applications = spark_client.job.list_applications(id=job_configuration.id) assert applications not in (None, []) assert len(applications) == 2 @@ -152,10 +152,10 @@ def test_get_job(): worker_on_master=True ) try: - spark_client.submit_job(job_configuration=job_configuration) - spark_client.wait_until_job_finished(job_configuration.id) + spark_client.job.submit(job_configuration=job_configuration) + spark_client.job.wait_until_job_finished(job_configuration.id) - job = spark_client.get_job(job_id=job_configuration.id) + job = spark_client.job.get(id=job_configuration.id) assert job.id == job_configuration.id assert app1.name in [app.name for app in job.applications] assert app2.name in [app.name for app in job.applications] @@ -185,9 +185,9 @@ def test_get_application(): max_low_pri_nodes=0 ) try: - spark_client.submit_job(job_configuration=job_configuration) - spark_client.wait_until_job_finished(job_configuration.id) - application = spark_client.get_application(job_id=job_configuration.id, application_name=app1.name) + spark_client.job.submit(job_configuration=job_configuration) + spark_client.job.wait_until_job_finished(job_configuration.id) + application = spark_client.job.get_application(id=job_configuration.id, application_name=app1.name) assert isinstance(application, aztk.spark.models.Application) assert application.exit_code == 0 assert application.state == "completed" @@ -216,10 
+216,10 @@ def test_get_application_log(): max_low_pri_nodes=0 ) try: - spark_client.submit_job(job_configuration=job_configuration) - spark_client.wait_until_job_finished(job_configuration.id) + spark_client.job.submit(job_configuration=job_configuration) + spark_client.job.wait_until_job_finished(job_configuration.id) - application_log = spark_client.get_job_application_log(job_id=job_configuration.id, application_name=app1.name) + application_log = spark_client.job.get_application_log(id=job_configuration.id, application_name=app1.name) assert isinstance(application_log, aztk.spark.models.ApplicationLog) assert application_log.log is not None @@ -253,12 +253,12 @@ def test_delete_job(): worker_on_master=True ) try: - spark_client.submit_job(job_configuration=job_configuration) - spark_client.wait_until_job_finished(job_configuration.id) - spark_client.delete_job(job_configuration.id) - assert job_configuration.id not in spark_client.list_jobs() + spark_client.job.submit(job_configuration=job_configuration) + spark_client.job.wait_until_job_finished(job_configuration.id) + spark_client.job.delete(job_configuration.id) + assert job_configuration.id not in spark_client.job.list() try: - spark_client.get_job(job_configuration.id) + spark_client.job.get(job_configuration.id) except AztkError: # this should fail assert True @@ -270,6 +270,6 @@ def test_delete_job(): def clean_up_job(job_id): try: - spark_client.delete_job(job_id) + spark_client.job.delete(job_id) except (BatchErrorException, AztkError): pass diff --git a/tests/integration_tests/spark/sdk/job/test_job_new.py b/tests/integration_tests/spark/sdk/job/test_job_deprecated.py similarity index 73% rename from tests/integration_tests/spark/sdk/job/test_job_new.py rename to tests/integration_tests/spark/sdk/job/test_job_deprecated.py index 0350c6cb..36e0542e 100644 --- a/tests/integration_tests/spark/sdk/job/test_job_new.py +++ b/tests/integration_tests/spark/sdk/job/test_job_deprecated.py @@ -1,6 +1,8 @@ import os import subprocess from datetime import datetime +import pytest + from azure.batch.models import BatchErrorException @@ -36,8 +38,10 @@ def test_submit_job(): max_low_pri_nodes=0 ) try: - job = spark_client.job.submit(job_configuration=job_configuration) - spark_client.job.wait_until_job_finished(id=job_configuration.id) + with pytest.warns(DeprecationWarning): + job = spark_client.submit_job(job_configuration=job_configuration) + with pytest.warns(DeprecationWarning): + spark_client.wait_until_job_finished(job_id=job_configuration.id) assert job.id == job_configuration.id assert job.state is not None @@ -73,10 +77,12 @@ def test_list_jobs(): worker_on_master=True ) try: - spark_client.job.submit(job_configuration=job_configuration) - spark_client.job.wait_until_job_finished(job_configuration.id) + with pytest.warns(DeprecationWarning): + spark_client.submit_job(job_configuration=job_configuration) + with pytest.warns(DeprecationWarning): + spark_client.wait_until_job_finished(job_configuration.id) - jobs = spark_client.job.list() + jobs = spark_client.list_jobs() assert jobs is not None assert job_configuration.id in [job.id for job in jobs] @@ -111,10 +117,12 @@ def test_list_applications(): max_low_pri_nodes=0 ) try: - spark_client.job.submit(job_configuration=job_configuration) - spark_client.job.wait_until_job_finished(job_configuration.id) + with pytest.warns(DeprecationWarning): + spark_client.submit_job(job_configuration=job_configuration) + with pytest.warns(DeprecationWarning): + 
spark_client.wait_until_job_finished(job_configuration.id) - applications = spark_client.job.list_applications(id=job_configuration.id) + applications = spark_client.list_applications(job_id=job_configuration.id) assert applications not in (None, []) assert len(applications) == 2 @@ -152,10 +160,13 @@ def test_get_job(): worker_on_master=True ) try: - spark_client.job.submit(job_configuration=job_configuration) - spark_client.job.wait_until_job_finished(job_configuration.id) + with pytest.warns(DeprecationWarning): + spark_client.submit_job(job_configuration=job_configuration) + with pytest.warns(DeprecationWarning): + spark_client.wait_until_job_finished(job_configuration.id) + with pytest.warns(DeprecationWarning): + job = spark_client.get_job(job_id=job_configuration.id) - job = spark_client.job.get(id=job_configuration.id) assert job.id == job_configuration.id assert app1.name in [app.name for app in job.applications] assert app2.name in [app.name for app in job.applications] @@ -185,9 +196,13 @@ def test_get_application(): max_low_pri_nodes=0 ) try: - spark_client.job.submit(job_configuration=job_configuration) - spark_client.job.wait_until_job_finished(job_configuration.id) - application = spark_client.job.get_application(id=job_configuration.id, application_name=app1.name) + with pytest.warns(DeprecationWarning): + spark_client.submit_job(job_configuration=job_configuration) + with pytest.warns(DeprecationWarning): + spark_client.wait_until_job_finished(job_configuration.id) + with pytest.warns(DeprecationWarning): + application = spark_client.get_application(job_id=job_configuration.id, application_name=app1.name) + assert isinstance(application, aztk.spark.models.Application) assert application.exit_code == 0 assert application.state == "completed" @@ -216,10 +231,13 @@ def test_get_application_log(): max_low_pri_nodes=0 ) try: - spark_client.job.submit(job_configuration=job_configuration) - spark_client.job.wait_until_job_finished(job_configuration.id) + with pytest.warns(DeprecationWarning): + spark_client.submit_job(job_configuration=job_configuration) + with pytest.warns(DeprecationWarning): + spark_client.wait_until_job_finished(job_configuration.id) - application_log = spark_client.job.get_application_log(id=job_configuration.id, application_name=app1.name) + with pytest.warns(DeprecationWarning): + application_log = spark_client.get_job_application_log(job_id=job_configuration.id, application_name=app1.name) assert isinstance(application_log, aztk.spark.models.ApplicationLog) assert application_log.log is not None @@ -253,12 +271,19 @@ def test_delete_job(): worker_on_master=True ) try: - spark_client.job.submit(job_configuration=job_configuration) - spark_client.job.wait_until_job_finished(job_configuration.id) - spark_client.job.delete(job_configuration.id) - assert job_configuration.id not in spark_client.job.list() + + with pytest.warns(DeprecationWarning): + spark_client.submit_job(job_configuration=job_configuration) + with pytest.warns(DeprecationWarning): + spark_client.wait_until_job_finished(job_configuration.id) + with pytest.warns(DeprecationWarning): + spark_client.delete_job(job_configuration.id) + + with pytest.warns(DeprecationWarning): + assert job_configuration.id not in spark_client.list_jobs() try: - spark_client.job.get(job_configuration.id) + with pytest.warns(DeprecationWarning): + spark_client.get_job(job_configuration.id) except AztkError: # this should fail assert True @@ -270,6 +295,6 @@ def test_delete_job(): def clean_up_job(job_id): try: - 
spark_client.job.delete(job_id) + spark_client.delete_job(job_id) except (BatchErrorException, AztkError): pass From 52d27f9428eded1080b8b85269e85c409563ad7d Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Tue, 10 Jul 2018 11:00:56 -0700 Subject: [PATCH 17/52] add docstrings for baseoperations --- .vscode/settings.json | 5 +- aztk/client/base/base_operations.py | 172 +++++++++++++++++++++++++--- 2 files changed, 161 insertions(+), 16 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 44354925..7de7bfbc 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -14,5 +14,6 @@ "python.formatting.provider": "yapf", "python.venvPath": "${workspaceFolder}/.venv/", "python.pythonPath": "${workspaceFolder}/.venv/Scripts/python.exe", - "python.unitTest.pyTestEnabled": true -} + "python.unitTest.pyTestEnabled": true, + // "editor.formatOnSave": true, +} \ No newline at end of file diff --git a/aztk/client/base/base_operations.py b/aztk/client/base/base_operations.py index 827d852b..f3f54593 100644 --- a/aztk/client/base/base_operations.py +++ b/aztk/client/base/base_operations.py @@ -2,17 +2,19 @@ from aztk.internal import cluster_data from aztk.utils import ssh as ssh_lib -from .helpers import (create_user_on_node, create_user_on_pool, - delete_user_on_node, delete_user_on_pool, - generate_user_on_node, generate_user_on_pool, - get_application_log, get_remote_login_settings, node_run, - run, ssh_into_node) +from .helpers import (create_user_on_node, create_user_on_pool, delete_user_on_node, delete_user_on_pool, + generate_user_on_node, generate_user_on_pool, get_application_log, get_remote_login_settings, + node_run, run, ssh_into_node) class BaseOperations: - ''' - Base operations that all other operations inherit from - ''' + """Base operations that all other operations inherit from + + Attributes: + batch_client (:obj:`azure.batch.batch_service_client.BatchServiceClient`): Client used to interact with the Azure Batch service. + blob_client (:obj:`azure.storage.blob.BlockBlobService`): Client used to interact with the Azure Storage Blob service. + secrets_configuration (:obj:`aztk.models.SecretsConfiguration`): Model that holds AZTK secrets used to authenticate with Azure and the clusters. + """ def __init__(self, context): self.batch_client = context['batch_client'] @@ -20,14 +22,35 @@ def __init__(self, context): self.secrets_configuration = context['secrets_configuration'] def get_cluster_config(self, cluster_id: str) -> models.ClusterConfiguration: + """Get the ClusterConfiguration the given cluster was created with + + Args: + cluster_id (:obj:`str`): the id of the cluster whose configuration to read +
+ Returns: + ClusterConfiguration: Object representing the cluster's configuration + """ return self.get_cluster_data(cluster_id).read_cluster_config() def get_cluster_data(self, cluster_id: str) -> cluster_data.ClusterData: - """ - Returns ClusterData object to manage data related to the given cluster id + """ Gets the ClusterData object to manage data related to the given cluster + + Args: + cluster_id (:obj:`str`): the id of the cluster to get + + Returns: + ClusterData: Object used to manage the data and storage functions for a cluster """ return cluster_data.ClusterData(self.blob_client, cluster_id) + #TODO: rename pool to cluster def ssh_into_node(self, pool_id, node_id, username, ssh_key=None, password=None, port_forward_list=None, internal=False): - ''' - Opens a ssh tunnel to the node for port forwarding - ''' + """Open an ssh tunnel to a node + + Args: + pool_id (:obj:`str`): the id of the cluster the node is in + node_id (:obj:`str`): the id of the node to open the ssh tunnel to + username (:obj:`str`): the username to authenticate the ssh session + ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. Defaults to None. + password (:obj:`str`, optional): password for the user, must use ssh_key or password. Defaults to None. + port_forward_list (:obj:`List[PortForwardingSpecification]`, optional): list of PortForwardingSpecifications. + The defined ports will be forwarded to the client. + internal (:obj:`bool`, optional): if True, this will connect to the node using its internal IP. + Only use this if running within the same VNET as the cluster. + Returns: + None + """ ssh_into_node.ssh_into_node(self, pool_id, node_id, username, ssh_key, password, port_forward_list, internal) + #TODO: rename pool to cluster def create_user_on_node(self, username, pool_id, node_id, ssh_key=None, password=None): + """ Create a user on a node + + Args: + username (:obj:`str`): name of the user to create. + pool_id (:obj:`str`): id of the cluster to create the user on. + node_id (:obj:`str`): id of the node in the cluster to create the user on. + ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. + password (:obj:`str`, optional): password for the user, must use ssh_key or password. + """ return create_user_on_node.create_user_on_node(self, username, pool_id, node_id, ssh_key, password) + #TODO: rename pool to cluster def create_user_on_pool(self, username, pool_id, nodes, ssh_pub_key=None, password=None): + """ Create a user on every node in the cluster + + Args: + username (:obj:`str`): name of the user to create. + pool_id (:obj:`str`): id of the cluster to create the user on. + node_id (:obj:`str`): id of the node in the cluster to create the user on. + ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. Defaults to None. + password (:obj:`str`, optional): password for the user, must use ssh_key or password. Defaults to None. + + Returns: + None + """ return create_user_on_pool.create_user_on_pool(self, username, pool_id, nodes, ssh_pub_key, password) + #TODO: rename pool to cluster def generate_user_on_node(self, pool_id, node_id): + """ Create a user with an autogenerated username and ssh_key on the given node. + + Args: + pool_id (:obj:`str`): the id of the cluster to generate the user on. + node_id (:obj:`str`): the id of the node in the cluster to generate the user on.
+ + Returns: + tuple: A tuple of the form (username (:obj:`str`), ssh_key) where ssh_key is a Cryptodome.RSA key. + """ return generate_user_on_node.generate_user_on_node(self, pool_id, node_id) + #TODO: rename pool to cluster def generate_user_on_pool(self, pool_id, nodes): + """ Create a user with an autogenerated username and ssh_key on the cluster + + Args: + pool_id (:obj:`str`): the id of the cluster to generate the user on. + nodes (:obj:`List[ComputeNode]`): list of nodes to generate the user on + + Returns: + tuple: A tuple of the form (username (:obj:`str`), ssh_key) where ssh_key is a Cryptodome.RSA key. + """ return generate_user_on_pool.generate_user_on_pool(self, pool_id, nodes) + #TODO: rename pool to cluster def delete_user_on_node(self, pool_id: str, node_id: str, username: str) -> str: + """ Delete a user on a node + + Args: + pool_id (:obj:`str`): the id of the cluster to delete the user on. + node_id (:obj:`str`): the id of the node in the cluster to delete the user on. + username (:obj:`str`): the name of the user to delete. + + Returns: + None + """ return delete_user_on_node.delete_user(self, pool_id, node_id, username) - def delete_user_on_pool(self, username, pool_id, nodes): #TODO: change from pool_id, nodes to cluster_id + #TODO: rename pool to cluster + def delete_user_on_pool(self, username, pool_id, nodes): + """ Delete a user on every node in the cluster + + Args: + pool_id (:obj:`str`): the id of the cluster to delete the user on. + nodes (:obj:`List[ComputeNode]`): list of nodes to delete the user on + username (:obj:`str`): the name of the user to delete. + + Returns: + None + """ return delete_user_on_pool.delete_user_on_pool(self, username, pool_id, nodes) def node_run(self, cluster_id, node_id, command, internal, container_name=None, timeout=None): + """ Run a bash command on the given node + + Args: + cluster_id (:obj:`str`): the id of the cluster to run the command on. + node_id (:obj:`str`): the id of the node in the cluster to run the command on. + command (:obj:`str`): the bash command to execute on the node. + internal (:obj:`bool`): if True, this will connect to the node using its internal IP. + Only use this if running within the same VNET as the cluster. + container_name=None (:obj:`str`, optional): the name of the container to run the command in. + If None, the command will run on the host VM. Defaults to None. + timeout=None (:obj:`int`, optional): The timeout in seconds for establishing a connection to the node. + + Returns: + NodeOutput: + """ return node_run.node_run(self, cluster_id, node_id, command, internal, container_name, timeout) def get_remote_login_settings(self, id: str, node_id: str): + """ Get the remote login information for a node in a cluster + + Args: + id (:obj:`str`): the id of the cluster the node is in + node_id (:obj:`str`): the id of the node in the cluster + + Returns: + RemoteLogin: + """ return get_remote_login_settings.get_remote_login_settings(self, id, node_id) def run(self, cluster_id, command, internal, container_name=None, timeout=None): + """ Run a bash command on every node in the cluster + + Args: + cluster_id (:obj:`str`): the id of the cluster to run the command on. + command (:obj:`str`): the bash command to execute on the node. + internal (:obj:`bool`): if true, this will connect to the node using its internal IP. + Only use this if running within the same VNET as the cluster. + container_name=None (:obj:`str`, optional): the name of the container to run the command in.
+ If None, the command will run on the host VM. Defaults to None. + timeout=None (:obj:`int`, optional): The timeout in seconds for establishing a connection to the node. + + Returns: + List[NodeOutput] + """ return run.cluster_run(self, cluster_id, command, internal, container_name, timeout) def get_application_log(self, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): + """ Get the log for a running or completed application + + Args: + cluster_id (:obj:`str`): the id of the cluster the application was submitted to. + application_name (:obj:`str`): the name of the application to get the log of. + tail (:obj:`bool`, optional): If True, get the remaining bytes after current_bytes. Otherwise, the whole log will be retrieved. + Only use this if streaming the log as it is being written. Defaults to False. + current_bytes (:obj:`int`): Specifies the last seen byte, so only the bytes after current_bytes are retrieved. + Only useful if streaming the log as it is being written. Only used if tail is True. + + Returns: + List[NodeOutput] + """ return get_application_log.get_application_log(self, cluster_id, application_name, tail, current_bytes) From 581f9c8b45b1bc188c1a6c51394dc9bfbd2daa0e Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Tue, 10 Jul 2018 11:38:00 -0700 Subject: [PATCH 18/52] add docstrings --- aztk/client/base/base_operations.py | 8 +++-- aztk/client/cluster/operations.py | 61 +++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 3 deletions(-) diff --git a/aztk/client/base/base_operations.py b/aztk/client/base/base_operations.py index f3f54593..d1003cf4 100644 --- a/aztk/client/base/base_operations.py +++ b/aztk/client/base/base_operations.py @@ -70,7 +70,7 @@ def ssh_into_node, port_forward_list (:obj:`List[PortForwardingSpecification]`, optional): list of PortForwardingSpecifications. The defined ports will be forwarded to the client. internal (:obj:`bool`, optional): if True, this will connect to the node using its internal IP. - Only use this if running within the same VNET as the cluster. + Only use this if running within the same VNET as the cluster. Defaults to False. Returns: None """ @@ -167,10 +167,11 @@ def node_run(self, cluster_id, node_id, command, internal, container_name=None, node_id (:obj:`str`): the id of the node in the cluster to run the command on. command (:obj:`str`): the bash command to execute on the node. internal (:obj:`bool`): if True, this will connect to the node using its internal IP. - Only use this if running within the same VNET as the cluster. + Only use this if running within the same VNET as the cluster. Defaults to False. container_name=None (:obj:`str`, optional): the name of the container to run the command in. If None, the command will run on the host VM. Defaults to None. timeout=None (:obj:`int`, optional): The timeout in seconds for establishing a connection to the node. + Defaults to None.
Returns: NodeOutput: @@ -196,10 +197,11 @@ def run(self, cluster_id, command, internal, container_name=None, timeout=None): cluster_id (:obj:`str`): the id of the cluster to run the command on. command (:obj:`str`): the bash command to execute on the node. internal (:obj:`bool`): if true, this will connect to the node using its internal IP. - Only use this if running within the same VNET as the cluster. + Only use this if running within the same VNET as the cluster. Defaults to False. container_name=None (:obj:`str`, optional): the name of the container to run the command in. If None, the command will run on the host VM. Defaults to None. timeout=None (:obj:`int`, optional): The timeout in seconds for establishing a connection to the node. + Defaults to None. Returns: List[NodeOutput] diff --git a/aztk/client/cluster/operations.py b/aztk/client/cluster/operations.py index afaaeec2..eea7de9c 100644 --- a/aztk/client/cluster/operations.py +++ b/aztk/client/cluster/operations.py @@ -8,11 +8,32 @@ class CoreClusterOperations(BaseOperations): def create(self, cluster_configuration: ClusterConfiguration, software_metadata_key: str, start_task, vm_image_model): + """ Create a cluster + + Args: + cluster_configuration (:obj:`ClusterConfiguration`): Configuration for the cluster to be created + software_metadata_key (:obj:`str`): the key for the primary software that will be run on the cluster + start_task (:obj:`azure.batch.models.StartTask`): Batch StartTask definition to configure the Batch Pool + vm_image_model (:obj:`azure.batch.models.VirtualMachineConfiguration`): Configuration of the virtual machine image and settings + + Returns: + Cluster: An aztk.models.Cluster object representing the state and configuration of the cluster. + """ return create.create_pool_and_job(self, cluster_configuration, software_metadata_key, start_task, vm_image_model) + # TODO: change cluster_id to id def get(self, cluster_id: str): + """ Get the state and configuration of a cluster + + Args: + cluster_id (:obj:`str`): the id of the cluster to get. + + Returns: + Cluster: An aztk.models.Cluster object representing the state and configuration of the cluster. + """ return get.get_pool_details(self, cluster_id) + # TODO: change cluster_id to id def copy(self, cluster_id, source_path, destination_path=None, container_name=None, internal=False, get=False, timeout=None): + """ Copy files to or from every node in a cluster. + + Args: + cluster_id (:obj:`str`): the id of the cluster to copy files with + source_path (:obj:`str`): the path of the file to copy from + destination_path (:obj:`str`, optional): the path of the file to copy to. + If None, a SpooledTemporaryFile will be returned, else the file will be written to this path. + Defaults to None. + container_name (:obj:`str`, optional): the name of the container to copy to or from. + If None, the copy operation will occur on the host VM. Defaults to None. + internal (:obj:`bool`, optional): if True, this will connect to the node using its internal IP. + Only use this if running within the same VNET as the cluster. Defaults to False. + get (:obj:`bool`, optional): If True, the files are downloaded from every node in the cluster. + Else, the file is copied from the client to the node. Defaults to False. + timeout (:obj:`int`, optional): The timeout in seconds for establishing a connection to the node. + Defaults to None. + + Returns: + List[NodeOutput]: A list of NodeOutput objects representing the output of the copy command.
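+ + Examples: + A minimal usage sketch, not a definitive reference (assumes `client` is a constructed aztk Client and that a cluster with id "my-cluster" exists; all paths are hypothetical): + + >>> client.cluster.copy("my-cluster", "/tmp/app.log", "./logs/", get=True)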
+ """ return copy.cluster_copy(self, cluster_id, source_path, destination_path, container_name, internal, get, timeout) + #TODO: change pool_id to id def delete(self, pool_id: str, keep_logs: bool = False): + """ Copy files to or from every node in a cluster. + + Args: + pool_id (:obj:`str`): the id of the cluster to delete + keep_logs (:obj:`bool`): If True, the logs related to this cluster in Azure Storage are not deleted. + Defaults to False. + + Returns: + List[NodeOutput]: A list of NodeOutput objects representing the output of the copy command. + """ return delete.delete_pool_and_job(self, pool_id, keep_logs) def list(self, software_metadata_key): + """ List clusters running the specified software. + + Args: + software_metadata_key(:obj:`str`): the key of the primary softare running on the cluster. + This filters out non-aztk clusters and aztk clusters running other software. + + Returns: + List[Cluster]: list of clusters running the software defined by software_metadata_key + """ return list.list_clusters(self, software_metadata_key) From b159c618ab16596042c00397f97bedb8b69416c6 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Tue, 10 Jul 2018 12:14:34 -0700 Subject: [PATCH 19/52] docstrings, add back compat for coreclient, fix init for spark client --- aztk/client/client.py | 12 +++++++++--- aztk/client/job/operations.py | 24 ++++++++++++++++++++++-- aztk/spark/client/client.py | 22 ++++++++-------------- 3 files changed, 39 insertions(+), 19 deletions(-) diff --git a/aztk/client/client.py b/aztk/client/client.py index 6b4985ff..48fc0f28 100644 --- a/aztk/client/client.py +++ b/aztk/client/client.py @@ -20,13 +20,19 @@ class CoreClient: - def __init__(self, secrets_configuration: models.SecretsConfiguration): # make accept secrets_config and secrets_configuration + # TODO: remove ability to specify secrets_config in 0.10.0 + def __init__(self, secrets_configuration: models.SecretsConfiguration = None, **kwargs): self.secrets_configuration = None - context = self.get_context(secrets_configuration) + context = None + if kwargs.get("secrets_config"): + # TODO: add deprecated warning + context = self._get_context(kwargs.get("secrets_config")) + else: + context = self._get_context(secrets_configuration) self.cluster = CoreClusterOperations(context) self.job = CoreJobOperations(context) - def get_context(self, secrets_configuration: models.SecretsConfiguration): + def _get_context(self, secrets_configuration: models.SecretsConfiguration): self.secrets_configuration = secrets_configuration azure_api.validate_secrets(secrets_configuration) diff --git a/aztk/client/job/operations.py b/aztk/client/job/operations.py index 5af0eff3..f15792ec 100644 --- a/aztk/client/job/operations.py +++ b/aztk/client/job/operations.py @@ -1,10 +1,30 @@ from aztk.client.base import BaseOperations -from .helpers import ( - submit, ) + +from .helpers import submit class CoreJobOperations(BaseOperations): def submit(self, job_configuration, start_task, job_manager_task, autoscale_formula, software_metadata_key: str, vm_image_model, application_metadata): + """ Submit a job + + Jobs are a cluster definition and one or many application definitions which run on the cluster. The job's + cluster will be allocated and configured, then the applications will be executed with their output stored + in Azure Storage. When all applications have completed, the cluster will be automatically deleted. + + Args: + job_configuration (:obj:`aztk.models.JobConfiguration`): Model defining the job's configuration. 
+ start_task (:obj:`azure.batch.models.StartTask`): Batch StartTask definition to configure the Batch Pool + job_manager_task (:obj:`azure.batch.models.JobManagerTask`): Batch JobManagerTask definition to schedule + the defined applications on the cluster. + autoscale_formula (:obj:`str`): formula that defines the number of nodes allocated to the cluster. + software_metadata_key (:obj:`str`): the key of the primary software running on the cluster. + vm_image_model (:obj:`azure.batch.models.VirtualMachineConfiguration`): Configuration of the virtual machine image and settings + application_metadata (:obj:`List[str]`): list of the names of all applications that will be run as a + part of the job + + Returns: + azure.batch.models.CloudJobSchedule: Model representing the Azure Batch JobSchedule state. + """ return submit.submit_job(self, job_configuration, start_task, job_manager_task, autoscale_formula, software_metadata_key, vm_image_model, application_metadata) diff --git a/aztk/spark/client/client.py b/aztk/spark/client/client.py index 0cc16d9b..143256eb 100644 --- a/aztk/spark/client/client.py +++ b/aztk/spark/client/client.py @@ -19,23 +19,17 @@ class Client(CoreClient): - def __init__(self, secrets_configuration: models.SecretsConfiguration): - context = self.get_context(secrets_configuration) + def __init__(self, secrets_configuration: models.SecretsConfiguration, **kwargs): + self.secrets_configuration = None + context = None + if kwargs.get("secrets_config"): + # TODO: add deprecated warning + context = self._get_context(kwargs.get("secrets_config")) + else: + context = self._get_context(secrets_configuration) self.cluster = ClusterOperations(context) self.job = JobOperations(context) - def get_context(self, secrets_configuration: models.SecretsConfiguration): - self.secrets_configuration = secrets_configuration - - azure_api.validate_secrets(secrets_configuration) - self.batch_client = azure_api.make_batch_client(secrets_configuration) - self.blob_client = azure_api.make_blob_client(secrets_configuration) - context = { - 'batch_client': self.batch_client, - 'blob_client': self.blob_client, - 'secrets_configuration': secrets_configuration, - } - return context # ALL THE FOLLOWING METHODS ARE DEPRECATED AND WILL BE REMOVED IN 0.10.0 From c6da487472ea4b996dfb0e97507b7681287e0725 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Tue, 10 Jul 2018 12:15:49 -0700 Subject: [PATCH 20/52] whitespace --- .vscode/settings.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 7de7bfbc..941779c2 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -16,4 +16,4 @@ "python.pythonPath": "${workspaceFolder}/.venv/Scripts/python.exe", "python.unitTest.pyTestEnabled": true, // "editor.formatOnSave": true, -} \ No newline at end of file +} From 4aa8774a82f45ee91e88050e3f59b568056abf83 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Tue, 10 Jul 2018 13:05:11 -0700 Subject: [PATCH 21/52] docstrings, whitespace --- aztk/client/base/base_operations.py | 24 ++++++------- aztk/client/cluster/operations.py | 10 +++--- aztk/client/job/operations.py | 2 +- aztk/spark/client/base/operations.py | 46 +++++++++++++++++++++++++ aztk/spark/client/cluster/operations.py | 42 ++++++++++++++++++++++ 5 files changed, 106 insertions(+), 18 deletions(-) diff --git a/aztk/client/base/base_operations.py b/aztk/client/base/base_operations.py index d1003cf4..c2e53aac 100644 --- a/aztk/client/base/base_operations.py +++ b/aztk/client/base/base_operations.py @@ -40,7 +40,7 @@ def get_cluster_config(self, cluster_id: str) -> models.ClusterConfiguration: return
self.get_cluster_data(cluster_id).read_cluster_config() def get_cluster_data(self, cluster_id: str) -> cluster_data.ClusterData: - """ Gets the ClusterData object to manage data related to the given cluster + """Gets the ClusterData object to manage data related to the given cluster Args: cluster_id (:obj:`str`): the id of the cluster to get @@ -78,7 +78,7 @@ def ssh_into_node(self, #TODO: rename pool to cluster def create_user_on_node(self, username, pool_id, node_id, ssh_key=None, password=None): - """ Create a user on a node + """Create a user on a node Args: username (:obj:`str`): name of the user to create. @@ -91,7 +91,7 @@ def create_user_on_node(self, username, pool_id, node_id, ssh_key=None, password #TODO: rename pool to cluster def create_user_on_pool(self, username, pool_id, nodes, ssh_pub_key=None, password=None): - """ Create a user on every node in the cluster + """Create a user on every node in the cluster Args: username (:obj:`str`): name of the user to create. @@ -107,7 +107,7 @@ def create_user_on_pool(self, username, pool_id, nodes, ssh_pub_key=None, passwo #TODO: rename pool to cluster def generate_user_on_node(self, pool_id, node_id): - """ Create a user with an autogenerated username and ssh_key on the given node. + """Create a user with an autogenerated username and ssh_key on the given node. Args: pool_id (:obj:`str`): the id of the cluster to generate the user on. @@ -120,7 +120,7 @@ def generate_user_on_node(self, pool_id, node_id): #TODO: rename pool to cluster def generate_user_on_pool(self, pool_id, nodes): - """ Create a user with an autogenerated username and ssh_key on the cluster + """Create a user with an autogenerated username and ssh_key on the cluster Args: pool_id (:obj:`str`): the id of the cluster to generate the user on. @@ -133,7 +133,7 @@ def generate_user_on_pool(self, pool_id, nodes): #TODO: rename pool to cluster def delete_user_on_node(self, pool_id: str, node_id: str, username: str) -> str: - """ Delete a user on a node + """Delete a user on a node Args: pool_id (:obj:`str`): the id of the cluster to delete the user on. @@ -147,7 +147,7 @@ def delete_user_on_node(self, pool_id: str, node_id: str, username: str) -> str: #TODO: rename pool to cluster def delete_user_on_pool(self, username, pool_id, nodes): - """ Delete a user on every node in the cluster + """Delete a user on every node in the cluster Args: pool_id (:obj:`str`): the id of the cluster to delete the user on. @@ -160,7 +160,7 @@ def delete_user_on_pool(self, username, pool_id, nodes): return delete_user_on_pool.delete_user_on_pool(self, username, pool_id, nodes) def node_run(self, cluster_id, node_id, command, internal, container_name=None, timeout=None): - """ Run a bash command on the given node + """Run a bash command on the given node Args: cluster_id (:obj:`str`): the id of the cluster to run the command on. 
@@ -179,7 +179,7 @@ def node_run(self, cluster_id, node_id, command, internal, container_name=None, return node_run.node_run(self, cluster_id, node_id, command, internal, container_name, timeout) def get_remote_login_settings(self, id: str, node_id: str): - """ Get the remote login information for a node in a cluster + """Get the remote login information for a node in a cluster Args: id (:obj:`str`): the id of the cluster the node is in @@ -191,7 +191,7 @@ def get_remote_login_settings(self, id: str, node_id: str): return get_remote_login_settings.get_remote_login_settings(self, id, node_id) def run(self, cluster_id, command, internal, container_name=None, timeout=None): - """ Run a bash command on every node in the cluster + """Run a bash command on every node in the cluster Args: cluster_id (:obj:`str`): the id of the cluster to run the command on. @@ -209,7 +209,7 @@ def run(self, cluster_id, command, internal, container_name=None, timeout=None): return run.cluster_run(self, cluster_id, command, internal, container_name, timeout) def get_application_log(self, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): - """ Get the log for a running or completed application + """Get the log for a running or completed application Args: cluster_id (:obj:`str`): the id of the cluster the application was submitted to. @@ -220,6 +220,6 @@ def get_application_log(self, cluster_id: str, application_name: str, tail=False Only useful if streaming the log as it is being written. Only used if tail is True. Returns: - List[NodeOutput] + aztk.models.ApplicationLog: a model representing the output of the application. """ return get_application_log.get_application_log(self, cluster_id, application_name, tail, current_bytes) diff --git a/aztk/client/cluster/operations.py b/aztk/client/cluster/operations.py index eea7de9c..39d334f8 100644 --- a/aztk/client/cluster/operations.py +++ b/aztk/client/cluster/operations.py @@ -8,7 +8,7 @@ class CoreClusterOperations(BaseOperations): def create(self, cluster_configuration: ClusterConfiguration, software_metadata_key: str, start_task, vm_image_model): - """ Create a cluster + """Create a cluster Args: cluster_configuration (:obj:`ClusterConfiguration`): Configuration for the cluster to be created @@ -23,7 +23,7 @@ def create(self, cluster_configuration: ClusterConfiguration, software_metadata_ # TODO: change cluster_id to id def get(self, cluster_id: str): - """ Get the state and configuration of a cluster + """Get the state and configuration of a cluster Args: cluster_id (:obj:`str`): the id of the cluster to get. @@ -42,7 +42,7 @@ def copy(self, internal=False, get=False, timeout=None): - """ Copy files to or from every node in a cluster. + """Copy files to or from every node in a cluster. Args: cluster_id (:obj:`str`): the id of the cluster to copy files with @@ -67,7 +67,7 @@ def copy(self, #TODO: change pool_id to id def delete(self, pool_id: str, keep_logs: bool = False): - """ Delete a cluster. + """Delete a cluster. Args: pool_id (:obj:`str`): the id of the cluster to delete @@ -80,7 +80,7 @@ def delete(self, pool_id: str, keep_logs: bool = False): return delete.delete_pool_and_job(self, pool_id, keep_logs) def list(self, software_metadata_key): - """ List clusters running the specified software. + """List clusters running the specified software. Args: software_metadata_key (:obj:`str`): the key of the primary software running on the cluster.
diff --git a/aztk/client/job/operations.py b/aztk/client/job/operations.py index f15792ec..b18442c1 100644 --- a/aztk/client/job/operations.py +++ b/aztk/client/job/operations.py @@ -6,7 +6,7 @@ class CoreJobOperations(BaseOperations): def submit(self, job_configuration, start_task, job_manager_task, autoscale_formula, software_metadata_key: str, vm_image_model, application_metadata): - """ Submit a job + """Submit a job Jobs are a cluster definition and one or many application definitions which run on the cluster. The job's diff --git a/aztk/spark/client/base/operations.py b/aztk/spark/client/base/operations.py index 47c72cff..5395a5b1 100644 --- a/aztk/spark/client/base/operations.py +++ b/aztk/spark/client/base/operations.py @@ -18,12 +18,58 @@ def generate_cluster_start_task(self, plugins: List[models.PluginConfiguration] = None, mixed_mode: bool = False, worker_on_master: bool = True): + """Generate the Azure Batch Start Task to provision a Spark cluster. + + Args: + zip_resource_file (:obj:`azure.batch.models.ResourceFile`): a single zip file of all necessary data + to upload to the cluster. + cluster_id (:obj:`str`): the id of the cluster. + gpu_enabled (:obj:`bool`): if True, the cluster is GPU enabled. + docker_repo (:obj:`str`, optional): the docker repository and tag that identifies the docker image to use. + If None, the default Docker image will be used. Defaults to None. + file_shares (:obj:`aztk.spark.models.FileShare`, optional): a list of FileShares to mount on the cluster. + Defaults to None. + plugins (:obj:`aztk.spark.models.PluginConfiguration`, optional): a list of plugins to set up on the cluster. + Defaults to None. + mixed_mode (:obj:`bool`, optional): If True, the cluster is configured to use both dedicated and low priority VMs. + Defaults to False. + worker_on_master (:obj:`bool`, optional): If True, the cluster is configured to provision a Spark worker + on the VM that runs the Spark master. Defaults to True. + + Returns: + azure.batch.models.StartTask: the StartTask definition to provision the cluster. + """ return generate_cluster_start_task.generate_cluster_start_task(self, zip_resource_file, cluster_id, gpu_enabled, docker_repo, file_shares, plugins, mixed_mode, worker_on_master) def generate_application_task(self, container_id, application, remote=False): + """Generate the Azure Batch Task to run a Spark application. + + Args: + container_id (:obj:`str`): the id of the container to run the application in + application (:obj:`aztk.spark.models.ApplicationConfiguration`): the Application Definition + remote (:obj`bool`): If True, the application file will not be uploaded, it is assumed to be reachable + by the cluster already. This is useful when your application is stored in a mounted Azure File Share + and not the client.Defaults to False. + + Returns: + azure.batch.models.TaskAddParameter: the Task definition for the Application. + """ return generate_application_task.generate_application_task(self, container_id, application, remote) def get_application_log(self, id: str, application_name: str, tail=False, current_bytes: int = 0): + """Get the log for a running or completed application + + Args: + id (:obj:`str`): the id of the cluster the application was submitted to. + application_name (:obj:`str`): the name of the application to get the log of. + tail (:obj:`bool`, optional): If True, get the remaining bytes after current_bytes. Otherwise, the whole log will be retrieved.
+ Only use this if streaming the log as it is being written. Defaults to False. + current_bytes (:obj:`int`): Specifies the last seen byte, so only the bytes after current_bytes are retrieved. + Only useful if streaming the log as it is being written. Only used if tail is True. + + Returns: + aztk.spark.models.ApplicationLog: a model representing the output of the application. + """ return get_application_log.get_application_log(SparkBaseOperations, self, id, application_name, tail, current_bytes) diff --git a/aztk/spark/client/cluster/operations.py b/aztk/spark/client/cluster/operations.py index b4fe168c..903ce3e5 100644 --- a/aztk/spark/client/cluster/operations.py +++ b/aztk/spark/client/cluster/operations.py @@ -8,18 +8,60 @@ class ClusterOperations(CoreClusterOperations, SparkBaseOperations): def create(self, cluster_configuration: models.ClusterConfiguration, wait: bool = False): + """Create a cluster. + + Args: + cluster_configuration (:obj:`ClusterConfiguration`): Configuration for the cluster to be created. + wait (:obj:`bool`): if True, this function will block until the cluster creation is finished. + + Returns: + Cluster: An aztk.models.Cluster object representing the state and configuration of the cluster. + """ return create.create_cluster(self, cluster_configuration, wait) def delete(self, id: str, keep_logs: bool = False): + """Delete a cluster. + + Args: + id (:obj:`str`): the id of the cluster to delete. + keep_logs (:obj:`bool`): if True, the logs related to this cluster in Azure Storage are not deleted. + Defaults to False. + + Returns: + True if the deletion process was successful. + """ return delete.delete_cluster(self, id, keep_logs) def get(self, id: str): + """Get details about the state of a cluster. + + Args: + id (:obj:`str`): the id of the cluster to get. + + Returns: + Cluster: An aztk.models.Cluster object representing the state and configuration of the cluster. + """ return get.get_cluster(self, id) def list(self): + """List all clusters. + + Returns: + List[Cluster]: List of aztk.models.Cluster objects each representing the state and configuration of the cluster. + """ return list.list_clusters(self) def submit(self, id: str, application: models.ApplicationConfiguration, remote: bool = False, wait: bool = False): + """Submit an application to a cluster. + + Args: + id (:obj:`str`): the id of the cluster to submit the application to. + application (:obj:`aztk.spark.models.ApplicationConfiguration`): Application definition + remote (:obj:`bool`, optional): Defaults to False. + wait (:obj:`bool`, optional): If True, this function blocks until the application has completed. Defaults to False. + + Returns: + Cluster: An aztk.models.Cluster object representing the state and configuration of the cluster.
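+ + Examples: + A minimal usage sketch, not a definitive reference (assumes `client` is an authenticated aztk.spark.Client, a cluster with id "my-cluster" is running, and the application path is hypothetical): + + >>> app = aztk.spark.models.ApplicationConfiguration(name="pi", application="/path/to/pi.py") + >>> client.cluster.submit(id="my-cluster", application=app, wait=True)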
+ """ return submit.submit(self, id, application, remote, wait) def create_user(self, id: str, username: str, password: str = None, ssh_key: str = None): From 8cf466779a16f8ac30890a1fd8f2ea38639f38c0 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Tue, 10 Jul 2018 14:40:07 -0700 Subject: [PATCH 22/52] docstrings, fixes --- aztk/client/base/base_operations.py | 8 +- aztk/client/cluster/operations.py | 10 +-- aztk/spark/client/base/operations.py | 4 +- aztk/spark/client/cluster/operations.py | 100 +++++++++++++++++++++++- 4 files changed, 110 insertions(+), 12 deletions(-) diff --git a/aztk/client/base/base_operations.py b/aztk/client/base/base_operations.py index c2e53aac..8f2f126e 100644 --- a/aztk/client/base/base_operations.py +++ b/aztk/client/base/base_operations.py @@ -89,14 +89,14 @@ def create_user_on_node(self, username, pool_id, node_id, ssh_key=None, password """ return create_user_on_node.create_user_on_node(self, username, pool_id, node_id, ssh_key, password) - #TODO: rename pool to cluster + #TODO: rename pool to cluster, get rid of nodes parameter def create_user_on_pool(self, username, pool_id, nodes, ssh_pub_key=None, password=None): """Create a user on every node in the cluster Args: username (:obj:`str`): name of the user to create. pool_id (:obj:`str`): id of the cluster to create the user on. - node_id (:obj:`str`): id of the node in the cluster to create the user on. + nodes (:obj:`List[ComputeNode]`): list of nodes to create the user on ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. Defaults to None. password (:obj:`str`, optional): password for the user, must use ssh_key or password. Defaults to None. @@ -174,7 +174,7 @@ def node_run(self, cluster_id, node_id, command, internal, container_name=None, Defaults to None. Returns: - NodeOutput: + NodeOutput: object containing the output of the run command """ return node_run.node_run(self, cluster_id, node_id, command, internal, container_name, timeout) @@ -204,7 +204,7 @@ def run(self, cluster_id, command, internal, container_name=None, timeout=None): Defaults to None. Returns: - List[NodeOutput] + List[NodeOutput]: list of NodeOutput objects containing the output of the run command """ return run.cluster_run(self, cluster_id, command, internal, container_name, timeout) diff --git a/aztk/client/cluster/operations.py b/aztk/client/cluster/operations.py index 39d334f8..40abbc90 100644 --- a/aztk/client/cluster/operations.py +++ b/aztk/client/cluster/operations.py @@ -45,11 +45,11 @@ def copy(self, """Copy files to or from every node in a cluster. Args: - cluster_id (:obj:`str`): the id of the cluster to copy files with - source_path (:obj:`str`): the path of the file to copy from - destination_path (:obj:`str`, optional): the path of the file to copy to. - If None, a SpooledTemporaryFile will be returned, else the file will be written to this path. - Defaults to None. + cluster_id (:obj:`str`): the id of the cluster to copy files with. + source_path (:obj:`str`): the path of the file to copy from. + destination_path (:obj:`str`, optional): the local directory path where the output should be written. + If None, a SpooledTemporaryFile will be returned in the NodeOutput object, else the file will be + written to this path. Defaults to None. container_name (:obj:`str`, optional): the name of the container to copy to or from. If None, the copy operation will occur on the host VM, Defaults to None. 
internal (:obj:`bool`, optional): if True, this will connect to the node using its internal IP. diff --git a/aztk/spark/client/base/operations.py b/aztk/spark/client/base/operations.py index 5395a5b1..548f8da9 100644 --- a/aztk/spark/client/base/operations.py +++ b/aztk/spark/client/base/operations.py @@ -49,9 +49,9 @@ def generate_application_task(self, container_id, application, remote=False): Args: container_id (:obj:`str`): the id of the container to run the application in application (:obj:`aztk.spark.models.ApplicationConfiguration): the Application Definition - remote (:obj`bool`): If True, the application file will not be uploaded, it is assumed to be reachable + remote (:obj:`bool`): If True, the application file will not be uploaded, it is assumed to be reachable by the cluster already. This is useful when your application is stored in a mounted Azure File Share - and not the client.Defaults to False. + and not the client. Defaults to False. Returns: azure.batch.models.TaskAddParameter: the Task definition for the Application. diff --git a/aztk/spark/client/cluster/operations.py b/aztk/spark/client/cluster/operations.py index 903ce3e5..d0451a8b 100644 --- a/aztk/spark/client/cluster/operations.py +++ b/aztk/spark/client/cluster/operations.py @@ -56,7 +56,9 @@ def submit(self, id: str, application: models.ApplicationConfiguration, remote: Args: id (:obj:`str`): the id of the cluster to submit the application to. application (:obj:`aztk.spark.models.ApplicationConfiguration`): Application definition - remote (:obj:`bool`, optional): Defaults to False. + remote (:obj:`bool`): If True, the application file will not be uploaded, it is assumed to be reachable + by the cluster already. This is useful when your application is stored in a mounted Azure File Share + and not the client. Defaults to False. wait (:obj:`bool`, optional): If True, this function blocks until the application has completed. Defaults to False. Returns: @@ -65,15 +67,66 @@ def submit(self, id: str, application: models.ApplicationConfiguration, remote: return submit.submit(self, id, application, remote, wait) def create_user(self, id: str, username: str, password: str = None, ssh_key: str = None): + """Create a user on every node in the cluster + + Args: + username (:obj:`str`): name of the user to create. + pool_id (:obj:`str`): id of the cluster to create the user on. + ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. Defaults to None. + password (:obj:`str`, optional): password for the user, must use ssh_key or password. Defaults to None. + + Returns: + None + """ return create_user.create_user(self, id, username, ssh_key, password) def get_application_status(self, id: str, application_name: str): + """Get the status of a submitted application + + Args: + id (:obj:`str`): the name of the cluster the application was submitted to + application_name (:obj:`str`): the name of the application to get + + Returns: + str: the status state of the application + """ return get_application_status.get_application_status(self, id, application_name) def run(self, id: str, command: str, host=False, internal: bool = False, timeout=None): + """Run a bash command on every node in the cluster + + Args: + cluster_id (:obj:`str`): the id of the cluster to run the command on. + command (:obj:`str`): the bash command to execute on the node. + internal (:obj:`bool`): if true, this will connect to the node using its internal IP. 
+ Only use this if running within the same VNET as the cluster. Defaults to False. + container_name=None (:obj:`str`, optional): the name of the container to run the command in. + If None, the command will run on the host VM. Defaults to None. + timeout=None (:obj:`str`, optional): The timeout in seconds for establishing a connection to the node. + Defaults to None. + + Returns: + List[NodeOutput]: list of NodeOutput objects containing the output of the run command + """ return run.cluster_run(self, id, command, host, internal, timeout) def node_run(self, id: str, node_id: str, command: str, host=False, internal: bool = False, timeout=None): + """Run a bash command on the given node + + Args: + cluster_id (:obj:`str`): the id of the cluster to run the command on. + node_id (:obj:`str`): the id of the node in the cluster to run the command on. + command (:obj:`str`): the bash command to execute on the node. + internal (:obj:`bool`): if True, this will connect to the node using its internal IP. + Only use this if running within the same VNET as the cluster. Defaults to False. + container_name=None (:obj:`str`, optional): the name of the container to run the command in. + If None, the command will run on the host VM. Defaults to None. + timeout=None (:obj:`str`, optional): The timeout in seconds for establishing a connection to the node. + Defaults to None. + + Returns: + NodeOutput: object containing the output of the run command + """ return node_run.node_run(self, id, node_id, command, host, internal, timeout) def copy(self, @@ -83,6 +136,22 @@ def copy(self, host: bool = False, internal: bool = False, timeout: int = None): + """Copy a file to every node in a cluster. + + Args: + cluster_id (:obj:`str`): the id of the cluster to copy files with. + source_path (:obj:`str`): the local path of the file to copy. + destination_path (:obj:`str`, optional): the path on each node the file is copied to. + container_name (:obj:`str`, optional): the name of the container to copy to or from. + If None, the copy operation will occur on the host VM, Defaults to None. + internal (:obj:`bool`, optional): if True, this will connect to the node using its internal IP. + Only use this if running within the same VNET as the cluster. Defaults to False. + timeout (:obj:`int`, optional): The timeout in seconds for establishing a connection to the node. + Defaults to None. + + Returns: + List[NodeOutput]: A list of NodeOutput objects representing the output of the copy command. + """ return copy.cluster_copy(self, id, source_path, destination_path, host, internal, timeout) def download(self, @@ -92,7 +161,36 @@ def download(self, host: bool = False, internal: bool = False, timeout: int = None): + """Download a file from every node in a cluster. + + Args: + cluster_id (:obj:`str`): the id of the cluster to copy files with. + source_path (:obj:`str`): the path of the file to copy from. + destination_path (:obj:`str`, optional): the local directory path where the output should be written. + If None, a SpooledTemporaryFile will be returned in the NodeOutput object, else the file will be + written to this path. Defaults to None. + container_name (:obj:`str`, optional): the name of the container to copy to or from. + If None, the copy operation will occur on the host VM, Defaults to None. + internal (:obj:`bool`, optional): if True, this will connect to the node using its internal IP. + Only use this if running within the same VNET as the cluster. Defaults to False. 
+ timeout (:obj:`int`, optional): The timeout in seconds for establishing a connection to the node. + Defaults to None. + + Returns: + List[NodeOutput]: A list of NodeOutput objects representing the output of the copy command. + """ return download.cluster_download(self, id, source_path, destination_path, host, internal, timeout) def diagnostics(self, id, output_directory=None): + """Run diagnostics on every node in a cluster and download the results. + + Args: + id (:obj:`str`): the id of the cluster to run diagnostics on. + output_directory (:obj:`str`, optional): the local directory path where the output should be written. + If None, a SpooledTemporaryFile will be returned in the NodeOutput object, else the file will be + written to this path. Defaults to None. + + Returns: + List[NodeOutput]: A list of NodeOutput objects representing the output of the copy command. + """ return diagnostics.run_cluster_diagnostics(self, id, output_directory)
From b4c13e143aff28bb7abcfb58aaf99d778ab5c1c2 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Tue, 10 Jul 2018 15:02:08 -0700 Subject: [PATCH 23/52] docstrings, fixes --- aztk/spark/client/cluster/operations.py | 4 +- aztk/spark/client/job/operations.py | 77 +++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 2 deletions(-)
diff --git a/aztk/spark/client/cluster/operations.py b/aztk/spark/client/cluster/operations.py index d0451a8b..c9374294 100644 --- a/aztk/spark/client/cluster/operations.py +++ b/aztk/spark/client/cluster/operations.py @@ -24,8 +24,8 @@ def delete(self, id: str, keep_logs: bool = False): Args: id (:obj:`str`): the id of the cluster to delete. - keep_logs (:obj:`bool`): if True, this function will block until the cluster creation is finished. - + keep_logs (:obj:`bool`): If True, the logs related to this cluster in Azure Storage are not deleted. + Defaults to False. Returns: True if the deletion process was successful. """
diff --git a/aztk/spark/client/job/operations.py b/aztk/spark/client/job/operations.py index 76e6393b..1f4b1d2b 100644 --- a/aztk/spark/client/job/operations.py +++ b/aztk/spark/client/job/operations.py @@ -8,30 +8,107 @@ class JobOperations(CoreJobOperations, SparkBaseOperations): def list(self): + """List all jobs. + + Returns: + List[Job]: List of aztk.models.Job objects each representing the state and configuration of the job. + """ return list.list_jobs(self) def delete(self, id, keep_logs: bool = False): + """Delete a job. + + Args: + id (:obj:`str`): the id of the cluster to delete. + keep_logs (:obj:`bool`): If True, the logs related to this cluster in Azure Storage are not deleted. + Defaults to False. + Returns: + True if the deletion process was successful. + """ return delete.delete(self, id, keep_logs) def get(self, id): + """Get details about the state of a job. + + Args: + id (:obj:`str`): the id of the cluster to get. + + Returns: + Cluster: An aztk.models.Cluster object representing the state and configuration of the cluster.
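A short, hypothetical usage sketch of the diagnostics operation documented above; the cluster id and output path are placeholders, and the client is assumed to be authenticated elsewhere.

```python
def collect_diagnostics(client, cluster_id="my-cluster"):
    # `client` is expected to be an authenticated aztk.spark.Client.
    outputs = client.cluster.diagnostics(id=cluster_id, output_directory="./diag")
    for node_output in outputs:
        # NodeOutput is assumed to expose the node id and any per-node error.
        print(node_output.id, node_output.error or "ok")
```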
+ """ return get.get_job(self, id) def get_application(self, id, application_name): + """Get information on a submitted application + + Args: + id (:obj:`str`): the name of the cluster the application was submitted to + application_name (:obj:`str`): the name of the application to get + + Returns: + aztk.spark.models.Application: object representing that state and output of an application + """ return get_application.get_application(self, id, application_name) def get_application_log(self, id, application_name): + """Get the log for a running or completed application + + Args: + id (:obj:`str`): the id of the cluster to run the command on. + application_name (:obj:`str`): str + + Returns: + aztk.spark.models.ApplicationLog: a model representing the output of the application. + """ return get_application_log.get_job_application_log(self, id, application_name) def list_applications(self, id): + """List all application defined as a part of a job + + Args: + id (:obj:`str`): the id of the job to list the applications of + + Returns: + List[aztk.spark.models.Application]: a list of all applications defined as a part of the job + """ return list_applications.list_applications(self, id) def stop(self, id): + """Stop a submitted job + + Args: + id (:obj:`str`): the id of the job to stop + + Returns: + None + """ return stop.stop(self, id) def stop_application(self, id, application_name): + """Stops a submitted application + + Args: + id (:obj:`str`): the id of the job the application belongs to + application_name (:obj:`str`): the name of the application to stop + + Returns: + bool: True if the stop was successful, else False + """ return stop_application.stop_app(self, id, application_name) def submit(self, job_configuration: models.JobConfiguration): + """Submit a job + + Jobs are a cluster definition and one or many application definitions which run on the cluster. The job's + cluster will be allocated and configured, then the applications will be executed with their output stored + in Azure Storage. When all applications have completed, the cluster will be automatically deleted. + + Args: + job_configuration (:obj:`aztk.models.JobConfiguration`): Model defining the job's configuration. + + Returns: + aztk.spark.models.Job: Model representing the state of the job. + """ return submit.submit_job(self, job_configuration) def wait_until_job_finished(self, id): #TODO: rename to something better From 5769e713b969cb5f87e6993a6abaeed2bccb99c3 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Tue, 10 Jul 2018 16:15:27 -0700 Subject: [PATCH 24/52] fix the sdk documentation, bugs --- aztk/client/client.py | 10 +++++++ aztk/client/cluster/operations.py | 14 ++++----- aztk/spark/client/client.py | 6 ++++ aztk/spark/client/cluster/operations.py | 22 +++++++------- aztk/spark/client/job/operations.py | 38 ++++++++++++------------- docs/aztk.models.rst | 1 + docs/aztk.rst | 14 ++++++++- docs/aztk.spark.rst | 11 +++++-- 8 files changed, 76 insertions(+), 40 deletions(-) diff --git a/aztk/client/client.py b/aztk/client/client.py index 48fc0f28..d0735fa5 100644 --- a/aztk/client/client.py +++ b/aztk/client/client.py @@ -20,6 +20,16 @@ class CoreClient: + """The base AZTK client that all other clients inherit from. + + **This client should not be used directly. 
Only software specific clients + should be used.** + + Attributes: + cluster (:obj:`aztk.client.cluster.CoreClusterOperations`): Cluster + job (:obj:`aztk.client.job.CoreJobOperations`): Job + """ + # TODO: remove ability to specify secrets_config in 0.10.0 def __init__(self, secrets_configuration: models.SecretsConfiguration = None, **kwargs): self.secrets_configuration = None diff --git a/aztk/client/cluster/operations.py b/aztk/client/cluster/operations.py index 40abbc90..02e3a51e 100644 --- a/aztk/client/cluster/operations.py +++ b/aztk/client/cluster/operations.py @@ -8,16 +8,16 @@ class CoreClusterOperations(BaseOperations): def create(self, cluster_configuration: ClusterConfiguration, software_metadata_key: str, start_task, vm_image_model): - """Create a cluster + """Create a cluster. Args: - cluster_configuration (:obj:`ClusterConfiguration`): Configuration for the cluster to be created + cluster_configuration (:obj:`aztk.models.ClusterConfiguration`): Configuration for the cluster to be created software_metadata_key (:obj:`str`): the key for the primary software that will be run on the cluster start_task (:obj:`azure.batch.models.StartTask`): Batch StartTask defintion to configure the Batch Pool vm_image_model (:obj:`azure.batch.models.VirtualMachineConfiguration`): Configuration of the virtual machine image and settings Returns: - Cluster: An aztk.models.Cluster object representing the state and configuration of the cluster. + :obj:`aztk.models.Cluster`: A Cluster object representing the state and configuration of the cluster. """ return create.create_pool_and_job(self, cluster_configuration, software_metadata_key, start_task, vm_image_model) @@ -29,7 +29,7 @@ def get(self, cluster_id: str): cluster_id (:obj:`str`): the id of the cluster to get. Returns: - Cluster: An aztk.models.Cluster object representing the state and configuration of the cluster. + :obj:`aztk.models.Cluster`: A Cluster object representing the state and configuration of the cluster. """ return get.get_pool_details(self, cluster_id) @@ -60,7 +60,7 @@ def copy(self, Defaults to None. Returns: - List[NodeOutput]: A list of NodeOutput objects representing the output of the copy command. + :obj:`List[aztk.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command. """ return copy.cluster_copy(self, cluster_id, source_path, destination_path, container_name, internal, get, timeout) @@ -75,7 +75,7 @@ def delete(self, pool_id: str, keep_logs: bool = False): Defaults to False. Returns: - List[NodeOutput]: A list of NodeOutput objects representing the output of the copy command. + :obj:`List[aztk.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command. """ return delete.delete_pool_and_job(self, pool_id, keep_logs) @@ -87,6 +87,6 @@ def list(self, software_metadata_key): This filters out non-aztk clusters and aztk clusters running other software. 
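Since the CoreClient docstring above warns against direct use, a brief sketch of the intended pattern may help; the secrets values here are placeholders, not working credentials.

```python
import aztk.spark
from aztk.models import SecretsConfiguration

def make_spark_client():
    # Placeholder secrets: real Batch and Storage credentials must be filled
    # in before validation will pass.
    secrets = SecretsConfiguration()
    client = aztk.spark.Client(secrets)
    # Operations are grouped under attributes, mirroring the refactor:
    return client.cluster.list(), client.job.list()
```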
Returns: - List[Cluster]: list of clusters running the software defined by software_metadata_key + :obj:`List[aztk.models.Cluster]`: list of clusters running the software defined by software_metadata_key """ return list.list_clusters(self, software_metadata_key)
diff --git a/aztk/spark/client/client.py b/aztk/spark/client/client.py index 143256eb..14a7d7ad 100644 --- a/aztk/spark/client/client.py +++ b/aztk/spark/client/client.py @@ -19,6 +19,12 @@ class Client(CoreClient): + """The client used to create and manage Spark clusters + + Attributes: + cluster (:obj:`aztk.spark.client.cluster.ClusterOperations`): Cluster + job (:obj:`aztk.spark.client.job.JobOperations`): Job + """ def __init__(self, secrets_configuration: models.SecretsConfiguration, **kwargs): self.secrets_configuration = None context = None
diff --git a/aztk/spark/client/cluster/operations.py b/aztk/spark/client/cluster/operations.py index c9374294..1fde2ce7 100644 --- a/aztk/spark/client/cluster/operations.py +++ b/aztk/spark/client/cluster/operations.py @@ -15,7 +15,7 @@ def create(self, cluster_configuration: models.ClusterConfiguration, wait: bool wait (:obj:`bool`): if True, this function will block until the cluster creation is finished. Returns: - Cluster: An aztk.models.Cluster object representing the state and configuration of the cluster. + :obj:`aztk.spark.models.Cluster`: A Cluster object representing the state and configuration of the cluster. """ return create.create_cluster(self, cluster_configuration, wait) @@ -27,7 +27,7 @@ def delete(self, id: str, keep_logs: bool = False): keep_logs (:obj:`bool`): If True, the logs related to this cluster in Azure Storage are not deleted. Defaults to False. Returns: - True if the deletion process was successful. + :obj:`bool`: True if the deletion process was successful. """ return delete.delete_cluster(self, id, keep_logs) @@ -38,7 +38,7 @@ def get(self, id: str): id (:obj:`str`): the id of the cluster to get. Returns: - Cluster: An aztk.models.Cluster object representing the state and configuration of the cluster. + :obj:`aztk.spark.models.Cluster`: A Cluster object representing the state and configuration of the cluster. """ return get.get_cluster(self, id) @@ -46,7 +46,7 @@ def list(self): """List all clusters. Returns: - List[Cluster]: List of aztk.models.Cluster objects each representing the state and configuration of the cluster. + :obj:`List[aztk.spark.models.Cluster]`: List of Cluster objects each representing the state and configuration of the cluster. """ return list.list_clusters(self) @@ -62,7 +62,7 @@ def submit(self, id: str, application: models.ApplicationConfiguration, remote: wait (:obj:`bool`, optional): If True, this function blocks until the application has completed. Defaults to False. Returns: - Cluster: An aztk.models.Cluster object representing the state and configuration of the cluster. + None """ return submit.submit(self, id, application, remote, wait) @@ -88,7 +88,7 @@ def get_application_status(self, id: str, application_name: str): application_name (:obj:`str`): the name of the application to get Returns: - str: the status state of the application + :obj:`str`: the status state of the application """ return get_application_status.get_application_status(self, id, application_name) @@ -106,7 +106,7 @@ def run(self, id: str, command: str, host=False, internal: bool = False, timeout Defaults to None.
Returns: - List[NodeOutput]: list of NodeOutput objects containing the output of the run command + :obj:`List[aztk.spark.models.NodeOutput]`: list of NodeOutput objects containing the output of the run command """ return run.cluster_run(self, id, command, host, internal, timeout) @@ -125,7 +125,7 @@ def node_run(self, id: str, node_id: str, command: str, host=False, internal: bo Defaults to None. Returns: - NodeOutput: object containing the output of the run command + :obj:`aztk.spark.models.NodeOutput`: object containing the output of the run command """ return node_run.node_run(self, id, node_id, command, host, internal, timeout) @@ -150,7 +150,7 @@ def copy(self, Defaults to None. Returns: - List[NodeOutput]: A list of NodeOutput objects representing the output of the copy command. + :obj:`List[aztk.spark.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command. """ return copy.cluster_copy(self, id, source_path, destination_path, host, internal, timeout) @@ -177,7 +177,7 @@ def download(self, Defaults to None. Returns: - List[NodeOutput]: A list of NodeOutput objects representing the output of the copy command. + :obj:`List[aztk.spark.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command. """ return download.cluster_download(self, id, source_path, destination_path, host, internal, timeout) @@ -191,6 +191,6 @@ def diagnostics(self, id, output_directory=None): written to this path. Defaults to None. Returns: - List[NodeOutput]: A list of NodeOutput objects representing the output of the copy command. + :obj:`List[aztk.spark.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command. """ return diagnostics.run_cluster_diagnostics(self, id, output_directory) diff --git a/aztk/spark/client/job/operations.py b/aztk/spark/client/job/operations.py index 1f4b1d2b..d55150fe 100644 --- a/aztk/spark/client/job/operations.py +++ b/aztk/spark/client/job/operations.py @@ -11,7 +11,7 @@ def list(self): """List all jobs. Returns: - List[Job]: List of aztk.models.Job objects each representing the state and configuration of the job. + :obj:`List[Job]`: List of aztk.models.Job objects each representing the state and configuration of the job. """ return list.list_jobs(self) @@ -19,11 +19,11 @@ def delete(self, id, keep_logs: bool = False): """Delete a job. Args: - id (:obj:`str`): the id of the cluster to delete. - keep_logs (:obj:`bool`): If True, the logs related to this cluster in Azure Storage are not deleted. + id (:obj:`str`): the id of the job to delete. + keep_logs (:obj:`bool`): If True, the logs related to this job in Azure Storage are not deleted. Defaults to False. Returns: - True if the deletion process was successful. + :obj:`bool`: True if the deletion process was successful. """ return delete.delete(self, id, keep_logs) @@ -31,10 +31,10 @@ def get(self, id): """Get details about the state of a job. Args: - id (:obj:`str`): the id of the cluster to get. + id (:obj:`str`): the id of the job to get. Returns: - Cluster: An aztk.models.Cluster object representing the state and configuration of the cluster. + :obj:`aztk.spark.models.job`: A job object representing the state and configuration of the job. 
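The copy and download docstrings above describe a SpooledTemporaryFile fallback; the sketch below shows how a caller might consume it. The cluster id and source path are placeholders, and the NodeOutput attribute names are assumptions based on the docstrings in this series.

```python
def download_results(client, cluster_id="my-cluster"):
    # Sketch: with destination_path omitted, each NodeOutput is assumed to
    # carry a SpooledTemporaryFile in its `output` attribute.
    results = client.cluster.download(
        id=cluster_id, source_path="/home/spark/result.csv")
    for result in results:
        if result.error is None:
            result.output.seek(0)    # rewind the in-memory spooled file
            print(result.id, len(result.output.read()), "bytes")
```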
""" return get.get_job(self, id) @@ -42,11 +42,11 @@ def get_application(self, id, application_name): """Get information on a submitted application Args: - id (:obj:`str`): the name of the cluster the application was submitted to + id (:obj:`str`): the name of the job the application was submitted to application_name (:obj:`str`): the name of the application to get Returns: - aztk.spark.models.Application: object representing that state and output of an application + :obj:`aztk.spark.models.Application`: object representing that state and output of an application """ return get_application.get_application(self, id, application_name) @@ -54,22 +54,22 @@ def get_application_log(self, id, application_name): """Get the log for a running or completed application Args: - id (:obj:`str`): the id of the cluster to run the command on. - application_name (:obj:`str`): str + id (:obj:`str`): the id of the job the application was submitted to. + application_name (:obj:`str`): the name of the application to get the log of Returns: - aztk.spark.models.ApplicationLog: a model representing the output of the application. + :obj:`aztk.spark.models.ApplicationLog`: a model representing the output of the application. """ return get_application_log.get_job_application_log(self, id, application_name) def list_applications(self, id): """List all application defined as a part of a job - + Args: id (:obj:`str`): the id of the job to list the applications of - + Returns: - List[aztk.spark.models.Application]: a list of all applications defined as a part of the job + :obj:`List[aztk.spark.models.Application]`: a list of all applications defined as a part of the job """ return list_applications.list_applications(self, id) @@ -78,7 +78,7 @@ def stop(self, id): Args: id (:obj:`str`): the id of the job to stop - + Returns: None """ @@ -90,9 +90,9 @@ def stop_application(self, id, application_name): Args: id (:obj:`str`): the id of the job the application belongs to application_name (:obj:`str`): the name of the application to stop - + Returns: - bool: True if the stop was successful, else False + :obj:`bool`: True if the stop was successful, else False """ return stop_application.stop_app(self, id, application_name) @@ -104,10 +104,10 @@ def submit(self, job_configuration: models.JobConfiguration): in Azure Storage. When all applications have completed, the cluster will be automatically deleted. Args: - job_configuration (:obj:`aztk.models.JobConfiguration`): Model defining the job's configuration. + job_configuration (:obj:`aztk.spark.models.JobConfiguration`): Model defining the job's configuration. Returns: - aztk.spark.models.Job: Model representing the state of the job. + :obj:`aztk.spark.models.Job`: Model representing the state of the job. """ return submit.submit_job(self, job_configuration) diff --git a/docs/aztk.models.rst b/docs/aztk.models.rst index d3deb6aa..4d6e185e 100644 --- a/docs/aztk.models.rst +++ b/docs/aztk.models.rst @@ -6,3 +6,4 @@ aztk.models package :members: :show-inheritance: :imported-members: + :undoc-members: diff --git a/docs/aztk.rst b/docs/aztk.rst index b9c5c3e4..1ccbe595 100644 --- a/docs/aztk.rst +++ b/docs/aztk.rst @@ -9,7 +9,19 @@ aztk package aztk.client module ------------------ -.. autoclass:: aztk.client.Client +.. autoclass:: aztk.client.CoreClient + :members: + :undoc-members: + :show-inheritance: + + +.. autoclass:: aztk.client.cluster.CoreClusterOperations + :members: + :undoc-members: + :show-inheritance: + + +.. 
autoclass:: aztk.client.job.CoreJobOperations :members: :undoc-members: :show-inheritance: diff --git a/docs/aztk.spark.rst b/docs/aztk.spark.rst index 706cbc62..ea04b011 100644 --- a/docs/aztk.spark.rst +++ b/docs/aztk.spark.rst @@ -8,12 +8,19 @@ aztk.spark package aztk.spark.client module ------------------------ -.. automodule:: aztk.spark.client +.. autoclass:: aztk.spark.client.Client :members: :undoc-members: :show-inheritance: -.. automodule:: aztk.spark + +.. autoclass:: aztk.spark.client.cluster.ClusterOperations :members: :undoc-members: :show-inheritance: + + +.. autoclass:: aztk.spark.client.job.JobOperations + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file From 2342326cf7bab56d2a7dde2932ef02def556e61f Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Tue, 10 Jul 2018 16:16:50 -0700 Subject: [PATCH 25/52] fix method call --- aztk/spark/client/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aztk/spark/client/client.py b/aztk/spark/client/client.py index 14a7d7ad..7291738d 100644 --- a/aztk/spark/client/client.py +++ b/aztk/spark/client/client.py @@ -30,7 +30,7 @@ def __init__(self, secrets_configuration: models.SecretsConfiguration, **kwargs) context = None if kwargs.get("secrets_config"): # TODO: add deprecated warning - context = self.get_context(kwargs.get("secrets_config")) + context = self._get_context(kwargs.get("secrets_config")) else: context = self._get_context(secrets_configuration) self.cluster = ClusterOperations(context) From b3af9ecfb8c8d23352d9eb0ea4d2d2b09d141e95 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Wed, 11 Jul 2018 17:33:53 -0700 Subject: [PATCH 26/52] pool_id->id --- aztk/client/base/base_operations.py | 79 ++++++++++--------- .../base/helpers/create_user_on_node.py | 12 +-- .../base/helpers/create_user_on_pool.py | 4 +- .../base/helpers/delete_user_on_pool.py | 4 +- .../base/helpers/generate_user_on_node.py | 2 +- .../base/helpers/generate_user_on_pool.py | 4 +- aztk/client/base/helpers/run.py | 2 +- aztk/client/client.py | 20 ++++- aztk/client/cluster/helpers/copy.py | 2 +- .../spark/endpoints/cluster/cluster_submit.py | 2 +- .../sdk/cluster/test_cluster_deprecated.py | 4 +- 11 files changed, 78 insertions(+), 57 deletions(-) diff --git a/aztk/client/base/base_operations.py b/aztk/client/base/base_operations.py index 8f2f126e..39f73c97 100644 --- a/aztk/client/base/base_operations.py +++ b/aztk/client/base/base_operations.py @@ -11,9 +11,12 @@ class BaseOperations: """Base operations that all other operations inherit from Attributes: - batch_client (:obj:`azure.batch.batch_service_client.BatchServiceClient`): Client used to interact with the Azure Batch service. - blob_client (:obj:`azure.storage.blob.BlockBlobService`): Client used to interact with the Azure Storage Blob service. - secrets_configuration (:obj:`aztk.models.SecretsConfiguration`): Model that holds AZTK secrets used to authenticate with Azure and the clusters. + batch_client (:obj:`azure.batch.batch_service_client.BatchServiceClient`): Client used to interact with the + Azure Batch service. + blob_client (:obj:`azure.storage.blob.BlockBlobService`): Client used to interact with the Azure Storage + Blob service. + secrets_configuration (:obj:`aztk.models.SecretsConfiguration`): Model that holds AZTK secrets used to authenticate + with Azure and the clusters. 
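The BaseOperations attributes documented above are populated from a shared context dict; a standalone sketch of that wiring, with stand-in client objects rather than real Azure SDK instances, looks like this.

```python
# Minimal, self-contained sketch of the context-passing pattern used by the
# refactor; the client classes below are placeholders.
class FakeBatchClient:
    pass

class FakeBlobClient:
    pass

context = {
    'batch_client': FakeBatchClient(),
    'blob_client': FakeBlobClient(),
    'secrets_configuration': None,  # would be an aztk.models.SecretsConfiguration
}

class DemoOperations:
    """Mirrors the BaseOperations constructor shown in these patches."""

    def __init__(self, context):
        self.batch_client = context['batch_client']
        self.blob_client = context['blob_client']
        self.secrets_configuration = context['secrets_configuration']

ops = DemoOperations(context)
```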
""" def __init__(self, context): @@ -25,10 +28,11 @@ def get_cluster_config(self, cluster_id: str) -> models.ClusterConfiguration: """Open an ssh tunnel to a node Args: - pool_id (:obj:`str`): the id of the cluster the node is in + cluster_id (:obj:`str`): the id of the cluster the node is in node_id (:obj:`str`): the id of the node to open the ssh tunnel to username (:obj:`str`): the username to authenticate the ssh session - ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. Defaults to None. + ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key + or password. Defaults to None. password (:obj:`str`, optional): password for the user, must use ssh_key or password. Defaults to None. port_forward_list (:obj:`List[PortForwardingSpecification`, optional): list of PortForwardingSpecifications. The defined ports will be forwarded to the client. @@ -50,9 +54,8 @@ def get_cluster_data(self, cluster_id: str) -> cluster_data.ClusterData: """ return cluster_data.ClusterData(self.blob_client, cluster_id) - #TODO: rename pool to cluster def ssh_into_node(self, - pool_id, + id, node_id, username, ssh_key=None, @@ -62,7 +65,7 @@ def ssh_into_node(self, """Open an ssh tunnel to a node Args: - pool_id (:obj:`str`): the id of the cluster the node is in + id (:obj:`str`): the id of the cluster the node is in node_id (:obj:`str`): the id of the node to open the ssh tunnel to username (:obj:`str`): the username to authenticate the ssh session ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. Defaults to None. @@ -74,28 +77,27 @@ def ssh_into_node(self, Returns: None """ - ssh_into_node.ssh_into_node(self, pool_id, node_id, username, ssh_key, password, port_forward_list, internal) + ssh_into_node.ssh_into_node(self, id, node_id, username, ssh_key, password, port_forward_list, internal) - #TODO: rename pool to cluster - def create_user_on_node(self, username, pool_id, node_id, ssh_key=None, password=None): + def create_user_on_node(self, id, node_id, username, ssh_key=None, password=None): """Create a user on a node Args: - username (:obj:`str`): name of the user to create. - pool_id (:obj:`str`): id of the cluster to create the user on. + id (:obj:`str`): id of the cluster to create the user on. node_id (:obj:`str`): id of the node in the cluster to create the user on. + username (:obj:`str`): name of the user to create. ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. password (:obj:`str`, optional): password for the user, must use ssh_key or password. """ - return create_user_on_node.create_user_on_node(self, username, pool_id, node_id, ssh_key, password) + return create_user_on_node.create_user_on_node(self, id, node_id, username, ssh_key, password) #TODO: rename pool to cluster, get rid of nodes parameter - def create_user_on_pool(self, username, pool_id, nodes, ssh_pub_key=None, password=None): + def create_user_on_pool(self, id, nodes, username, ssh_pub_key=None, password=None): """Create a user on every node in the cluster Args: username (:obj:`str`): name of the user to create. - pool_id (:obj:`str`): id of the cluster to create the user on. + id (:obj:`str`): id of the cluster to create the user on. nodes (:obj:`List[ComputeNode]`): list of nodes to create the user on ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. Defaults to None. 
password (:obj:`str`, optional): password for the user, must use ssh_key or password. Defaults to None. @@ -103,67 +105,66 @@ def create_user_on_pool(self, username, pool_id, nodes, ssh_pub_key=None, passwo Returns: None """ - return create_user_on_pool.create_user_on_pool(self, username, pool_id, nodes, ssh_pub_key, password) + return create_user_on_pool.create_user_on_pool(self, id, nodes, username, ssh_pub_key, password) - #TODO: rename pool to cluster - def generate_user_on_node(self, pool_id, node_id): + def generate_user_on_node(self, id, node_id): """Create a user with an autogenerated username and ssh_key on the given node. Args: - pool_id (:obj:`str`): the id of the cluster to generate the user on. + id (:obj:`str`): the id of the cluster to generate the user on. node_id (:obj:`str`): the id of the node in the cluster to generate the user on. Returns: tuple: A tuple of the form (username (:obj:`str`), ssh_key) where ssh_key is a Cryptodome.RSA key. """ - return generate_user_on_node.generate_user_on_node(self, pool_id, node_id) + return generate_user_on_node.generate_user_on_node(self, id, node_id) #TODO: rename pool to cluster - def generate_user_on_pool(self, pool_id, nodes): + def generate_user_on_pool(self, id, nodes): """Create a user with an autogenerated username and ssh_key on the cluster Args: - pool_id (:obj:`str`): the id of the cluster to generate the user on. + id (:obj:`str`): the id of the cluster to generate the user on. node_id (:obj:`str`): the id of the node in the cluster to generate the user on. Returns: tuple: A tuple of the form (username (:obj:`str`), ssh_key) where ssh_key is a Cryptodome.RSA key. """ - return generate_user_on_pool.generate_user_on_pool(self, pool_id, nodes) + return generate_user_on_pool.generate_user_on_pool(self, id, nodes) #TODO: rename pool to cluster - def delete_user_on_node(self, pool_id: str, node_id: str, username: str) -> str: + def delete_user_on_node(self, id: str, node_id: str, username: str) -> str: """Delete a user on a node Args: - pool_id (:obj:`str`): the id of the cluster to delete the user on. + id (:obj:`str`): the id of the cluster to delete the user on. node_id (:obj:`str`): the id of the node in the cluster to delete the user on. username (:obj:`str`): the name of the user to delete. Returns: None """ - return delete_user_on_node.delete_user(self, pool_id, node_id, username) + return delete_user_on_node.delete_user(self, id, node_id, username) #TODO: rename pool to cluster - def delete_user_on_pool(self, username, pool_id, nodes): + def delete_user_on_pool(self, username, id, nodes): """Delete a user on every node in the cluster Args: - pool_id (:obj:`str`): the id of the cluster to delete the user on. + id (:obj:`str`): the id of the cluster to delete the user on. node_id (:obj:`str`): the id of the node in the cluster to delete the user on. username (:obj:`str`): the name of the user to delete. Returns: None """ - return delete_user_on_pool.delete_user_on_pool(self, username, pool_id, nodes) + return delete_user_on_pool.delete_user_on_pool(self, username, id, nodes) - def node_run(self, cluster_id, node_id, command, internal, container_name=None, timeout=None): + def node_run(self, id, node_id, command, internal, container_name=None, timeout=None): """Run a bash command on the given node Args: - cluster_id (:obj:`str`): the id of the cluster to run the command on. + id (:obj:`str`): the id of the cluster to run the command on. node_id (:obj:`str`): the id of the node in the cluster to run the command on. 
command (:obj:`str`): the bash command to execute on the node. internal (:obj:`bool`): if True, this will connect to the node using its internal IP. @@ -176,7 +177,7 @@ def node_run(self, cluster_id, node_id, command, internal, container_name=None, Returns: NodeOutput: object containing the output of the run command """ - return node_run.node_run(self, cluster_id, node_id, command, internal, container_name, timeout) + return node_run.node_run(self, id, node_id, command, internal, container_name, timeout) def get_remote_login_settings(self, id: str, node_id: str): """Get the remote login information for a node in a cluster @@ -190,11 +191,11 @@ def get_remote_login_settings(self, id: str, node_id: str): """ return get_remote_login_settings.get_remote_login_settings(self, id, node_id) - def run(self, cluster_id, command, internal, container_name=None, timeout=None): + def run(self, id, command, internal, container_name=None, timeout=None): """Run a bash command on every node in the cluster Args: - cluster_id (:obj:`str`): the id of the cluster to run the command on. + id (:obj:`str`): the id of the cluster to run the command on. command (:obj:`str`): the bash command to execute on the node. internal (:obj:`bool`): if true, this will connect to the node using its internal IP. Only use this if running within the same VNET as the cluster. Defaults to False. @@ -206,13 +207,13 @@ def run(self, cluster_id, command, internal, container_name=None, timeout=None): Returns: List[NodeOutput]: list of NodeOutput objects containing the output of the run command """ - return run.cluster_run(self, cluster_id, command, internal, container_name, timeout) + return run.cluster_run(self, id, command, internal, container_name, timeout) - def get_application_log(self, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): + def get_application_log(self, id: str, application_name: str, tail=False, current_bytes: int = 0): """Get the log for a running or completed application Args: - cluster_id (:obj:`str`): the id of the cluster to run the command on. + id (:obj:`str`): the id of the cluster to run the command on. application_name (:obj:`str`): str tail (:obj:`bool`, optional): If True, get the remaining bytes after current_bytes. Otherwise, the whole log will be retrieved. Only use this if streaming the log as it is being written. Defaults to False. @@ -222,4 +223,4 @@ def get_application_log(self, cluster_id: str, application_name: str, tail=False Returns: aztk.models.ApplicationLog: a model representing the output of the application. 
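The tail and current_bytes contract documented above implies a client-side polling loop; the following is a sketch under the assumption that ApplicationLog exposes total_bytes and application_state attributes, as elsewhere in aztk.models.

```python
import time

def stream_application_log(base_operations, cluster_id, application_name):
    # Sketch of polling get_application_log until the application completes;
    # not part of the patch itself.
    current_bytes = 0
    while True:
        log = base_operations.get_application_log(
            cluster_id, application_name, tail=True, current_bytes=current_bytes)
        print(log.log, end="")
        current_bytes = log.total_bytes
        if log.application_state == 'completed':
            break
        time.sleep(3)
```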
""" - return get_application_log.get_application_log(self, cluster_id, application_name, tail, current_bytes) + return get_application_log.get_application_log(self, id, application_name, tail, current_bytes) diff --git a/aztk/client/base/helpers/create_user_on_node.py b/aztk/client/base/helpers/create_user_on_node.py index 77b21fab..28c9a9c3 100644 --- a/aztk/client/base/helpers/create_user_on_node.py +++ b/aztk/client/base/helpers/create_user_on_node.py @@ -7,7 +7,7 @@ from aztk.utils import get_ssh_key -def __create_user(self, pool_id: str, node_id: str, username: str, password: str = None, ssh_key: str = None) -> str: +def __create_user(self, id: str, node_id: str, username: str, password: str = None, ssh_key: str = None) -> str: """ Create a pool user :param pool: the pool to add the user to @@ -18,7 +18,7 @@ def __create_user(self, pool_id: str, node_id: str, username: str, password: str """ # Create new ssh user for the given node self.batch_client.compute_node.add_user( - pool_id, + id, node_id, batch_models.ComputeNodeUser( name=username, @@ -30,13 +30,13 @@ def __create_user(self, pool_id: str, node_id: str, username: str, password: str ) -def create_user_on_node(base_client, username, pool_id, node_id, ssh_key=None, password=None): +def create_user_on_node(base_client, id, node_id, username, ssh_key=None, password=None): try: __create_user( - base_client, pool_id=pool_id, node_id=node_id, username=username, ssh_key=ssh_key, password=password) + base_client, id=id, node_id=node_id, username=username, ssh_key=ssh_key, password=password) except batch_error.BatchErrorException as error: try: - base_client.delete_user_on_node(pool_id, node_id, username) - base_client.create_user_on_node(pool_id=pool_id, node_id=node_id, username=username, ssh_key=ssh_key) + base_client.delete_user_on_node(id, node_id, username) + base_client.create_user_on_node(id=id, node_id=node_id, username=username, ssh_key=ssh_key) except batch_error.BatchErrorException as error: raise error diff --git a/aztk/client/base/helpers/create_user_on_pool.py b/aztk/client/base/helpers/create_user_on_pool.py index ca7f50ef..f09a58a5 100644 --- a/aztk/client/base/helpers/create_user_on_pool.py +++ b/aztk/client/base/helpers/create_user_on_pool.py @@ -1,10 +1,10 @@ import concurrent.futures -def create_user_on_pool(base_client, username, pool_id, nodes, ssh_pub_key=None, password=None): +def create_user_on_pool(base_client, id, nodes, username, ssh_pub_key=None, password=None): with concurrent.futures.ThreadPoolExecutor() as executor: futures = { - executor.submit(base_client.create_user_on_node, username, pool_id, node.id, ssh_pub_key, password): node + executor.submit(base_client.create_user_on_node, id, node.id, username, ssh_pub_key, password): node for node in nodes } concurrent.futures.wait(futures) diff --git a/aztk/client/base/helpers/delete_user_on_pool.py b/aztk/client/base/helpers/delete_user_on_pool.py index 24ef4633..051cb6a5 100644 --- a/aztk/client/base/helpers/delete_user_on_pool.py +++ b/aztk/client/base/helpers/delete_user_on_pool.py @@ -1,7 +1,7 @@ import concurrent.futures -def delete_user_on_pool(base_client, username, pool_id, nodes): #TODO: change from pool_id, nodes to cluster_id +def delete_user_on_pool(base_client, id, nodes, username): #TODO: remove nodes param with concurrent.futures.ThreadPoolExecutor() as executor: - futures = [executor.submit(base_client.delete_user_on_node, pool_id, node.id, username) for node in nodes] + futures = [executor.submit(base_client.delete_user_on_node, id, 
node.id, username) for node in nodes] concurrent.futures.wait(futures) diff --git a/aztk/client/base/helpers/generate_user_on_node.py b/aztk/client/base/helpers/generate_user_on_node.py index 8d1517c3..c984f080 100644 --- a/aztk/client/base/helpers/generate_user_on_node.py +++ b/aztk/client/base/helpers/generate_user_on_node.py @@ -7,5 +7,5 @@ def generate_user_on_node(base_client, pool_id, node_id): generated_username = secure_utils.generate_random_string() ssh_key = RSA.generate(2048) ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8') - base_client.create_user_on_node(generated_username, pool_id, node_id, ssh_pub_key) + base_client.create_user_on_node(pool_id, node_id, generated_username, ssh_pub_key) return generated_username, ssh_key diff --git a/aztk/client/base/helpers/generate_user_on_pool.py b/aztk/client/base/helpers/generate_user_on_pool.py index f9c91c10..7452e955 100644 --- a/aztk/client/base/helpers/generate_user_on_pool.py +++ b/aztk/client/base/helpers/generate_user_on_pool.py @@ -5,13 +5,13 @@ from aztk.utils import secure_utils -def generate_user_on_pool(base_client, pool_id, nodes): +def generate_user_on_pool(base_operations, pool_id, nodes): generated_username = secure_utils.generate_random_string() ssh_key = RSA.generate(2048) ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8') with concurrent.futures.ThreadPoolExecutor() as executor: futures = { - executor.submit(base_client.create_user_on_node, generated_username, pool_id, node.id, ssh_pub_key): node + executor.submit(base_operations.create_user_on_node, pool_id, node.id, generated_username, ssh_pub_key): node for node in nodes } concurrent.futures.wait(futures) diff --git a/aztk/client/base/helpers/run.py b/aztk/client/base/helpers/run.py index 9f409994..3c1aa4cc 100644 --- a/aztk/client/base/helpers/run.py +++ b/aztk/client/base/helpers/run.py @@ -33,4 +33,4 @@ def cluster_run(base_client, cluster_id, command, internal, container_name=None, except OSError as exc: raise exc finally: - base_client.delete_user_on_pool(generated_username, pool.id, nodes) + base_client.delete_user_on_pool(pool.id, nodes, generated_username) diff --git a/aztk/client/client.py b/aztk/client/client.py index d0735fa5..2f133a43 100644 --- a/aztk/client/client.py +++ b/aztk/client/client.py @@ -56,10 +56,11 @@ def _get_context(self, secrets_configuration: models.SecretsConfiguration): return context # ALL THE FOLLOWING METHODS ARE DEPRECATED AND WILL BE REMOVED IN 0.10.0 - + @deprecated("0.10.0") def get_cluster_config(self, cluster_id: str) -> models.ClusterConfiguration: return self._get_cluster_data(cluster_id).read_cluster_config() + @deprecated("0.10.0") def _get_cluster_data(self, cluster_id: str) -> cluster_data.ClusterData: """ Returns ClusterData object to manage data related to the given cluster id @@ -70,6 +71,7 @@ def _get_cluster_data(self, cluster_id: str) -> cluster_data.ClusterData: General Batch Operations ''' + @deprecated("0.10.0") def __delete_pool_and_job(self, pool_id: str, keep_logs: bool = False): """ Delete a pool and it's associated job @@ -99,6 +101,7 @@ def __delete_pool_and_job(self, pool_id: str, keep_logs: bool = False): return job_exists or pool_exists + @deprecated("0.10.0") def __create_pool_and_job(self, cluster_conf: models.ClusterConfiguration, software_metadata_key: str, start_task, VmImageModel): """ Create a pool and job @@ -160,6 +163,7 @@ def __create_pool_and_job(self, cluster_conf: models.ClusterConfiguration, softw return 
helpers.get_cluster(cluster_conf.cluster_id, self.batch_client) + @deprecated("0.10.0") def __get_pool_details(self, cluster_id: str): """ Print the information for the given cluster @@ -170,6 +174,7 @@ def __get_pool_details(self, cluster_id: str): nodes = self.batch_client.compute_node.list(pool_id=cluster_id) return pool, nodes + @deprecated("0.10.0") def __list_clusters(self, software_metadata_key): """ List all the cluster on your account. @@ -187,6 +192,7 @@ def __list_clusters(self, software_metadata_key): aztk_pools.append(pool) return aztk_pools + @deprecated("0.10.0") def __create_user(self, pool_id: str, node_id: str, username: str, password: str = None, ssh_key: str = None) -> str: """ Create a pool user @@ -208,6 +214,7 @@ def __create_user(self, pool_id: str, node_id: str, username: str, password: str ssh_key, self.secrets_configuration), expiry_time=datetime.now(timezone.utc) + timedelta(days=365))) + @deprecated("0.10.0") def __delete_user(self, pool_id: str, node_id: str, username: str) -> str: """ Create a pool user @@ -218,6 +225,7 @@ def __delete_user(self, pool_id: str, node_id: str, username: str) -> str: # Delete a user on the given node self.batch_client.compute_node.delete_user(pool_id, node_id, username) + @deprecated("0.10.0") def __get_remote_login_settings(self, pool_id: str, node_id: str): """ Get the remote_login_settings for node @@ -229,6 +237,7 @@ def __get_remote_login_settings(self, pool_id: str, node_id: str): pool_id, node_id) return models.RemoteLogin(ip_address=result.remote_login_ip_address, port=str(result.remote_login_port)) + @deprecated("0.10.0") def __create_user_on_node(self, username, pool_id, node_id, ssh_key=None, password=None): try: self.__create_user(pool_id=pool_id, node_id=node_id, username=username, ssh_key=ssh_key, password=password) @@ -239,6 +248,7 @@ def __create_user_on_node(self, username, pool_id, node_id, ssh_key=None, passwo except batch_error.BatchErrorException as error: raise error + @deprecated("0.10.0") def __generate_user_on_node(self, pool_id, node_id): generated_username = secure_utils.generate_random_string() ssh_key = RSA.generate(2048) @@ -246,6 +256,7 @@ def __generate_user_on_node(self, pool_id, node_id): self.__create_user_on_node(generated_username, pool_id, node_id, ssh_pub_key) return generated_username, ssh_key + @deprecated("0.10.0") def __generate_user_on_pool(self, pool_id, nodes): generated_username = secure_utils.generate_random_string() ssh_key = RSA.generate(2048) @@ -260,6 +271,7 @@ def __generate_user_on_pool(self, pool_id, nodes): return generated_username, ssh_key + @deprecated("0.10.0") def __create_user_on_pool(self, username, pool_id, nodes, ssh_pub_key=None, password=None): with concurrent.futures.ThreadPoolExecutor() as executor: futures = {executor.submit(self.__create_user_on_node, @@ -270,11 +282,13 @@ def __create_user_on_pool(self, username, pool_id, nodes, ssh_pub_key=None, pass password): node for node in nodes} concurrent.futures.wait(futures) + @deprecated("0.10.0") def __delete_user_on_pool(self, username, pool_id, nodes): with concurrent.futures.ThreadPoolExecutor() as executor: futures = [executor.submit(self.__delete_user, pool_id, node.id, username) for node in nodes] concurrent.futures.wait(futures) + @deprecated("0.10.0") def __node_run(self, cluster_id, node_id, command, internal, container_name=None, timeout=None): pool, nodes = self.__get_pool_details(cluster_id) try: @@ -303,6 +317,7 @@ def __node_run(self, cluster_id, node_id, command, internal, container_name=None 
finally: self.__delete_user(cluster_id, node.id, generated_username) + @deprecated("0.10.0") def __cluster_run(self, cluster_id, command, internal, container_name=None, timeout=None): pool, nodes = self.__get_pool_details(cluster_id) nodes = list(nodes) @@ -329,6 +344,7 @@ def __cluster_run(self, cluster_id, command, internal, container_name=None, time finally: self.__delete_user_on_pool(generated_username, pool.id, nodes) + @deprecated("0.10.0") def __cluster_copy(self, cluster_id, source_path, destination_path=None, container_name=None, internal=False, get=False, timeout=None): pool, nodes = self.__get_pool_details(cluster_id) nodes = list(nodes) @@ -357,6 +373,7 @@ def __cluster_copy(self, cluster_id, source_path, destination_path=None, contain finally: self.__delete_user_on_pool(generated_username, pool.id, nodes) + @deprecated("0.10.0") def __ssh_into_node(self, pool_id, node_id, username, ssh_key=None, password=None, port_forward_list=None, internal=False): if internal: result = self.batch_client.compute_node.get(pool_id=pool_id, node_id=node_id) @@ -374,6 +391,7 @@ def __ssh_into_node(self, pool_id, node_id, username, ssh_key=None, password=Non port_forward_list=port_forward_list, ) + @deprecated("0.10.0") def __submit_job(self, job_configuration, start_task, diff --git a/aztk/client/cluster/helpers/copy.py b/aztk/client/cluster/helpers/copy.py index adf87d02..699b1c1a 100644 --- a/aztk/client/cluster/helpers/copy.py +++ b/aztk/client/cluster/helpers/copy.py @@ -38,4 +38,4 @@ def cluster_copy(cluster_operations, cluster_id, source_path, destination_path=N except (OSError, batch_error.BatchErrorException) as exc: raise exc finally: - cluster_operations.delete_user_on_pool(generated_username, pool.id, nodes) + cluster_operations.delete_user_on_pool(pool.id, nodes, generated_username) diff --git a/aztk_cli/spark/endpoints/cluster/cluster_submit.py b/aztk_cli/spark/endpoints/cluster/cluster_submit.py index 388467ba..feeb9f56 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_submit.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_submit.py @@ -162,7 +162,7 @@ def execute(args: typing.NamedTuple): exit_code = utils.stream_logs(client=spark_client, cluster_id=args.cluster_id, application_name=args.name) else: with utils.Spinner(): - spark_client.cluster.wait_until_application_done(cluster_id=args.cluster_id, task_id=args.name) + spark_client.cluster.wait_until_application_done(cluster_id=args.cluster_id, task_id=args.name) # TODO: replace wait_until_application_done application_log = spark_client.cluster.get_application_log(id=args.cluster_id, application_name=args.name) with open(os.path.abspath(os.path.expanduser(args.output)), "w", encoding="UTF-8") as f: f.write(application_log.log) diff --git a/tests/integration_tests/spark/sdk/cluster/test_cluster_deprecated.py b/tests/integration_tests/spark/sdk/cluster/test_cluster_deprecated.py index d1ad53ea..2c92c15d 100644 --- a/tests/integration_tests/spark/sdk/cluster/test_cluster_deprecated.py +++ b/tests/integration_tests/spark/sdk/cluster/test_cluster_deprecated.py @@ -56,10 +56,12 @@ def ensure_spark_processes(cluster_id): def wait_for_all_nodes(cluster_id, nodes): while True: for node in nodes: + if node.state in [batch_models.ComputeNodeState.unusable, batch_models.ComputeNodeState.start_task_failed]: + raise AztkError("Node {} in failed state.".format(node.id)) if node.state not in [batch_models.ComputeNodeState.idle, batch_models.ComputeNodeState.running]: break else: - nodes = spark_client.get_cluster(cluster_id).nodes + nodes = 
spark_client.cluster.get(cluster_id).nodes continue break From 9b18ac5d9e89c0334ee5ff3a4ed71aa457e480ee Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Wed, 11 Jul 2018 17:35:20 -0700 Subject: [PATCH 27/52] rename ids --- aztk/client/cluster/operations.py | 33 +++++++++++-------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/aztk/client/cluster/operations.py b/aztk/client/cluster/operations.py index 02e3a51e..c8c99d88 100644 --- a/aztk/client/cluster/operations.py +++ b/aztk/client/cluster/operations.py @@ -19,33 +19,26 @@ def create(self, cluster_configuration: ClusterConfiguration, software_metadata_ Returns: :obj:`aztk.models.Cluster`: A Cluster object representing the state and configuration of the cluster. """ - return create.create_pool_and_job(self, cluster_configuration, software_metadata_key, start_task, vm_image_model) + return create.create_pool_and_job(self, cluster_configuration, software_metadata_key, start_task, + vm_image_model) - # TODO: change cluster_id to id - def get(self, cluster_id: str): + def get(self, id: str): """Get the state and configuration of a cluster Args: - cluster_id (:obj:`str`): the id of the cluster to get. + id (:obj:`str`): the id of the cluster to get. Returns: :obj:`aztk.models.Cluster`: A Cluster object representing the state and configuration of the cluster. """ - return get.get_pool_details(self, cluster_id) - - # TODO: change cluster_id to id - def copy(self, - cluster_id, - source_path, - destination_path=None, - container_name=None, - internal=False, - get=False, + return get.get_pool_details(self, id) + + def copy(self, id, source_path, destination_path=None, container_name=None, internal=False, get=False, timeout=None): """Copy files to or from every node in a cluster. Args: - cluster_id (:obj:`str`): the id of the cluster to copy files with. + id (:obj:`str`): the id of the cluster to copy files with. source_path (:obj:`str`): the path of the file to copy from. destination_path (:obj:`str`, optional): the local directory path where the output should be written. If None, a SpooledTemporaryFile will be returned in the NodeOutput object, else the file will be @@ -62,22 +55,20 @@ def copy(self, Returns: :obj:`List[aztk.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command. """ - return copy.cluster_copy(self, cluster_id, source_path, destination_path, container_name, internal, get, - timeout) + return copy.cluster_copy(self, id, source_path, destination_path, container_name, internal, get, timeout) - #TODO: change pool_id to id - def delete(self, pool_id: str, keep_logs: bool = False): + def delete(self, id: str, keep_logs: bool = False): """Copy files to or from every node in a cluster. Args: - pool_id (:obj:`str`): the id of the cluster to delete + id (:obj:`str`): the id of the cluster to delete keep_logs (:obj:`bool`): If True, the logs related to this cluster in Azure Storage are not deleted. Defaults to False. Returns: :obj:`List[aztk.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command. """ - return delete.delete_pool_and_job(self, pool_id, keep_logs) + return delete.delete_pool_and_job(self, id, keep_logs) def list(self, software_metadata_key): """List clusters running the specified software. 
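Before the next patch continues the rename sweep, here are illustrative calls against the renamed core cluster operations; `core_ops` stands in for a CoreClusterOperations instance and the ids and paths are placeholders.

```python
def recycle_cluster(core_ops, cluster_id="my-cluster"):
    # Sketch only: fetch state, pull a result file, then tear down.
    cluster = core_ops.get(id=cluster_id)
    # Per the copy signature above, get=True turns the copy into a download.
    outputs = core_ops.copy(id=cluster_id, source_path="/mnt/results.txt", get=True)
    core_ops.delete(id=cluster_id, keep_logs=True)
    return cluster, outputs
```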
From be7c408da05746aec0f7d009626f228f8920ddf6 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Wed, 11 Jul 2018 17:36:29 -0700 Subject: [PATCH 28/52] cluster_id->id --- aztk/client/base/base_operations.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/aztk/client/base/base_operations.py b/aztk/client/base/base_operations.py index 39f73c97..d37d8f16 100644 --- a/aztk/client/base/base_operations.py +++ b/aztk/client/base/base_operations.py @@ -24,11 +24,11 @@ def __init__(self, context): self.blob_client = context['blob_client'] self.secrets_configuration = context['secrets_configuration'] - def get_cluster_config(self, cluster_id: str) -> models.ClusterConfiguration: + def get_cluster_config(self, id: str) -> models.ClusterConfiguration: """Open an ssh tunnel to a node Args: - cluster_id (:obj:`str`): the id of the cluster the node is in + id (:obj:`str`): the id of the cluster the node is in node_id (:obj:`str`): the id of the node to open the ssh tunnel to username (:obj:`str`): the username to authenticate the ssh session ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key @@ -41,18 +41,18 @@ def get_cluster_config(self, cluster_id: str) -> models.ClusterConfiguration: Returns: ClusterConfiguration: Object representing the cluster's configuration """ - return self.get_cluster_data(cluster_id).read_cluster_config() + return self.get_cluster_data(id).read_cluster_config() - def get_cluster_data(self, cluster_id: str) -> cluster_data.ClusterData: + def get_cluster_data(self, id: str) -> cluster_data.ClusterData: """Gets the ClusterData object to manage data related to the given cluster Args: - cluster_id (:obj:`str`): the id of the cluster to get + id (:obj:`str`): the id of the cluster to get Returns: ClusterData: Object used to manage the data and storage functions for a cluster """ - return cluster_data.ClusterData(self.blob_client, cluster_id) + return cluster_data.ClusterData(self.blob_client, id) def ssh_into_node(self, id, From 935bf7118a5ca3b060138f99ffd2237988fbb86e Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Wed, 11 Jul 2018 17:38:14 -0700 Subject: [PATCH 29/52] cluster_id->id --- aztk/spark/client/base/operations.py | 6 +++--- aztk/spark/client/cluster/operations.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/aztk/spark/client/base/operations.py b/aztk/spark/client/base/operations.py index 548f8da9..db48139e 100644 --- a/aztk/spark/client/base/operations.py +++ b/aztk/spark/client/base/operations.py @@ -11,7 +11,7 @@ class SparkBaseOperations(CoreBaseOperations): def generate_cluster_start_task(self, zip_resource_file: batch_models.ResourceFile, - cluster_id: str, + id: str, gpu_enabled: bool, docker_repo: str = None, file_shares: List[models.FileShare] = None, @@ -23,7 +23,7 @@ def generate_cluster_start_task(self, Args: zip_resource_file (:obj:`azure.batch.models.ResourceFile`): a single zip file of all necessary data to upload to the cluster. - cluster_id (:obj:`str`): the id of the cluster. + id (:obj:`str`): the id of the cluster. gpu_enabled (:obj:`bool`): if True, the cluster is GPU enabled. docker_repo (:obj:`str`, optional): the docker repository and tag that identifies the docker image to use. If None, the default Docker image will be used. Defaults to None. @@ -39,7 +39,7 @@ def generate_cluster_start_task(self, Returns: azure.batch.models.StartTask: the StartTask definition to provision the cluster. 
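A small sketch of the renamed configuration helpers from this patch; `base_ops` stands in for a BaseOperations instance, and the printed attributes are assumed from aztk.models.ClusterConfiguration.

```python
def inspect_cluster_config(base_ops, cluster_id="my-cluster"):
    # Read back the stored cluster configuration and its blob-backed data.
    config = base_ops.get_cluster_config(id=cluster_id)
    print(config.cluster_id, config.vm_size)
    return base_ops.get_cluster_data(id=cluster_id)
```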
""" - return generate_cluster_start_task.generate_cluster_start_task(self, zip_resource_file, cluster_id, gpu_enabled, + return generate_cluster_start_task.generate_cluster_start_task(self, zip_resource_file, id, gpu_enabled, docker_repo, file_shares, plugins, mixed_mode, worker_on_master) diff --git a/aztk/spark/client/cluster/operations.py b/aztk/spark/client/cluster/operations.py index 1fde2ce7..6a768800 100644 --- a/aztk/spark/client/cluster/operations.py +++ b/aztk/spark/client/cluster/operations.py @@ -96,7 +96,7 @@ def run(self, id: str, command: str, host=False, internal: bool = False, timeout """Run a bash command on every node in the cluster Args: - cluster_id (:obj:`str`): the id of the cluster to run the command on. + id (:obj:`str`): the id of the cluster to run the command on. command (:obj:`str`): the bash command to execute on the node. internal (:obj:`bool`): if true, this will connect to the node using its internal IP. Only use this if running within the same VNET as the cluster. Defaults to False. @@ -114,7 +114,7 @@ def node_run(self, id: str, node_id: str, command: str, host=False, internal: bo """Run a bash command on the given node Args: - cluster_id (:obj:`str`): the id of the cluster to run the command on. + id (:obj:`str`): the id of the cluster to run the command on. node_id (:obj:`str`): the id of the node in the cluster to run the command on. command (:obj:`str`): the bash command to execute on the node. internal (:obj:`bool`): if True, this will connect to the node using its internal IP. @@ -139,7 +139,7 @@ def copy(self, """Copy a file to every node in a cluster. Args: - cluster_id (:obj:`str`): the id of the cluster to copy files with. + id (:obj:`str`): the id of the cluster to copy files with. source_path (:obj:`str`): the local path of the file to copy. destination_path (:obj:`str`, optional): the path on each node the file is copied to. container_name (:obj:`str`, optional): the name of the container to copy to or from. @@ -164,7 +164,7 @@ def download(self, """Download a file from every node in a cluster. Args: - cluster_id (:obj:`str`): the id of the cluster to copy files with. + id (:obj:`str`): the id of the cluster to copy files with. source_path (:obj:`str`): the path of the file to copy from. destination_path (:obj:`str`, optional): the local directory path where the output should be written. 
If None, a SpooledTemporaryFile will be returned in the NodeOutput object, else the file will be From 15111aa26844f6c00ede646a4067a4db2314d0ee Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Wed, 11 Jul 2018 17:53:12 -0700 Subject: [PATCH 30/52] add todo --- aztk/spark/client/base/operations.py | 1 + 1 file changed, 1 insertion(+) diff --git a/aztk/spark/client/base/operations.py b/aztk/spark/client/base/operations.py index db48139e..b72971a4 100644 --- a/aztk/spark/client/base/operations.py +++ b/aztk/spark/client/base/operations.py @@ -9,6 +9,7 @@ class SparkBaseOperations(CoreBaseOperations): + #TODO: make this private or otherwise not public def generate_cluster_start_task(self, zip_resource_file: batch_models.ResourceFile, id: str, From e585751e57851701667e23b46542ae2e3fa9e632 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Thu, 12 Jul 2018 11:16:59 -0700 Subject: [PATCH 31/52] fixes --- aztk/client/base/helpers/run.py | 10 +++++----- aztk_cli/spark/endpoints/cluster/cluster_create.py | 4 ++-- aztk_cli/spark/endpoints/cluster/cluster_get.py | 2 +- aztk_cli/utils.py | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/aztk/client/base/helpers/run.py b/aztk/client/base/helpers/run.py index 3c1aa4cc..d8881e53 100644 --- a/aztk/client/base/helpers/run.py +++ b/aztk/client/base/helpers/run.py @@ -8,15 +8,15 @@ from aztk.utils import helpers -def cluster_run(base_client, cluster_id, command, internal, container_name=None, timeout=None): - cluster = base_client.get(cluster_id) +def cluster_run(base_operations, cluster_id, command, internal, container_name=None, timeout=None): + cluster = base_operations.get(cluster_id) pool, nodes = cluster.pool, list(cluster.nodes) if internal: cluster_nodes = [(node, models.RemoteLogin(ip_address=node.ip_address, port="22")) for node in nodes] else: - cluster_nodes = [(node, base_client.get_remote_login_settings(pool.id, node.id)) for node in nodes] + cluster_nodes = [(node, base_operations.get_remote_login_settings(pool.id, node.id)) for node in nodes] try: - generated_username, ssh_key = base_client.generate_user_on_pool(pool.id, nodes) + generated_username, ssh_key = base_operations.generate_user_on_pool(pool.id, nodes) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) @@ -33,4 +33,4 @@ def cluster_run(base_client, cluster_id, command, internal, container_name=None, except OSError as exc: raise exc finally: - base_client.delete_user_on_pool(pool.id, nodes, generated_username) + base_operations.delete_user_on_pool(pool.id, nodes, generated_username) diff --git a/aztk_cli/spark/endpoints/cluster/cluster_create.py b/aztk_cli/spark/endpoints/cluster/cluster_create.py index eb272bcf..410330e8 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_create.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_create.py @@ -66,10 +66,10 @@ def execute(args: typing.NamedTuple): user_configuration = cluster_conf.user_configuration if user_configuration and user_configuration.username: - ssh_key, password = utils.get_ssh_key_or_prompt(spark_client.secrets_config.ssh_pub_key, + ssh_key, password = utils.get_ssh_key_or_prompt(spark_client.secrets_configuration.ssh_pub_key, user_configuration.username, user_configuration.password, - spark_client.secrets_config) + spark_client.secrets_configuration) cluster_conf.user_configuration = aztk.spark.models.UserConfiguration( username=user_configuration.username, password=password, diff --git a/aztk_cli/spark/endpoints/cluster/cluster_get.py 
b/aztk_cli/spark/endpoints/cluster/cluster_get.py index 0f2e3cf2..97bfd184 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_get.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_get.py @@ -26,7 +26,7 @@ def execute(args: typing.NamedTuple): cluster = spark_client.cluster.get(cluster_id) utils.print_cluster(spark_client, cluster, args.internal) - configuration = spark_client.get_cluster_config(cluster_id) + configuration = spark_client.cluster.get_cluster_config(cluster_id) if configuration and args.show_config: log.info("-------------------------------------------") log.info("Cluster configuration:") diff --git a/aztk_cli/utils.py b/aztk_cli/utils.py index 4f27b194..d0252fd1 100644 --- a/aztk_cli/utils.py +++ b/aztk_cli/utils.py @@ -17,8 +17,8 @@ from . import log -def get_ssh_key_or_prompt(ssh_key, username, password, secrets_config): - ssh_key = get_ssh_key.get_user_public_key(ssh_key, secrets_config) +def get_ssh_key_or_prompt(ssh_key, username, password, secrets_configuration): + ssh_key = get_ssh_key.get_user_public_key(ssh_key, secrets_configuration) if username is not None and password is None and ssh_key is None: log.warning("It is recommended to use an SSH key for user creation instead of a password.") @@ -188,7 +188,7 @@ def ssh_in_master( ssh_command = utils.command_builder.CommandBuilder('ssh') # get ssh private key path if specified - ssh_priv_key = client.secrets_config.ssh_priv_key + ssh_priv_key = client.secrets_configuration.ssh_priv_key if ssh_priv_key is not None: ssh_command.add_option("-i", ssh_priv_key) From dd2b6ac99ae3606f58f54066d05ead2c4693df5c Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Thu, 12 Jul 2018 14:13:19 -0700 Subject: [PATCH 32/52] add some todos --- aztk/client/base/base_operations.py | 10 ++++++---- aztk/spark/client/base/operations.py | 1 + aztk/spark/client/job/operations.py | 3 ++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/aztk/client/base/base_operations.py b/aztk/client/base/base_operations.py index d37d8f16..5352f421 100644 --- a/aztk/client/base/base_operations.py +++ b/aztk/client/base/base_operations.py @@ -79,6 +79,7 @@ def ssh_into_node(self, """ ssh_into_node.ssh_into_node(self, id, node_id, username, ssh_key, password, port_forward_list, internal) + #TODO: rename pool to cluster or make private def create_user_on_node(self, id, node_id, username, ssh_key=None, password=None): """Create a user on a node @@ -91,7 +92,7 @@ def create_user_on_node(self, id, node_id, username, ssh_key=None, password=None """ return create_user_on_node.create_user_on_node(self, id, node_id, username, ssh_key, password) - #TODO: rename pool to cluster, get rid of nodes parameter + #TODO: rename pool to cluster or make private, get rid of nodes parameter def create_user_on_pool(self, id, nodes, username, ssh_pub_key=None, password=None): """Create a user on every node in the cluster @@ -107,6 +108,7 @@ def create_user_on_pool(self, id, nodes, username, ssh_pub_key=None, password=No """ return create_user_on_pool.create_user_on_pool(self, id, nodes, username, ssh_pub_key, password) + #TODO: make private? def generate_user_on_node(self, id, node_id): """Create a user with an autogenerated username and ssh_key on the given node. 
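# Annotation, not part of the patch: generate_user_on_node and the pool-wide
# variant below return a (username, ssh_key) tuple where ssh_key is a
# Cryptodome RSA key. Roughly, mirroring generate_user_on_pool.py from this
# series (the import line is an assumption; it presumes the pycryptodomex
# package, which provides the Cryptodome namespace):
#     from Cryptodome.PublicKey import RSA
#     ssh_key = RSA.generate(2048)
#     ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8')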
@@ -119,7 +121,7 @@ def generate_user_on_node(self, id, node_id): """ return generate_user_on_node.generate_user_on_node(self, id, node_id) - #TODO: rename pool to cluster + #TODO: rename pool to cluster or make private def generate_user_on_pool(self, id, nodes): """Create a user with an autogenerated username and ssh_key on the cluster @@ -132,7 +134,7 @@ def generate_user_on_pool(self, id, nodes): """ return generate_user_on_pool.generate_user_on_pool(self, id, nodes) - #TODO: rename pool to cluster + #TODO: rename pool to cluster or make private def delete_user_on_node(self, id: str, node_id: str, username: str) -> str: """Delete a user on a node @@ -146,7 +148,7 @@ def delete_user_on_node(self, id: str, node_id: str, username: str) -> str: """ return delete_user_on_node.delete_user(self, id, node_id, username) - #TODO: rename pool to cluster + #TODO: rename pool to cluster or make private def delete_user_on_pool(self, username, id, nodes): """Delete a user on every node in the cluster diff --git a/aztk/spark/client/base/operations.py b/aztk/spark/client/base/operations.py index b72971a4..f6413973 100644 --- a/aztk/spark/client/base/operations.py +++ b/aztk/spark/client/base/operations.py @@ -44,6 +44,7 @@ def generate_cluster_start_task(self, docker_repo, file_shares, plugins, mixed_mode, worker_on_master) + #TODO: make this private or otherwise not public def generate_application_task(self, container_id, application, remote=False): """Generate the Azure Batch Start Task to provision a Spark cluster. diff --git a/aztk/spark/client/job/operations.py b/aztk/spark/client/job/operations.py index d55150fe..545c694e 100644 --- a/aztk/spark/client/job/operations.py +++ b/aztk/spark/client/job/operations.py @@ -111,5 +111,6 @@ def submit(self, job_configuration: models.JobConfiguration): """ return submit.submit_job(self, job_configuration) - def wait_until_job_finished(self, id): #TODO: rename to something better + #TODO: rename to something better or make this a parameter of submit + def wait_until_job_finished(self, id): wait_until_complete.wait_until_job_finished(self, id) From a737237483dacb0cfbfcb94272c71762ae55babc Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Fri, 13 Jul 2018 11:19:54 -0700 Subject: [PATCH 33/52] rename pool to cluster, add todo for nodes params --- aztk/client/base/base_operations.py | 34 +++++++------------ ...r_on_pool.py => create_user_on_cluster.py} | 2 +- ...r_on_pool.py => delete_user_on_cluster.py} | 2 +- ...on_pool.py => generate_user_on_cluster.py} | 4 +-- 4 files changed, 16 insertions(+), 26 deletions(-) rename aztk/client/base/helpers/{create_user_on_pool.py => create_user_on_cluster.py} (76%) rename aztk/client/base/helpers/{delete_user_on_pool.py => delete_user_on_cluster.py} (73%) rename aztk/client/base/helpers/{generate_user_on_pool.py => generate_user_on_cluster.py} (82%) diff --git a/aztk/client/base/base_operations.py b/aztk/client/base/base_operations.py index 5352f421..4f9f6ffe 100644 --- a/aztk/client/base/base_operations.py +++ b/aztk/client/base/base_operations.py @@ -2,8 +2,8 @@ from aztk.internal import cluster_data from aztk.utils import ssh as ssh_lib -from .helpers import (create_user_on_node, create_user_on_pool, delete_user_on_node, delete_user_on_pool, - generate_user_on_node, generate_user_on_pool, get_application_log, get_remote_login_settings, +from .helpers import (create_user_on_cluster, create_user_on_node, delete_user_on_cluster, delete_user_on_node, + generate_user_on_cluster, generate_user_on_node, get_application_log, 
get_remote_login_settings, node_run, run, ssh_into_node) @@ -54,14 +54,7 @@ def get_cluster_data(self, id: str) -> cluster_data.ClusterData: """ return cluster_data.ClusterData(self.blob_client, id) - def ssh_into_node(self, - id, - node_id, - username, - ssh_key=None, - password=None, - port_forward_list=None, - internal=False): + def ssh_into_node(self, id, node_id, username, ssh_key=None, password=None, port_forward_list=None, internal=False): """Open an ssh tunnel to a node Args: @@ -79,7 +72,6 @@ def ssh_into_node(self, """ ssh_into_node.ssh_into_node(self, id, node_id, username, ssh_key, password, port_forward_list, internal) - #TODO: rename pool to cluster or make private def create_user_on_node(self, id, node_id, username, ssh_key=None, password=None): """Create a user on a node @@ -92,8 +84,8 @@ def create_user_on_node(self, id, node_id, username, ssh_key=None, password=None """ return create_user_on_node.create_user_on_node(self, id, node_id, username, ssh_key, password) - #TODO: rename pool to cluster or make private, get rid of nodes parameter - def create_user_on_pool(self, id, nodes, username, ssh_pub_key=None, password=None): + #TODO: remove nodes as param + def create_user_on_cluster(self, id, nodes, username, ssh_pub_key=None, password=None): """Create a user on every node in the cluster Args: @@ -106,9 +98,8 @@ def create_user_on_pool(self, id, nodes, username, ssh_pub_key=None, password=No Returns: None """ - return create_user_on_pool.create_user_on_pool(self, id, nodes, username, ssh_pub_key, password) + return create_user_on_cluster.create_user_on_cluster(self, id, nodes, username, ssh_pub_key, password) - #TODO: make private? def generate_user_on_node(self, id, node_id): """Create a user with an autogenerated username and ssh_key on the given node. @@ -121,8 +112,8 @@ def generate_user_on_node(self, id, node_id): """ return generate_user_on_node.generate_user_on_node(self, id, node_id) - #TODO: rename pool to cluster or make private - def generate_user_on_pool(self, id, nodes): + #TODO: remove nodes as param + def generate_user_on_cluster(self, id, nodes): """Create a user with an autogenerated username and ssh_key on the cluster Args: @@ -132,9 +123,8 @@ def generate_user_on_pool(self, id, nodes): Returns: tuple: A tuple of the form (username (:obj:`str`), ssh_key) where ssh_key is a Cryptodome.RSA key. 
""" - return generate_user_on_pool.generate_user_on_pool(self, id, nodes) + return generate_user_on_cluster.generate_user_on_cluster(self, id, nodes) - #TODO: rename pool to cluster or make private def delete_user_on_node(self, id: str, node_id: str, username: str) -> str: """Delete a user on a node @@ -148,8 +138,8 @@ def delete_user_on_node(self, id: str, node_id: str, username: str) -> str: """ return delete_user_on_node.delete_user(self, id, node_id, username) - #TODO: rename pool to cluster or make private - def delete_user_on_pool(self, username, id, nodes): + #TODO: remove nodes as param + def delete_user_on_cluster(self, username, id, nodes): """Delete a user on every node in the cluster Args: @@ -160,7 +150,7 @@ def delete_user_on_pool(self, username, id, nodes): Returns: None """ - return delete_user_on_pool.delete_user_on_pool(self, username, id, nodes) + return delete_user_on_cluster.delete_user_on_cluster(self, username, id, nodes) def node_run(self, id, node_id, command, internal, container_name=None, timeout=None): """Run a bash command on the given node diff --git a/aztk/client/base/helpers/create_user_on_pool.py b/aztk/client/base/helpers/create_user_on_cluster.py similarity index 76% rename from aztk/client/base/helpers/create_user_on_pool.py rename to aztk/client/base/helpers/create_user_on_cluster.py index f09a58a5..da22f9d2 100644 --- a/aztk/client/base/helpers/create_user_on_pool.py +++ b/aztk/client/base/helpers/create_user_on_cluster.py @@ -1,7 +1,7 @@ import concurrent.futures -def create_user_on_pool(base_client, id, nodes, username, ssh_pub_key=None, password=None): +def create_user_on_cluster(base_client, id, nodes, username, ssh_pub_key=None, password=None): with concurrent.futures.ThreadPoolExecutor() as executor: futures = { executor.submit(base_client.create_user_on_node, id, node.id, username, ssh_pub_key, password): node diff --git a/aztk/client/base/helpers/delete_user_on_pool.py b/aztk/client/base/helpers/delete_user_on_cluster.py similarity index 73% rename from aztk/client/base/helpers/delete_user_on_pool.py rename to aztk/client/base/helpers/delete_user_on_cluster.py index 051cb6a5..d4a24a66 100644 --- a/aztk/client/base/helpers/delete_user_on_pool.py +++ b/aztk/client/base/helpers/delete_user_on_cluster.py @@ -1,7 +1,7 @@ import concurrent.futures -def delete_user_on_pool(base_client, id, nodes, username): #TODO: remove nodes param +def delete_user_on_cluster(base_client, id, nodes, username): #TODO: remove nodes param with concurrent.futures.ThreadPoolExecutor() as executor: futures = [executor.submit(base_client.delete_user_on_node, id, node.id, username) for node in nodes] concurrent.futures.wait(futures) diff --git a/aztk/client/base/helpers/generate_user_on_pool.py b/aztk/client/base/helpers/generate_user_on_cluster.py similarity index 82% rename from aztk/client/base/helpers/generate_user_on_pool.py rename to aztk/client/base/helpers/generate_user_on_cluster.py index 7452e955..4be77a14 100644 --- a/aztk/client/base/helpers/generate_user_on_pool.py +++ b/aztk/client/base/helpers/generate_user_on_cluster.py @@ -5,13 +5,13 @@ from aztk.utils import secure_utils -def generate_user_on_pool(base_operations, pool_id, nodes): +def generate_user_on_cluster(base_operations, id, nodes): generated_username = secure_utils.generate_random_string() ssh_key = RSA.generate(2048) ssh_pub_key = ssh_key.publickey().exportKey('OpenSSH').decode('utf-8') with concurrent.futures.ThreadPoolExecutor() as executor: futures = { - 
executor.submit(base_operations.create_user_on_node, pool_id, node.id, generated_username, ssh_pub_key): node + executor.submit(base_operations.create_user_on_node, id, node.id, generated_username, ssh_pub_key): node for node in nodes } concurrent.futures.wait(futures) From 7042be918889f921ce713d6e4f32b3ffdee85c6e Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Fri, 13 Jul 2018 11:22:05 -0700 Subject: [PATCH 34/52] add todos for nodes param removal --- aztk/client/base/helpers/create_user_on_cluster.py | 1 + aztk/client/base/helpers/delete_user_on_cluster.py | 4 ++-- aztk/client/base/helpers/generate_user_on_cluster.py | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/aztk/client/base/helpers/create_user_on_cluster.py b/aztk/client/base/helpers/create_user_on_cluster.py index da22f9d2..4b308349 100644 --- a/aztk/client/base/helpers/create_user_on_cluster.py +++ b/aztk/client/base/helpers/create_user_on_cluster.py @@ -1,6 +1,7 @@ import concurrent.futures +#TODO: remove nodes param def create_user_on_cluster(base_client, id, nodes, username, ssh_pub_key=None, password=None): with concurrent.futures.ThreadPoolExecutor() as executor: futures = { diff --git a/aztk/client/base/helpers/delete_user_on_cluster.py b/aztk/client/base/helpers/delete_user_on_cluster.py index d4a24a66..b20935e8 100644 --- a/aztk/client/base/helpers/delete_user_on_cluster.py +++ b/aztk/client/base/helpers/delete_user_on_cluster.py @@ -1,7 +1,7 @@ import concurrent.futures - -def delete_user_on_cluster(base_client, id, nodes, username): #TODO: remove nodes param +#TODO: remove nodes param +def delete_user_on_cluster(base_client, id, nodes, username): with concurrent.futures.ThreadPoolExecutor() as executor: futures = [executor.submit(base_client.delete_user_on_node, id, node.id, username) for node in nodes] concurrent.futures.wait(futures) diff --git a/aztk/client/base/helpers/generate_user_on_cluster.py b/aztk/client/base/helpers/generate_user_on_cluster.py index 4be77a14..aa9a2563 100644 --- a/aztk/client/base/helpers/generate_user_on_cluster.py +++ b/aztk/client/base/helpers/generate_user_on_cluster.py @@ -5,6 +5,7 @@ from aztk.utils import secure_utils +#TODO: remove nodes param def generate_user_on_cluster(base_operations, id, nodes): generated_username = secure_utils.generate_random_string() ssh_key = RSA.generate(2048) From 32c9d2c4050c99b0a3e331564878f9c6a5788279 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Fri, 13 Jul 2018 11:23:27 -0700 Subject: [PATCH 35/52] update function names --- aztk/client/base/helpers/run.py | 4 ++-- aztk/client/cluster/helpers/copy.py | 4 ++-- aztk/spark/client/cluster/helpers/create_user.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/aztk/client/base/helpers/run.py b/aztk/client/base/helpers/run.py index d8881e53..bd279b64 100644 --- a/aztk/client/base/helpers/run.py +++ b/aztk/client/base/helpers/run.py @@ -16,7 +16,7 @@ def cluster_run(base_operations, cluster_id, command, internal, container_name=N else: cluster_nodes = [(node, base_operations.get_remote_login_settings(pool.id, node.id)) for node in nodes] try: - generated_username, ssh_key = base_operations.generate_user_on_pool(pool.id, nodes) + generated_username, ssh_key = base_operations.generate_user_on_cluster(pool.id, nodes) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) @@ -33,4 +33,4 @@ def cluster_run(base_operations, cluster_id, command, internal, container_name=N except OSError as exc: raise exc finally: -
base_operations.delete_user_on_pool(pool.id, nodes, generated_username) + base_operations.delete_user_on_cluster(pool.id, nodes, generated_username) diff --git a/aztk/client/cluster/helpers/copy.py b/aztk/client/cluster/helpers/copy.py index 699b1c1a..bc97d8c1 100644 --- a/aztk/client/cluster/helpers/copy.py +++ b/aztk/client/cluster/helpers/copy.py @@ -17,7 +17,7 @@ def cluster_copy(cluster_operations, cluster_id, source_path, destination_path=N cluster_nodes = [(node, cluster_operations.get_remote_login_settings(pool.id, node.id)) for node in nodes] try: - generated_username, ssh_key = cluster_operations.generate_user_on_pool(pool.id, nodes) + generated_username, ssh_key = cluster_operations.generate_user_on_cluster(pool.id, nodes) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) @@ -38,4 +38,4 @@ def cluster_copy(cluster_operations, cluster_id, source_path, destination_path=N except (OSError, batch_error.BatchErrorException) as exc: raise exc finally: - cluster_operations.delete_user_on_pool(pool.id, nodes, generated_username) + cluster_operations.delete_user_on_cluster(pool.id, nodes, generated_username) diff --git a/aztk/spark/client/cluster/helpers/create_user.py b/aztk/spark/client/cluster/helpers/create_user.py index 801b3375..d18c5080 100644 --- a/aztk/spark/client/cluster/helpers/create_user.py +++ b/aztk/spark/client/cluster/helpers/create_user.py @@ -10,6 +10,6 @@ def create_user(spark_cluster_operations, cluster_id: str, username: str, passwo master_node_id = cluster.master_node_id if not master_node_id: raise error.ClusterNotReadyError("The master has not yet been picked, a user cannot be added.") - spark_cluster_operations.create_user_on_pool(username, cluster.id, cluster.nodes, ssh_key, password) + spark_cluster_operations.create_user_on_cluster(username, cluster.id, cluster.nodes, ssh_key, password) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) From cfdf1326cc000ab5394230d6fa115d0b1430f2ac Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Fri, 13 Jul 2018 14:59:27 -0700 Subject: [PATCH 36/52] remove deprecated function calls --- aztk_cli/spark/endpoints/cluster/cluster_ssh.py | 6 +++--- aztk_cli/utils.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/aztk_cli/spark/endpoints/cluster/cluster_ssh.py b/aztk_cli/spark/endpoints/cluster/cluster_ssh.py index 70b047aa..885cd7b8 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_ssh.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_ssh.py @@ -31,8 +31,8 @@ def setup_parser(parser: argparse.ArgumentParser): def execute(args: typing.NamedTuple): spark_client = aztk.spark.Client(config.load_aztk_secrets()) - cluster = spark_client.get_cluster(args.cluster_id) - cluster_config = spark_client.get_cluster_config(args.cluster_id) + cluster = spark_client.cluster.get(args.cluster_id) + cluster_config = spark_client.cluster.get_cluster_config(args.cluster_id) ssh_conf = SshConfig() ssh_conf.merge( @@ -93,7 +93,7 @@ def native_python_ssh_into_master(spark_client, cluster, ssh_conf, password): log.warning("No ssh client found, using pure python connection.") return - configuration = spark_client.get_cluster_config(cluster.id) + configuration = spark_client.cluster.get_cluster_config(cluster.id) plugin_ports = [] if configuration and configuration.plugins: ports = [ diff --git a/aztk_cli/utils.py b/aztk_cli/utils.py index d0252fd1..f00089b7 100644 --- a/aztk_cli/utils.py +++ b/aztk_cli/utils.py @@ -167,7
@@ def ssh_in_master( # Get master node id from task (job and task are both named pool_id) cluster = client.cluster.get(cluster_id) - configuration = client.get_cluster_config(cluster_id) + configuration = client.cluster.get_cluster_config(cluster_id) master_node_id = cluster.master_node_id From d48f7e50be131e55f8a73f39c730fba42040d598 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Tue, 17 Jul 2018 13:44:03 -0700 Subject: [PATCH 37/52] update docs and docstrings --- aztk/client/base/base_operations.py | 29 +++++++++++++++++------------ aztk/client/cluster/operations.py | 1 - docs/aztk.rst | 6 ++++++ 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/aztk/client/base/base_operations.py b/aztk/client/base/base_operations.py index 4f9f6ffe..3bc12389 100644 --- a/aztk/client/base/base_operations.py +++ b/aztk/client/base/base_operations.py @@ -38,8 +38,9 @@ def get_cluster_config(self, id: str) -> models.ClusterConfiguration: The defined ports will be forwarded to the client. internal (:obj:`bool`, optional): if True, this will connect to the node using its internal IP. Only use this if running within the same VNET as the cluster. Defaults to False. + Returns: - ClusterConfiguration: Object representing the cluster's configuration + :obj:`aztk.models.ClusterConfiguration`: Object representing the cluster's configuration """ return self.get_cluster_data(id).read_cluster_config() @@ -50,7 +51,7 @@ def get_cluster_data(self, id: str) -> cluster_data.ClusterData: id (:obj:`str`): the id of the cluster to get Returns: - ClusterData: Object used to manage the data and storage functions for a cluster + :obj:`aztk.models.ClusterData`: Object used to manage the data and storage functions for a cluster """ return cluster_data.ClusterData(self.blob_client, id) @@ -67,8 +68,9 @@ def ssh_into_node(self, id, node_id, username, ssh_key=None, password=None, port The defined ports will be forwarded to the client. internal (:obj:`bool`, optional): if True, this will connect to the node using its internal IP. Only use this if running within the same VNET as the cluster. Defaults to False. + Returns: - None + :obj:`None` """ ssh_into_node.ssh_into_node(self, id, node_id, username, ssh_key, password, port_forward_list, internal) @@ -81,6 +83,9 @@ def create_user_on_node(self, id, node_id, username, ssh_key=None, password=None username (:obj:`str`): name of the user to create. ssh_key (:obj:`str`, optional): ssh public key to create the user with, must use ssh_key or password. password (:obj:`str`, optional): password for the user, must use ssh_key or password. + + Returns: + :obj:`None` """ return create_user_on_node.create_user_on_node(self, id, node_id, username, ssh_key, password) @@ -96,7 +101,7 @@ def create_user_on_cluster(self, id, nodes, username, ssh_pub_key=None, password password (:obj:`str`, optional): password for the user, must use ssh_key or password. Defaults to None. Returns: - None + :obj:`None` """ return create_user_on_cluster.create_user_on_cluster(self, id, nodes, username, ssh_pub_key, password) @@ -108,7 +113,7 @@ def generate_user_on_node(self, id, node_id): node_id (:obj:`str`): the id of the node in the cluster to generate the user on. Returns: - tuple: A tuple of the form (username (:obj:`str`), ssh_key) where ssh_key is a Cryptodome.RSA key. 
+ :obj:`tuple`: A tuple of the form (username: :obj:`str`, ssh_key: :obj:`Cryptodome.PublicKey.RSA`) """ return generate_user_on_node.generate_user_on_node(self, id, node_id) @@ -121,7 +126,7 @@ def generate_user_on_cluster(self, id, nodes): node_id (:obj:`str`): the id of the node in the cluster to generate the user on. Returns: - tuple: A tuple of the form (username (:obj:`str`), ssh_key) where ssh_key is a Cryptodome.RSA key. + :obj:`tuple`: A tuple of the form (username: :obj:`str`, ssh_key: :obj:`Cryptodome.PublicKey.RSA`) """ return generate_user_on_cluster.generate_user_on_cluster(self, id, nodes) @@ -134,7 +139,7 @@ def delete_user_on_node(self, id: str, node_id: str, username: str) -> str: username (:obj:`str`): the name of the user to delete. Returns: - None + :obj:`None` """ return delete_user_on_node.delete_user(self, id, node_id, username) @@ -148,7 +153,7 @@ def delete_user_on_cluster(self, username, id, nodes): username (:obj:`str`): the name of the user to delete. Returns: - None + :obj:`None` """ return delete_user_on_cluster.delete_user_on_cluster(self, username, id, nodes) @@ -167,7 +172,7 @@ def node_run(self, id, node_id, command, internal, container_name=None, timeout= Defaults to None. Returns: - NodeOutput: object containing the output of the run command + :obj:`aztk.models.NodeOutput`: object containing the output of the run command """ return node_run.node_run(self, id, node_id, command, internal, container_name, timeout) @@ -179,7 +184,7 @@ def get_remote_login_settings(self, id: str, node_id: str): node_id (:obj:`str`): the id of the node in the cluster Returns: - RemoteLogin: + :obj:`aztk.models.RemoteLogin`: Object that contains the ip address and port combination to login to a node """ return get_remote_login_settings.get_remote_login_settings(self, id, node_id) @@ -197,7 +202,7 @@ def run(self, id, command, internal, container_name=None, timeout=None): Defaults to None. Returns: - List[NodeOutput]: list of NodeOutput objects containing the output of the run command + :obj:`List[aztk.models.NodeOutput]`: list of NodeOutput objects containing the output of the run command """ return run.cluster_run(self, id, command, internal, container_name, timeout) @@ -213,6 +218,6 @@ def get_application_log(self, id: str, application_name: str, tail=False, curren Only useful if streaming the log as it is being written. Only used if tail is True. Returns: - aztk.models.ApplicationLog: a model representing the output of the application. + :obj:`aztk.models.ApplicationLog`: a model representing the output of the application. """ return get_application_log.get_application_log(self, id, application_name, tail, current_bytes) diff --git a/aztk/client/cluster/operations.py b/aztk/client/cluster/operations.py index c8c99d88..7d25e980 100644 --- a/aztk/client/cluster/operations.py +++ b/aztk/client/cluster/operations.py @@ -1,5 +1,4 @@ from aztk.client.base import BaseOperations -from aztk.client.base.base_operations import BaseOperations from aztk.models import ClusterConfiguration from .helpers import copy, create, delete, get, list diff --git a/docs/aztk.rst b/docs/aztk.rst index 1ccbe595..408e12a8 100644 --- a/docs/aztk.rst +++ b/docs/aztk.rst @@ -15,6 +15,12 @@ aztk.client module :show-inheritance: +.. autoclass:: aztk.client.base.BaseOperations + :members: + :undoc-members: + :show-inheritance: + + ..
autoclass:: aztk.client.cluster.CoreClusterOperations :members: :undoc-members: From c2de9b3a68a9d8906baebcd146f06509cc72542d Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Tue, 17 Jul 2018 13:50:46 -0700 Subject: [PATCH 38/52] update docstrings --- aztk/client/job/operations.py | 2 +- aztk/spark/client/base/operations.py | 6 +++--- aztk/spark/client/cluster/helpers/create.py | 2 +- aztk/spark/client/job/operations.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/aztk/client/job/operations.py b/aztk/client/job/operations.py index b18442c1..e0fb1185 100644 --- a/aztk/client/job/operations.py +++ b/aztk/client/job/operations.py @@ -24,7 +24,7 @@ def submit(self, job_configuration, start_task, job_manager_task, autoscale_form part of the job Returns: - azure.batch.models.CloudJobSchedule: Model representing the Azure Batch JobSchedule state. + :obj:`azure.batch.models.CloudJobSchedule`: Model representing the Azure Batch JobSchedule state. """ return submit.submit_job(self, job_configuration, start_task, job_manager_task, autoscale_formula, software_metadata_key, vm_image_model, application_metadata) diff --git a/aztk/spark/client/base/operations.py b/aztk/spark/client/base/operations.py index f6413973..de10fa2f 100644 --- a/aztk/spark/client/base/operations.py +++ b/aztk/spark/client/base/operations.py @@ -38,7 +38,7 @@ def generate_cluster_start_task(self, on the VM that runs the Spark master. Defaults to True. Returns: - azure.batch.models.StartTask: the StartTask definition to provision the cluster. + :obj:`azure.batch.models.StartTask`: the StartTask definition to provision the cluster. """ return generate_cluster_start_task.generate_cluster_start_task(self, zip_resource_file, id, gpu_enabled, docker_repo, file_shares, plugins, mixed_mode, @@ -56,7 +56,7 @@ def generate_application_task(self, container_id, application, remote=False): and not the client. Defaults to False. Returns: - azure.batch.models.TaskAddParameter: the Task definition for the Application. + :obj:`azure.batch.models.TaskAddParameter`: the Task definition for the Application. """ return generate_application_task.generate_application_task(self, container_id, application, remote) @@ -72,6 +72,6 @@ def get_application_log(self, id: str, application_name: str, tail=False, curren Only useful if streaming the log as it is being written. Only used if tail is True. Returns: - aztk.spark.models.ApplicationLog: a model representing the output of the application. + :obj:`aztk.spark.models.ApplicationLog`: a model representing the output of the application.
""" return get_application_log.get_application_log(SparkBaseOperations, self, id, application_name, tail, current_bytes) diff --git a/aztk/spark/client/cluster/helpers/create.py b/aztk/spark/client/cluster/helpers/create.py index e941f97d..00a38883 100644 --- a/aztk/spark/client/cluster/helpers/create.py +++ b/aztk/spark/client/cluster/helpers/create.py @@ -35,7 +35,7 @@ def create_cluster(spark_cluster_operations, cluster_conf: models.ClusterConfigu wait(bool): If you should wait for the cluster to be ready before returning Returns: - aztk.spark.models.Cluster + :obj:`aztk.spark.models.Cluster` """ cluster_conf = _apply_default_for_cluster_config(cluster_conf) cluster_conf.validate() diff --git a/aztk/spark/client/job/operations.py b/aztk/spark/client/job/operations.py index 545c694e..a8869ac7 100644 --- a/aztk/spark/client/job/operations.py +++ b/aztk/spark/client/job/operations.py @@ -80,7 +80,7 @@ def stop(self, id): id (:obj:`str`): the id of the job to stop Returns: - None + :obj:`None` """ return stop.stop(self, id) From d6760c18919a8040cb50df1d57e6c99fbea1fbec Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Tue, 17 Jul 2018 14:03:35 -0700 Subject: [PATCH 39/52] get rid of TODOs, fix docstrings --- aztk/spark/client/cluster/helpers/list.py | 3 +-- aztk/spark/client/cluster/operations.py | 4 ++-- aztk/spark/client/job/helpers/list.py | 1 - 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/aztk/spark/client/cluster/helpers/list.py b/aztk/spark/client/cluster/helpers/list.py index 9e6d3231..7fa72864 100644 --- a/aztk/spark/client/cluster/helpers/list.py +++ b/aztk/spark/client/cluster/helpers/list.py @@ -1,6 +1,5 @@ import azure.batch.models.batch_error as batch_error -import aztk.models # TODO: get rid of this import and use aztk.spark.models from aztk import error from aztk.spark import models from aztk.utils import helpers @@ -8,7 +7,7 @@ def list_clusters(spark_cluster_operations): try: - software_metadata_key = aztk.models.Software.spark + software_metadata_key = "spark" return [models.Cluster(pool) for pool in super(type(spark_cluster_operations), spark_cluster_operations).list(software_metadata_key)] except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/operations.py b/aztk/spark/client/cluster/operations.py index 6a768800..f7bf502a 100644 --- a/aztk/spark/client/cluster/operations.py +++ b/aztk/spark/client/cluster/operations.py @@ -62,7 +62,7 @@ def submit(self, id: str, application: models.ApplicationConfiguration, remote: wait (:obj:`bool`, optional): If True, this function blocks until the application has completed. Defaults to False. Returns: - None + :obj:`None` """ return submit.submit(self, id, application, remote, wait) @@ -76,7 +76,7 @@ def create_user(self, id: str, username: str, password: str = None, ssh_key: str password (:obj:`str`, optional): password for the user, must use ssh_key or password. Defaults to None. 
Returns: - None + :obj:`None` """ return create_user.create_user(self, id, username, ssh_key, password) diff --git a/aztk/spark/client/job/helpers/list.py b/aztk/spark/client/job/helpers/list.py index 068e4057..8b511c4f 100644 --- a/aztk/spark/client/job/helpers/list.py +++ b/aztk/spark/client/job/helpers/list.py @@ -1,6 +1,5 @@ import azure.batch.models.batch_error as batch_error -import aztk.models # TODO: get rid of this import and use aztk.spark.models from aztk import error from aztk.spark import models from aztk.utils import helpers From a418a37d0bb56fd209cd91c052fe428ce0ee86b7 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Wed, 18 Jul 2018 13:16:57 -0700 Subject: [PATCH 40/52] remove unused setting --- .vscode/settings.json | 1 - 1 file changed, 1 deletion(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 941779c2..7641f485 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -15,5 +15,4 @@ "python.venvPath": "${workspaceFolder}/.venv/", "python.pythonPath": "${workspaceFolder}/.venv/Scripts/python.exe", "python.unitTest.pyTestEnabled": true, - // "editor.formatOnSave": true, } From 4749017a40bc979b07256dd431b867077d06a97e Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Fri, 20 Jul 2018 17:01:19 -0700 Subject: [PATCH 41/52] inheritance -> composition --- aztk/client/base/base_operations.py | 2 +- aztk/client/client.py | 16 ----- aztk/client/cluster/helpers/create.py | 12 ++-- aztk/client/cluster/helpers/delete.py | 12 ++-- aztk/client/cluster/helpers/get.py | 12 ++-- aztk/client/cluster/helpers/list.py | 7 +- .../base/helpers/generate_application_task.py | 12 ++-- .../helpers/generate_cluster_start_task.py | 16 ++--- .../base/helpers/get_application_log.py | 9 --- aztk/spark/client/base/operations.py | 51 ++++++-------- aztk/spark/client/client.py | 2 +- aztk/spark/client/cluster/helpers/copy.py | 4 +- aztk/spark/client/cluster/helpers/create.py | 14 ++-- .../client/cluster/helpers/create_user.py | 4 +- aztk/spark/client/cluster/helpers/delete.py | 4 +- aztk/spark/client/cluster/helpers/download.py | 4 +- aztk/spark/client/cluster/helpers/get.py | 6 +- .../cluster/helpers/get_application_log.py | 7 ++ .../cluster/helpers/get_application_status.py | 4 +- .../helpers/get_remote_login_settings.py | 12 ++++ aztk/spark/client/cluster/helpers/list.py | 4 +- aztk/spark/client/cluster/helpers/node_run.py | 4 +- aztk/spark/client/cluster/helpers/run.py | 4 +- aztk/spark/client/cluster/helpers/submit.py | 23 +++---- aztk/spark/client/cluster/operations.py | 66 +++++++++++++++---- aztk/spark/client/job/helpers/delete.py | 14 ++-- aztk/spark/client/job/helpers/get.py | 16 ++--- .../client/job/helpers/get_application.py | 4 +- .../client/job/helpers/get_application_log.py | 13 ++-- .../client/job/helpers/get_recent_job.py | 6 +- aztk/spark/client/job/helpers/list.py | 8 +-- .../client/job/helpers/list_applications.py | 10 +-- aztk/spark/client/job/helpers/stop.py | 8 +-- .../client/job/helpers/stop_application.py | 6 +- aztk/spark/client/job/helpers/submit.py | 17 ++--- .../client/job/helpers/wait_until_complete.py | 10 +-- aztk/spark/client/job/operations.py | 31 ++++++--- aztk/spark/models/models.py | 20 ++++-- aztk/spark/utils/constants.py | 3 + aztk/spark/utils/util.py | 6 +- .../spark/sdk/job/test_job.py | 24 +++---- .../spark/sdk/job/test_job_deprecated.py | 2 +- 42 files changed, 279 insertions(+), 230 deletions(-) delete mode 100644 aztk/spark/client/base/helpers/get_application_log.py create mode 100644 
aztk/spark/client/cluster/helpers/get_application_log.py create mode 100644 aztk/spark/client/cluster/helpers/get_remote_login_settings.py create mode 100644 aztk/spark/utils/constants.py diff --git a/aztk/client/base/base_operations.py b/aztk/client/base/base_operations.py index 3bc12389..fb02924f 100644 --- a/aztk/client/base/base_operations.py +++ b/aztk/client/base/base_operations.py @@ -8,7 +8,7 @@ class BaseOperations: - """Base operations that all other operations inherit from + """Base operations that all other operations have as an attribute Attributes: batch_client (:obj:`azure.batch.batch_service_client.BatchServiceClient`): Client used to interact with the diff --git a/aztk/client/client.py b/aztk/client/client.py index 2f133a43..94257a7b 100644 --- a/aztk/client/client.py +++ b/aztk/client/client.py @@ -25,23 +25,7 @@ class CoreClient: **This client should not be used directly. Only software specific clients should be used.** - Attributes: - cluster (:obj:`aztk.client.cluster.CoreClusterOperations`): Cluster - job (:obj:`aztk.client.job.CoreJobOperations`): Job """ - - # TODO: remove ability to specify secrets_config in 0.10.0 - def __init__(self, secrets_configuration: models.SecretsConfiguration = None, **kwargs): - self.secrets_configuration = None - context = None - if kwargs.get("secrets_config"): - # TODO: add deprecated warning - context = self._get_context(kwargs.get("secrets_config")) - else: - context = self._get_context(secrets_configuration) - self.cluster = CoreClusterOperations(context) - self.job = CoreJobOperations(context) - def _get_context(self, secrets_configuration: models.SecretsConfiguration): self.secrets_configuration = secrets_configuration diff --git a/aztk/client/cluster/helpers/create.py b/aztk/client/cluster/helpers/create.py index dfa2dfa8..eb0a6d3c 100644 --- a/aztk/client/cluster/helpers/create.py +++ b/aztk/client/cluster/helpers/create.py @@ -5,7 +5,7 @@ from aztk.utils import helpers, constants -def create_pool_and_job(cluster_client, cluster_conf: models.ClusterConfiguration, software_metadata_key: str, start_task, VmImageModel): +def create_pool_and_job(core_cluster_operations, cluster_conf: models.ClusterConfiguration, software_metadata_key: str, start_task, VmImageModel): """ Create a pool and job :param cluster_conf: the configuration object used to create the cluster @@ -15,7 +15,7 @@ def create_pool_and_job(cluster_client, cluster_conf: models.ClusterConfiguratio :param VmImageModel: the type of image to provision for the cluster :param wait: wait until the cluster is ready """ - cluster_client.get_cluster_data(cluster_conf.cluster_id).save_cluster_config(cluster_conf) + core_cluster_operations.get_cluster_data(cluster_conf.cluster_id).save_cluster_config(cluster_conf) # reuse pool_id as job_id pool_id = cluster_conf.cluster_id job_id = cluster_conf.cluster_id @@ -23,7 +23,7 @@ def create_pool_and_job(cluster_client, cluster_conf: models.ClusterConfiguratio # Get a verified node agent sku sku_to_use, image_ref_to_use = \ helpers.select_latest_verified_vm_image_with_node_agent_sku( - VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku, cluster_client.batch_client) + VmImageModel.publisher, VmImageModel.offer, VmImageModel.sku, core_cluster_operations.batch_client) network_conf = None if cluster_conf.subnet_id is not None: @@ -54,7 +54,7 @@ def create_pool_and_job(cluster_client, cluster_conf: models.ClusterConfiguratio ]) # Create the pool + create user for the pool - helpers.create_pool_if_not_exist(pool, 
cluster_client.batch_client) + helpers.create_pool_if_not_exist(pool, core_cluster_operations.batch_client) # Create job job = batch_models.JobAddParameter( @@ -62,6 +62,6 @@ def create_pool_and_job(cluster_client, cluster_conf: models.ClusterConfiguratio pool_info=batch_models.PoolInformation(pool_id=pool_id)) # Add job to batch - cluster_client.batch_client.job.add(job) + core_cluster_operations.batch_client.job.add(job) - return helpers.get_cluster(cluster_conf.cluster_id, cluster_client.batch_client) + return helpers.get_cluster(cluster_conf.cluster_id, core_cluster_operations.batch_client) diff --git a/aztk/client/cluster/helpers/delete.py b/aztk/client/cluster/helpers/delete.py index ead592de..7f242def 100644 --- a/aztk/client/cluster/helpers/delete.py +++ b/aztk/client/cluster/helpers/delete.py @@ -1,7 +1,7 @@ import azure.batch.models as batch_models -def delete_pool_and_job(cluster_client, pool_id: str, keep_logs: bool = False): +def delete_pool_and_job(core_cluster_operations, pool_id: str, keep_logs: bool = False): """ Delete a pool and its associated job :param pool_id: the id of the pool and its associated job to delete @@ -12,20 +12,20 @@ def delete_pool_and_job(cluster_client, pool_id: str, keep_logs: bool = False): job_exists = True try: - cluster_client.batch_client.job.get(job_id) + core_cluster_operations.batch_client.job.get(job_id) except batch_models.batch_error.BatchErrorException: job_exists = False - pool_exists = cluster_client.batch_client.pool.exists(pool_id) + pool_exists = core_cluster_operations.batch_client.pool.exists(pool_id) if job_exists: - cluster_client.batch_client.job.delete(job_id) + core_cluster_operations.batch_client.job.delete(job_id) if pool_exists: - cluster_client.batch_client.pool.delete(pool_id) + core_cluster_operations.batch_client.pool.delete(pool_id) if not keep_logs: - cluster_data = cluster_client.get_cluster_data(pool_id) + cluster_data = core_cluster_operations.get_cluster_data(pool_id) cluster_data.delete_container(pool_id) return job_exists or pool_exists diff --git a/aztk/client/cluster/helpers/get.py b/aztk/client/cluster/helpers/get.py index 73b9a3d7..41c25232 100644 --- a/aztk/client/cluster/helpers/get.py +++ b/aztk/client/cluster/helpers/get.py @@ -1,11 +1,15 @@ -def get_pool_details(cluster_client, cluster_id: str): +#TODO: return Cluster instead of (pool, nodes) +from aztk import models + + +def get_pool_details(core_cluster_operations, cluster_id: str): """ Get the information for the given cluster :param cluster_id: Id of the cluster :return pool: CloudPool, nodes: ComputeNodePaged """ - pool = cluster_client.batch_client.pool.get(cluster_id) - nodes = cluster_client.batch_client.compute_node.list(pool_id=cluster_id) - return pool, nodes + pool = core_cluster_operations.batch_client.pool.get(cluster_id) + nodes = core_cluster_operations.batch_client.compute_node.list(pool_id=cluster_id) + return models.Cluster(pool, nodes) diff --git a/aztk/client/cluster/helpers/list.py b/aztk/client/cluster/helpers/list.py index af6612ea..e1f825a5 100644 --- a/aztk/client/cluster/helpers/list.py +++ b/aztk/client/cluster/helpers/list.py @@ -1,3 +1,4 @@ +from aztk import models from aztk.utils import constants @@ -11,9 +12,9 @@ def list_clusters(cluster_client, software_metadata_key): cluster_metadata = ( constants.AZTK_MODE_METADATA_KEY, constants.AZTK_CLUSTER_MODE_METADATA) - aztk_pools = [] + aztk_clusters = [] for pool in [pool for pool in pools if pool.metadata]: pool_metadata = [(metadata.name, metadata.value) for metadata in pool.metadata] if
all([metadata in pool_metadata for metadata in [software_metadata, cluster_metadata]]): - aztk_pools.append(pool) - return aztk_pools + aztk_clusters.append(models.Cluster(pool)) + return aztk_clusters diff --git a/aztk/spark/client/base/helpers/generate_application_task.py b/aztk/spark/client/base/helpers/generate_application_task.py index 0dfc3a4d..183adfd4 100644 --- a/aztk/spark/client/base/helpers/generate_application_task.py +++ b/aztk/spark/client/base/helpers/generate_application_task.py @@ -7,7 +7,7 @@ from aztk.utils.command_builder import CommandBuilder -def generate_application_task(spark_client, container_id, application, remote=False): +def generate_application_task(core_base_operations, container_id, application, remote=False): resource_files = [] # The application provided is not hosted remotely and therefore must be uploaded @@ -16,7 +16,7 @@ def generate_application_task(spark_client, container_id, application, remote=Fa container_name=container_id, application_name=application.name, file_path=application.application, - blob_client=spark_client.blob_client, + blob_client=core_base_operations.blob_client, use_full_path=False) # Upload application file @@ -31,7 +31,7 @@ def generate_application_task(spark_client, container_id, application, remote=Fa container_name=container_id, application_name=application.name, file_path=jar, - blob_client=spark_client.blob_client, + blob_client=core_base_operations.blob_client, use_full_path=False) jar_resource_file_paths.append(current_jar_resource_file_path) resource_files.append(current_jar_resource_file_path) @@ -43,7 +43,7 @@ def generate_application_task(spark_client, container_id, application, remote=Fa container_name=container_id, application_name=application.name, file_path=py_file, - blob_client=spark_client.blob_client, + blob_client=core_base_operations.blob_client, use_full_path=False) py_files_resource_file_paths.append(current_py_files_resource_file_path) resource_files.append(current_py_files_resource_file_path) @@ -55,7 +55,7 @@ def generate_application_task(spark_client, container_id, application, remote=Fa container_name=container_id, application_name=application.name, file_path=file, - blob_client=spark_client.blob_client, + blob_client=core_base_operations.blob_client, use_full_path=False) files_resource_file_paths.append(files_resource_file_path) resource_files.append(files_resource_file_path) @@ -69,7 +69,7 @@ def generate_application_task(spark_client, container_id, application, remote=Fa application_name=application.name, file_path='application.yaml', content=yaml.dump(vars(application)), - blob_client=spark_client.blob_client) + blob_client=core_base_operations.blob_client) resource_files.append(application_definition_file) # create command to submit task diff --git a/aztk/spark/client/base/helpers/generate_cluster_start_task.py b/aztk/spark/client/base/helpers/generate_cluster_start_task.py index 6c9e1a8f..0b100bf1 100644 --- a/aztk/spark/client/base/helpers/generate_cluster_start_task.py +++ b/aztk/spark/client/base/helpers/generate_cluster_start_task.py @@ -23,9 +23,9 @@ def _get_aztk_environment(cluster_id, worker_on_master, mixed_mode): return envs -def __get_docker_credentials(spark_client): +def __get_docker_credentials(core_base_operations): creds = [] - docker = spark_client.secrets_configuration.docker + docker = core_base_operations.secrets_configuration.docker if docker: if docker.endpoint: creds.append(batch_models.EnvironmentSetting(name="DOCKER_ENDPOINT", value=docker.endpoint)) @@ -37,9 +37,9 @@ 
def __get_docker_credentials(spark_client): return creds -def __get_secrets_env(spark_client): - shared_key = spark_client.secrets_configuration.shared_key - service_principal = spark_client.secrets_configuration.service_principal +def __get_secrets_env(core_base_operations): + shared_key = core_base_operations.secrets_configuration.shared_key + service_principal = core_base_operations.secrets_configuration.service_principal if shared_key: return [ batch_models.EnvironmentSetting(name="BATCH_SERVICE_URL", value=shared_key.batch_service_url), @@ -103,7 +103,7 @@ def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile, return commands -def generate_cluster_start_task(spark_client, +def generate_cluster_start_task(core_base_operations, zip_resource_file: batch_models.ResourceFile, cluster_id: str, gpu_enabled: bool, @@ -127,14 +127,14 @@ def generate_cluster_start_task(spark_client, spark_submit_logs_file = constants.SPARK_SUBMIT_LOGS_FILE # TODO use certificate - environment_settings = __get_secrets_env(spark_client) + [ + environment_settings = __get_secrets_env(core_base_operations) + [ batch_models.EnvironmentSetting(name="SPARK_WEB_UI_PORT", value=spark_web_ui_port), batch_models.EnvironmentSetting(name="SPARK_WORKER_UI_PORT", value=spark_worker_ui_port), batch_models.EnvironmentSetting(name="SPARK_JOB_UI_PORT", value=spark_job_ui_port), batch_models.EnvironmentSetting(name="SPARK_CONTAINER_NAME", value=spark_container_name), batch_models.EnvironmentSetting(name="SPARK_SUBMIT_LOGS_FILE", value=spark_submit_logs_file), batch_models.EnvironmentSetting(name="AZTK_GPU_ENABLED", value=helpers.bool_env(gpu_enabled)), - ] + __get_docker_credentials(spark_client) + _get_aztk_environment(cluster_id, worker_on_master, mixed_mode) + ] + __get_docker_credentials(core_base_operations) + _get_aztk_environment(cluster_id, worker_on_master, mixed_mode) # start task command command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, plugins, worker_on_master, file_shares, diff --git a/aztk/spark/client/base/helpers/get_application_log.py b/aztk/spark/client/base/helpers/get_application_log.py deleted file mode 100644 index 9065a292..00000000 --- a/aztk/spark/client/base/helpers/get_application_log.py +++ /dev/null @@ -1,9 +0,0 @@ -from aztk.spark import models - - -def get_application_log(super_type, spark_base_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): - base_application_log = super(super_type, spark_base_operations).get_application_log( - cluster_id, application_name, tail, current_bytes) - return models.ApplicationLog(base_application_log.name, base_application_log.cluster_id, base_application_log.log, - base_application_log.total_bytes, base_application_log.application_state, - base_application_log.exit_code) diff --git a/aztk/spark/client/base/operations.py b/aztk/spark/client/base/operations.py index de10fa2f..e922ff7c 100644 --- a/aztk/spark/client/base/operations.py +++ b/aztk/spark/client/base/operations.py @@ -5,20 +5,24 @@ from aztk.client.base import BaseOperations as CoreBaseOperations from aztk.spark import models -from .helpers import generate_cluster_start_task, generate_application_task, get_application_log +from .helpers import generate_cluster_start_task, generate_application_task -class SparkBaseOperations(CoreBaseOperations): +class SparkBaseOperations: + """Spark Base operations object that all other Spark operations objects inherit from + """ + #TODO: make this private or otherwise not public - def 
generate_cluster_start_task(self, - zip_resource_file: batch_models.ResourceFile, - id: str, - gpu_enabled: bool, - docker_repo: str = None, - file_shares: List[models.FileShare] = None, - plugins: List[models.PluginConfiguration] = None, - mixed_mode: bool = False, - worker_on_master: bool = True): + def _generate_cluster_start_task(self, + core_base_operations, + zip_resource_file: batch_models.ResourceFile, + id: str, + gpu_enabled: bool, + docker_repo: str = None, + file_shares: List[models.FileShare] = None, + plugins: List[models.PluginConfiguration] = None, + mixed_mode: bool = False, + worker_on_master: bool = True): """Generate the Azure Batch Start Task to provision a Spark cluster. Args: @@ -40,12 +44,11 @@ def generate_cluster_start_task(self, Returns: :obj:`azure.batch.models.StartTask`: the StartTask definition to provision the cluster. """ - return generate_cluster_start_task.generate_cluster_start_task(self, zip_resource_file, id, gpu_enabled, - docker_repo, file_shares, plugins, mixed_mode, - worker_on_master) + return generate_cluster_start_task.generate_cluster_start_task( + core_base_operations, zip_resource_file, id, gpu_enabled, docker_repo, file_shares, plugins, mixed_mode, worker_on_master) #TODO: make this private or otherwise not public - def generate_application_task(self, container_id, application, remote=False): + def _generate_application_task(self, core_base_operations, container_id, application, remote=False): """Generate the Azure Batch Start Task to provision a Spark cluster. Args: @@ -58,20 +61,4 @@ def generate_application_task(self, container_id, application, remote=False): Returns: :obj:`azure.batch.models.TaskAddParameter`: the Task definition for the Application. """ - return generate_application_task.generate_application_task(self, container_id, application, remote) - - def get_application_log(self, id: str, application_name: str, tail=False, current_bytes: int = 0): - """Get the log for a running or completed application - - Args: - id (:obj:`str`): the id of the cluster to run the command on. - application_name (:obj:`str`): str - tail (:obj:`bool`, optional): If True, get the remaining bytes after current_bytes. Otherwise, the whole log will be retrieved. - Only use this if streaming the log as it is being written. Defaults to False. - current_bytes (:obj:`int`): Specifies the last seen byte, so only the bytes after current_bytes are retrieved. - Only useful is streaming the log as it is being written. Only used if tail is True. - - Returns: - :obj:`aztk.spark.models.ApplicationLog`: a model representing the output of the application. 
- """ - return get_application_log.get_application_log(SparkBaseOperations, self, id, application_name, tail, current_bytes) + return generate_application_task.generate_application_task(core_base_operations, container_id, application, remote) diff --git a/aztk/spark/client/client.py b/aztk/spark/client/client.py index 7291738d..d799f866 100644 --- a/aztk/spark/client/client.py +++ b/aztk/spark/client/client.py @@ -25,7 +25,7 @@ class Client(CoreClient): cluster (:obj:`aztk.spark.client.cluster.ClusterOperations`): Cluster job (:obj:`aztk.spark.client.job.JobOperations`): Job """ - def __init__(self, secrets_configuration: models.SecretsConfiguration, **kwargs): + def __init__(self, secrets_configuration: models.SecretsConfiguration = None, **kwargs): self.secrets_configuration = None context = None if kwargs.get("secrets_config"): diff --git a/aztk/spark/client/cluster/helpers/copy.py b/aztk/spark/client/cluster/helpers/copy.py index e2658b6f..8795931b 100644 --- a/aztk/spark/client/cluster/helpers/copy.py +++ b/aztk/spark/client/cluster/helpers/copy.py @@ -4,10 +4,10 @@ from aztk.utils import helpers -def cluster_copy(spark_cluster_operations, cluster_id: str, source_path: str, destination_path: str, host: bool = False, internal: bool = False, timeout: int = None): +def cluster_copy(core_cluster_operations, cluster_id: str, source_path: str, destination_path: str, host: bool = False, internal: bool = False, timeout: int = None): try: container_name = None if host else 'spark' - return super(type(spark_cluster_operations), spark_cluster_operations).copy( + return core_cluster_operations.copy( cluster_id, source_path, destination_path=destination_path, diff --git a/aztk/spark/client/cluster/helpers/create.py b/aztk/spark/client/cluster/helpers/create.py index 00a38883..2f0c68f0 100644 --- a/aztk/spark/client/cluster/helpers/create.py +++ b/aztk/spark/client/cluster/helpers/create.py @@ -4,7 +4,7 @@ from aztk import error from aztk.internal.cluster_data import NodeData from aztk.spark import models -from aztk.spark.utils import util +from aztk.spark.utils import constants, util from aztk.utils import helpers POOL_ADMIN_USER_IDENTITY = batch_models.UserIdentity( @@ -26,7 +26,7 @@ def _apply_default_for_cluster_config(configuration: models.ClusterConfiguration return cluster_conf -def create_cluster(spark_cluster_operations, cluster_conf: models.ClusterConfiguration, wait: bool = False): +def create_cluster(core_cluster_operations, spark_cluster_operations, cluster_conf: models.ClusterConfiguration, wait: bool = False): """ Create a new aztk spark cluster @@ -40,26 +40,24 @@ def create_cluster(spark_cluster_operations, cluster_conf: models.ClusterConfigu cluster_conf = _apply_default_for_cluster_config(cluster_conf) cluster_conf.validate() - cluster_data = spark_cluster_operations.get_cluster_data(cluster_conf.cluster_id) + cluster_data = core_cluster_operations.get_cluster_data(cluster_conf.cluster_id) try: zip_resource_files = None node_data = NodeData(cluster_conf).add_core().done() zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file() - start_task = spark_cluster_operations.generate_cluster_start_task(zip_resource_files, cluster_conf.cluster_id, + start_task = spark_cluster_operations._generate_cluster_start_task(core_cluster_operations, zip_resource_files, cluster_conf.cluster_id, cluster_conf.gpu_enabled(), cluster_conf.get_docker_repo(), cluster_conf.file_shares, cluster_conf.plugins, cluster_conf.mixed_mode(), cluster_conf.worker_on_master) 
software_metadata_key = "spark" - vm_image = models.VmImage(publisher='Canonical', offer='UbuntuServer', sku='16.04') - - cluster = super(type(spark_cluster_operations), spark_cluster_operations).create(cluster_conf, software_metadata_key, start_task, vm_image) + cluster = core_cluster_operations.create(cluster_conf, software_metadata_key, start_task, constants.SPARK_VM_IMAGE) # Wait for the master to be ready if wait: - util.wait_for_master_to_be_ready(spark_cluster_operations, cluster.id) + util.wait_for_master_to_be_ready(core_cluster_operations, spark_cluster_operations, cluster.id) cluster = spark_cluster_operations.get(cluster.id) return cluster diff --git a/aztk/spark/client/cluster/helpers/create_user.py b/aztk/spark/client/cluster/helpers/create_user.py index d18c5080..4cac446e 100644 --- a/aztk/spark/client/cluster/helpers/create_user.py +++ b/aztk/spark/client/cluster/helpers/create_user.py @@ -4,12 +4,12 @@ from aztk.utils import helpers -def create_user(spark_cluster_operations, cluster_id: str, username: str, password: str = None, ssh_key: str = None) -> str: +def create_user(core_cluster_operations, spark_cluster_operations, cluster_id: str, username: str, password: str = None, ssh_key: str = None) -> str: try: cluster = spark_cluster_operations.get(cluster_id) master_node_id = cluster.master_node_id if not master_node_id: raise error.ClusterNotReadyError("The master has not yet been picked, a user cannot be added.") - spark_cluster_operations.create_user_on_cluster(username, cluster.id, cluster.nodes, ssh_key, password) + core_cluster_operations.create_user_on_cluster(username, cluster.id, cluster.nodes, ssh_key, password) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/delete.py b/aztk/spark/client/cluster/helpers/delete.py index 446d77bb..fe3074e6 100644 --- a/aztk/spark/client/cluster/helpers/delete.py +++ b/aztk/spark/client/cluster/helpers/delete.py @@ -4,8 +4,8 @@ from aztk.utils import helpers -def delete_cluster(spark_cluster_operations, cluster_id: str, keep_logs: bool = False): +def delete_cluster(core_cluster_operations, cluster_id: str, keep_logs: bool = False): try: - return super(type(spark_cluster_operations), spark_cluster_operations).delete(cluster_id, keep_logs) + return core_cluster_operations.delete(cluster_id, keep_logs) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/download.py b/aztk/spark/client/cluster/helpers/download.py index 82bd12c2..3ecb5dfd 100644 --- a/aztk/spark/client/cluster/helpers/download.py +++ b/aztk/spark/client/cluster/helpers/download.py @@ -5,10 +5,10 @@ from aztk.utils import helpers -def cluster_download(spark_cluster_operations, cluster_id: str, source_path: str, destination_path: str = None, host: bool = False, internal: bool = False, timeout: int = None): +def cluster_download(core_cluster_operations, cluster_id: str, source_path: str, destination_path: str = None, host: bool = False, internal: bool = False, timeout: int = None): try: container_name = None if host else 'spark' - return super(type(spark_cluster_operations), spark_cluster_operations).copy(cluster_id, + return core_cluster_operations.copy(cluster_id, source_path, destination_path=destination_path, container_name=container_name, diff --git a/aztk/spark/client/cluster/helpers/get.py b/aztk/spark/client/cluster/helpers/get.py index 4eae4ba6..11cbbe68 100644 --- 
a/aztk/spark/client/cluster/helpers/get.py +++ b/aztk/spark/client/cluster/helpers/get.py @@ -5,9 +5,9 @@ from aztk.utils import helpers -def get_cluster(spark_cluster_operations, cluster_id: str): +def get_cluster(core_cluster_operations, cluster_id: str): try: - pool, nodes = super(type(spark_cluster_operations), spark_cluster_operations).get(cluster_id) - return models.Cluster(pool, nodes) + cluster = core_cluster_operations.get(cluster_id) + return models.Cluster(cluster) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/get_application_log.py b/aztk/spark/client/cluster/helpers/get_application_log.py new file mode 100644 index 00000000..4ec73fe8 --- /dev/null +++ b/aztk/spark/client/cluster/helpers/get_application_log.py @@ -0,0 +1,7 @@ +from aztk.spark import models + + +def get_application_log(core_base_operations, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): + base_application_log = core_base_operations.get_application_log( + cluster_id, application_name, tail, current_bytes) + return models.ApplicationLog(base_application_log) diff --git a/aztk/spark/client/cluster/helpers/get_application_status.py b/aztk/spark/client/cluster/helpers/get_application_status.py index d8b4ac75..4dc19106 100644 --- a/aztk/spark/client/cluster/helpers/get_application_status.py +++ b/aztk/spark/client/cluster/helpers/get_application_status.py @@ -4,9 +4,9 @@ from aztk.utils import helpers -def get_application_status(spark_cluster_operations, cluster_id: str, app_name: str): +def get_application_status(core_cluster_operations, cluster_id: str, app_name: str): try: - task = spark_cluster_operations.batch_client.task.get(cluster_id, app_name) + task = core_cluster_operations.batch_client.task.get(cluster_id, app_name) return task.state._value_ except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/get_remote_login_settings.py b/aztk/spark/client/cluster/helpers/get_remote_login_settings.py new file mode 100644 index 00000000..3a7b0d85 --- /dev/null +++ b/aztk/spark/client/cluster/helpers/get_remote_login_settings.py @@ -0,0 +1,12 @@ +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.spark import models +from aztk.utils import helpers + + +def get_remote_login_settings(core_cluster_operations, id: str, node_id: str): + try: + return models.RemoteLogin(core_cluster_operations.get_remote_login_settings(id, node_id)) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/list.py b/aztk/spark/client/cluster/helpers/list.py index 7fa72864..8fff81fe 100644 --- a/aztk/spark/client/cluster/helpers/list.py +++ b/aztk/spark/client/cluster/helpers/list.py @@ -5,9 +5,9 @@ from aztk.utils import helpers -def list_clusters(spark_cluster_operations): +def list_clusters(core_cluster_operations): try: software_metadata_key = "spark" - return [models.Cluster(pool) for pool in super(type(spark_cluster_operations), spark_cluster_operations).list(software_metadata_key)] + return [models.Cluster(cluster) for cluster in core_cluster_operations.list(software_metadata_key)] except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/node_run.py 
b/aztk/spark/client/cluster/helpers/node_run.py index 3060f3e1..623bb57d 100644 --- a/aztk/spark/client/cluster/helpers/node_run.py +++ b/aztk/spark/client/cluster/helpers/node_run.py @@ -4,7 +4,7 @@ from aztk.utils import helpers -def node_run(spark_cluster_operations, +def node_run(core_cluster_operations, cluster_id: str, node_id: str, command: str, @@ -12,7 +12,7 @@ def node_run(spark_cluster_operations, internal: bool = False, timeout=None): try: - return super(type(spark_cluster_operations), spark_cluster_operations).node_run( + return core_cluster_operations.node_run( cluster_id, node_id, command, internal, container_name='spark' if not host else None, timeout=timeout) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/run.py b/aztk/spark/client/cluster/helpers/run.py index 385a6c90..a3677b83 100644 --- a/aztk/spark/client/cluster/helpers/run.py +++ b/aztk/spark/client/cluster/helpers/run.py @@ -4,9 +4,9 @@ from aztk.utils import helpers -def cluster_run(spark_cluster_operations, cluster_id: str, command: str, host=False, internal: bool = False, timeout=None): +def cluster_run(core_cluster_operations, cluster_id: str, command: str, host=False, internal: bool = False, timeout=None): try: - return super(type(spark_cluster_operations), spark_cluster_operations).run( + return core_cluster_operations.run( cluster_id, command, internal, container_name='spark' if not host else None, timeout=timeout) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/helpers/submit.py b/aztk/spark/client/cluster/helpers/submit.py index 93d50af0..0da03eff 100644 --- a/aztk/spark/client/cluster/helpers/submit.py +++ b/aztk/spark/client/cluster/helpers/submit.py @@ -7,40 +7,41 @@ from aztk.utils import helpers -def __get_node(spark_cluster_operations, node_id: str, cluster_id: str) -> batch_models.ComputeNode: - return spark_cluster_operations.batch_client.compute_node.get(cluster_id, node_id) +def __get_node(core_cluster_operations, node_id: str, cluster_id: str) -> batch_models.ComputeNode: + return core_cluster_operations.batch_client.compute_node.get(cluster_id, node_id) -def affinitize_task_to_master(spark_cluster_operations, cluster_id, task): +def affinitize_task_to_master(core_cluster_operations, spark_cluster_operations, cluster_id, task): cluster = spark_cluster_operations.get(cluster_id) if cluster.master_node_id is None: raise AztkError("Master has not yet been selected. 
Please wait until the cluster is finished provisioning.") - master_node = spark_cluster_operations.batch_client.compute_node.get(pool_id=cluster_id, node_id=cluster.master_node_id) + master_node = core_cluster_operations.batch_client.compute_node.get(pool_id=cluster_id, node_id=cluster.master_node_id) task.affinity_info = batch_models.AffinityInformation(affinity_id=master_node.affinity_id) return task -def submit_application(spark_cluster_operations, cluster_id, application, remote: bool = False, wait: bool = False): +def submit_application(core_cluster_operations, spark_cluster_operations, cluster_id, application, remote: bool = False, wait: bool = False): """ Submit a spark app """ - task = spark_cluster_operations.generate_application_task(cluster_id, application, remote) - task = affinitize_task_to_master(spark_cluster_operations, cluster_id, task) + task = spark_cluster_operations._generate_application_task(core_cluster_operations, cluster_id, application, remote) + task = affinitize_task_to_master(core_cluster_operations, spark_cluster_operations, cluster_id, task) # Add task to batch job (which has the same name as cluster_id) job_id = cluster_id - spark_cluster_operations.batch_client.task.add(job_id=job_id, task=task) + core_cluster_operations.batch_client.task.add(job_id=job_id, task=task) if wait: - helpers.wait_for_task_to_complete(job_id=job_id, task_id=task.id, batch_client=spark_cluster_operations.batch_client) + helpers.wait_for_task_to_complete(job_id=job_id, task_id=task.id, batch_client=core_cluster_operations.batch_client) -def submit(spark_cluster_operations, +def submit(core_cluster_operations, + spark_cluster_operations, cluster_id: str, application: models.ApplicationConfiguration, remote: bool = False, wait: bool = False): try: - submit_application(spark_cluster_operations, cluster_id, application, remote, wait) + submit_application(core_cluster_operations, spark_cluster_operations, cluster_id, application, remote, wait) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/operations.py b/aztk/spark/client/cluster/operations.py index f7bf502a..03c8e73e 100644 --- a/aztk/spark/client/cluster/operations.py +++ b/aztk/spark/client/cluster/operations.py @@ -2,11 +2,22 @@ from aztk.spark import models from aztk.spark.client.base import SparkBaseOperations -from .helpers import (copy, create, create_user, delete, diagnostics, download, - get, get_application_status, list, node_run, run, submit) +from .helpers import (copy, create, create_user, delete, diagnostics, download, get, get_application_log, + get_application_status, get_remote_login_settings, list, node_run, run, submit) -class ClusterOperations(CoreClusterOperations, SparkBaseOperations): +class ClusterOperations(SparkBaseOperations): + """Spark ClusterOperations object + + Attributes: + _core_cluster_operations (:obj:`aztk.client.cluster.CoreClusterOperations`): + # _spark_base_cluster_operations (:obj:`aztk.spark.client.cluster.CoreClusterOperations`): + """ + + def __init__(self, context): + self._core_cluster_operations = CoreClusterOperations(context) + # self._spark_base_cluster_operations = SparkBaseOperations() + def create(self, cluster_configuration: models.ClusterConfiguration, wait: bool = False): """Create a cluster. 
@@ -17,7 +28,7 @@ def create(self, cluster_configuration: models.ClusterConfiguration, wait: bool Returns: :obj:`aztk.spark.models.Cluster`: An Cluster object representing the state and configuration of the cluster. """ - return create.create_cluster(self, cluster_configuration, wait) + return create.create_cluster(self._core_cluster_operations, self, cluster_configuration, wait) def delete(self, id: str, keep_logs: bool = False): """Delete a cluster. @@ -29,7 +40,7 @@ def delete(self, id: str, keep_logs: bool = False): Returns: :obj:`bool`: True if the deletion process was successful. """ - return delete.delete_cluster(self, id, keep_logs) + return delete.delete_cluster(self._core_cluster_operations, id, keep_logs) def get(self, id: str): """Get details about the state of a cluster. @@ -40,7 +51,7 @@ def get(self, id: str): Returns: :obj:`aztk.spark.models.Cluster`: A Cluster object representing the state and configuration of the cluster. """ - return get.get_cluster(self, id) + return get.get_cluster(self._core_cluster_operations, id) def list(self): """List all clusters. @@ -48,7 +59,7 @@ def list(self): Returns: :obj:`List[aztk.spark.models.Cluster]`: List of Cluster objects each representing the state and configuration of the cluster. """ - return list.list_clusters(self) + return list.list_clusters(self._core_cluster_operations) def submit(self, id: str, application: models.ApplicationConfiguration, remote: bool = False, wait: bool = False): """Submit an application to a cluster. @@ -64,7 +75,7 @@ def submit(self, id: str, application: models.ApplicationConfiguration, remote: Returns: :obj:`None` """ - return submit.submit(self, id, application, remote, wait) + return submit.submit(self._core_cluster_operations, self, id, application, remote, wait) def create_user(self, id: str, username: str, password: str = None, ssh_key: str = None): """Create a user on every node in the cluster @@ -90,7 +101,7 @@ def get_application_status(self, id: str, application_name: str): Returns: :obj:`str`: the status state of the application """ - return get_application_status.get_application_status(self, id, application_name) + return get_application_status.get_application_status(self._core_cluster_operations, id, application_name) def run(self, id: str, command: str, host=False, internal: bool = False, timeout=None): """Run a bash command on every node in the cluster @@ -108,7 +119,7 @@ def run(self, id: str, command: str, host=False, internal: bool = False, timeout Returns: :obj:`List[aztk.spark.models.NodeOutput]`: list of NodeOutput objects containing the output of the run command """ - return run.cluster_run(self, id, command, host, internal, timeout) + return run.cluster_run(self._core_cluster_operations, id, command, host, internal, timeout) def node_run(self, id: str, node_id: str, command: str, host=False, internal: bool = False, timeout=None): """Run a bash command on the given node @@ -127,7 +138,7 @@ def node_run(self, id: str, node_id: str, command: str, host=False, internal: bo Returns: :obj:`aztk.spark.models.NodeOutput`: object containing the output of the run command """ - return node_run.node_run(self, id, node_id, command, host, internal, timeout) + return node_run.node_run(self._core_cluster_operations, id, node_id, command, host, internal, timeout) def copy(self, id: str, @@ -152,7 +163,7 @@ def copy(self, Returns: :obj:`List[aztk.spark.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command. 
""" - return copy.cluster_copy(self, id, source_path, destination_path, host, internal, timeout) + return copy.cluster_copy(self._core_cluster_operations, id, source_path, destination_path, host, internal, timeout) def download(self, id: str, @@ -179,7 +190,8 @@ def download(self, Returns: :obj:`List[aztk.spark.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command. """ - return download.cluster_download(self, id, source_path, destination_path, host, internal, timeout) + return download.cluster_download(self._core_cluster_operations, id, source_path, destination_path, host, internal, + timeout) def diagnostics(self, id, output_directory=None): """Download a file from every node in a cluster. @@ -194,3 +206,31 @@ def diagnostics(self, id, output_directory=None): :obj:`List[aztk.spark.models.NodeOutput]`: A list of NodeOutput objects representing the output of the copy command. """ return diagnostics.run_cluster_diagnostics(self, id, output_directory) + + def get_application_log(self, id: str, application_name: str, tail=False, current_bytes: int = 0): + """Get the log for a running or completed application + + Args: + id (:obj:`str`): the id of the cluster to run the command on. + application_name (:obj:`str`): str + tail (:obj:`bool`, optional): If True, get the remaining bytes after current_bytes. Otherwise, the whole log will be retrieved. + Only use this if streaming the log as it is being written. Defaults to False. + current_bytes (:obj:`int`): Specifies the last seen byte, so only the bytes after current_bytes are retrieved. + Only useful is streaming the log as it is being written. Only used if tail is True. + + Returns: + :obj:`aztk.spark.models.ApplicationLog`: a model representing the output of the application. 
+ """ + return get_application_log.get_application_log(self._core_cluster_operations, id, application_name, tail, current_bytes) + + def get_remote_login_settings(self, id: str, node_id: str): + """Get the remote login information for a node in a cluster + + Args: + id (:obj:`str`): the id of the cluster the node is in + node_id (:obj:`str`): the id of the node in the cluster + + Returns: + :obj:`aztk.spark.models.RemoteLogin`: Object that contains the IP address and port combination used to log in to a node + """ + return get_remote_login_settings.get_remote_login_settings(self._core_cluster_operations, id, node_id) diff --git a/aztk/spark/client/job/helpers/delete.py b/aztk/spark/client/job/helpers/delete.py index eb0f7658..e2ad8be2 100644 --- a/aztk/spark/client/job/helpers/delete.py +++ b/aztk/spark/client/job/helpers/delete.py @@ -8,32 +8,32 @@ from .get_recent_job import get_recent_job -def _delete(spark_job_operations, job_id, keep_logs: bool = False): - recent_run_job = get_recent_job(spark_job_operations, job_id) +def _delete(core_job_operations, spark_job_operations, job_id, keep_logs: bool = False): + recent_run_job = get_recent_job(core_job_operations, job_id) deleted_job_or_job_schedule = False # delete job try: - spark_job_operations.batch_client.job.delete(recent_run_job.id) + core_job_operations.batch_client.job.delete(recent_run_job.id) deleted_job_or_job_schedule = True except batch_models.batch_error.BatchErrorException: pass # delete job_schedule try: - spark_job_operations.batch_client.job_schedule.delete(job_id) + core_job_operations.batch_client.job_schedule.delete(job_id) deleted_job_or_job_schedule = True except batch_models.batch_error.BatchErrorException: pass # delete storage container if keep_logs: - cluster_data = spark_job_operations.get_cluster_data(job_id) + cluster_data = core_job_operations.get_cluster_data(job_id) cluster_data.delete_container(job_id) return deleted_job_or_job_schedule -def delete(spark_job_operations, job_id: str, keep_logs: bool = False): +def delete(core_job_operations, spark_job_operations, job_id: str, keep_logs: bool = False): try: - return _delete(spark_job_operations, job_id, keep_logs) + return _delete(core_job_operations, spark_job_operations, job_id, keep_logs) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/helpers/get.py b/aztk/spark/client/job/helpers/get.py index 94fab362..2be9b55e 100644 --- a/aztk/spark/client/job/helpers/get.py +++ b/aztk/spark/client/job/helpers/get.py @@ -7,26 +7,26 @@ from .get_recent_job import get_recent_job -def _get_job(spark_job_operations, job_id): - job = spark_job_operations.batch_client.job_schedule.get(job_id) +def _get_job(core_job_operations, job_id): + job = core_job_operations.batch_client.job_schedule.get(job_id) job_apps = [ - app for app in spark_job_operations.batch_client.task.list(job_id=job.execution_info.recent_job.id) if app.id != job_id + app for app in core_job_operations.batch_client.task.list(job_id=job.execution_info.recent_job.id) if app.id != job_id ] - recent_run_job = get_recent_job(spark_job_operations, job_id) + recent_run_job = get_recent_job(core_job_operations, job_id) pool_prefix = recent_run_job.pool_info.auto_pool_specification.auto_pool_id_prefix pool = nodes = None - for cloud_pool in spark_job_operations.batch_client.pool.list(): + for cloud_pool in core_job_operations.batch_client.pool.list(): if pool_prefix in cloud_pool.id: pool = cloud_pool break if pool: -            nodes = 
spark_job_operations.batch_client.compute_node.list(pool_id=pool.id) + nodes = core_job_operations.batch_client.compute_node.list(pool_id=pool.id) return job, job_apps, pool, nodes -def get_job(spark_job_operations, job_id): +def get_job(core_job_operations, job_id): try: - job, apps, pool, nodes = _get_job(spark_job_operations, job_id) + job, apps, pool, nodes = _get_job(core_job_operations, job_id) return models.Job(job, apps, pool, nodes) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/helpers/get_application.py b/aztk/spark/client/job/helpers/get_application.py index 4a514302..cbee81d9 100644 --- a/aztk/spark/client/job/helpers/get_application.py +++ b/aztk/spark/client/job/helpers/get_application.py @@ -10,9 +10,9 @@ def _get_application(spark_job_operations, job_id, application_name): # info about the app - recent_run_job = get_recent_job(spark_job_operations, job_id) + recent_run_job = get_recent_job(spark_job_operations._core_job_operations, job_id) try: - return spark_job_operations.batch_client.task.get(job_id=recent_run_job.id, task_id=application_name) + return spark_job_operations._core_job_operations.batch_client.task.get(job_id=recent_run_job.id, task_id=application_name) except batch_models.batch_error.BatchErrorException: raise error.AztkError( "The Spark application {0} is still being provisioned or does not exist.".format(application_name)) diff --git a/aztk/spark/client/job/helpers/get_application_log.py b/aztk/spark/client/job/helpers/get_application_log.py index af2406cc..8c1855d9 100644 --- a/aztk/spark/client/job/helpers/get_application_log.py +++ b/aztk/spark/client/job/helpers/get_application_log.py @@ -9,13 +9,13 @@ from .get_recent_job import get_recent_job -def _get_application_log(spark_job_operations, job_id, application_name): +def _get_application_log(core_job_operations, spark_job_operations, job_id, application_name): # TODO: change where the logs are uploaded so they aren't overwritten on scheduled runs # current: job_id, application_name/output.log # new: job_id, recent_run_job.id/application_name/output.log - recent_run_job = get_recent_job(spark_job_operations, job_id) + recent_run_job = get_recent_job(core_job_operations, job_id) try: - task = spark_job_operations.batch_client.task.get(job_id=recent_run_job.id, task_id=application_name) + task = core_job_operations.batch_client.task.get(job_id=recent_run_job.id, task_id=application_name) except batch_models.batch_error.BatchErrorException as e: # see if the application is written to metadata of pool applications = spark_job_operations.list_applications(job_id) @@ -29,11 +29,12 @@ def _get_application_log(spark_job_operations, job_id, application_name): batch_models.TaskState.preparing): raise error.AztkError("The application {0} has not yet finished executing.".format(application_name)) - return super(type(spark_job_operations), spark_job_operations).get_application_log(job_id, application_name) + return core_job_operations.get_application_log(job_id, application_name) -def get_job_application_log(spark_job_operations, job_id, application_name): +def get_job_application_log(core_job_operations, spark_job_operations, job_id, application_name): try: - return _get_application_log(spark_job_operations, job_id, application_name) + return models.ApplicationLog( + _get_application_log(core_job_operations, spark_job_operations, job_id, application_name)) except batch_error.BatchErrorException as e: raise 
error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/helpers/get_recent_job.py b/aztk/spark/client/job/helpers/get_recent_job.py index 62a5082d..92f763e5 100644 --- a/aztk/spark/client/job/helpers/get_recent_job.py +++ b/aztk/spark/client/job/helpers/get_recent_job.py @@ -1,3 +1,3 @@ -def get_recent_job(spark_job_operations, job_id): - job_schedule = spark_job_operations.batch_client.job_schedule.get(job_id) - return spark_job_operations.batch_client.job.get(job_schedule.execution_info.recent_job.id) +def get_recent_job(core_job_operations, job_id): + job_schedule = core_job_operations.batch_client.job_schedule.get(job_id) + return core_job_operations.batch_client.job.get(job_schedule.execution_info.recent_job.id) diff --git a/aztk/spark/client/job/helpers/list.py b/aztk/spark/client/job/helpers/list.py index 8b511c4f..146c465d 100644 --- a/aztk/spark/client/job/helpers/list.py +++ b/aztk/spark/client/job/helpers/list.py @@ -5,12 +5,12 @@ from aztk.utils import helpers -def _list_jobs(spark_job_operations): - return [cloud_job_schedule for cloud_job_schedule in spark_job_operations.batch_client.job_schedule.list()] +def _list_jobs(core_job_operations): + return [cloud_job_schedule for cloud_job_schedule in core_job_operations.batch_client.job_schedule.list()] -def list_jobs(self): +def list_jobs(core_job_operations): try: - return [models.Job(cloud_job_schedule) for cloud_job_schedule in _list_jobs(self)] + return [models.Job(cloud_job_schedule) for cloud_job_schedule in _list_jobs(core_job_operations)] except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/helpers/list_applications.py b/aztk/spark/client/job/helpers/list_applications.py index aca5cc16..81dab6cc 100644 --- a/aztk/spark/client/job/helpers/list_applications.py +++ b/aztk/spark/client/job/helpers/list_applications.py @@ -7,8 +7,8 @@ from .get_recent_job import get_recent_job -def _list_applications(spark_job_operations, job_id): - recent_run_job = get_recent_job(spark_job_operations, job_id) +def _list_applications(core_job_operations, job_id): + recent_run_job = get_recent_job(core_job_operations, job_id) # get application names from Batch job metadata applications = {} for metadata_item in recent_run_job.metadata: @@ -17,16 +17,16 @@ def _list_applications(spark_job_operations, job_id): applications[app_name] = None # get tasks from Batch job - for task in spark_job_operations.batch_client.task.list(recent_run_job.id): + for task in core_job_operations.batch_client.task.list(recent_run_job.id): if task.id != job_id: applications[task.id] = task return applications -def list_applications(spark_job_operations, job_id): +def list_applications(core_job_operations, job_id): try: - applications = _list_applications(spark_job_operations, job_id) + applications = _list_applications(core_job_operations, job_id) for item in applications: if applications[item]: applications[item] = models.Application(applications[item]) diff --git a/aztk/spark/client/job/helpers/stop.py b/aztk/spark/client/job/helpers/stop.py index 8e21c416..8fd7660e 100644 --- a/aztk/spark/client/job/helpers/stop.py +++ b/aztk/spark/client/job/helpers/stop.py @@ -7,12 +7,12 @@ from .get_recent_job import get_recent_job -def _stop(spark_job_operations, job_id): +def _stop(core_job_operations, job_id): # terminate currently running job and tasks - recent_run_job = get_recent_job(spark_job_operations, job_id) - 
spark_job_operations.batch_client.job.terminate(recent_run_job.id) + recent_run_job = get_recent_job(core_job_operations, job_id) + core_job_operations.batch_client.job.terminate(recent_run_job.id) # terminate job_schedule - spark_job_operations.batch_client.job_schedule.terminate(job_id) + core_job_operations.batch_client.job_schedule.terminate(job_id) def stop(self, job_id): diff --git a/aztk/spark/client/job/helpers/stop_application.py b/aztk/spark/client/job/helpers/stop_application.py index a55269ac..bc9c9611 100644 --- a/aztk/spark/client/job/helpers/stop_application.py +++ b/aztk/spark/client/job/helpers/stop_application.py @@ -5,12 +5,12 @@ from aztk.utils import helpers from .get_recent_job import get_recent_job -def stop_app(spark_job_operations, job_id, application_name): - recent_run_job = get_recent_job(spark_job_operations, job_id) +def stop_app(core_job_operations, job_id, application_name): + recent_run_job = get_recent_job(core_job_operations, job_id) # stop batch task try: - spark_job_operations.batch_client.task.terminate(job_id=recent_run_job.id, task_id=application_name) + core_job_operations.batch_client.task.terminate(job_id=recent_run_job.id, task_id=application_name) return True except batch_error.BatchErrorException: return False diff --git a/aztk/spark/client/job/helpers/submit.py b/aztk/spark/client/job/helpers/submit.py index 15ffa667..f1341812 100644 --- a/aztk/spark/client/job/helpers/submit.py +++ b/aztk/spark/client/job/helpers/submit.py @@ -22,7 +22,7 @@ def __app_cmd(): return docker_exec.to_str() -def generate_job_manager_task(spark_client, job, application_tasks): +def generate_job_manager_task(core_job_operations, job, application_tasks): resource_files = [] for application, task in application_tasks: task_definition_resource_file = helpers.upload_text_to_container( @@ -30,7 +30,7 @@ def generate_job_manager_task(spark_client, job, application_tasks): application_name=application.name + '.yaml', file_path=application.name + '.yaml', content=yaml.dump(task), - blob_client=spark_client.blob_client) + blob_client=core_job_operations.blob_client) resource_files.append(task_definition_resource_file) task_cmd = __app_cmd() @@ -63,15 +63,16 @@ def _apply_default_for_job_config(job_conf: models.JobConfiguration): return job_conf -def submit_job(spark_job_operations, job_configuration: models.JobConfiguration): +def submit_job(core_job_operations, spark_job_operations, job_configuration: models.JobConfiguration): try: job_configuration = _apply_default_for_job_config(job_configuration) job_configuration.validate() - cluster_data = spark_job_operations.get_cluster_data(job_configuration.id) + cluster_data = core_job_operations.get_cluster_data(job_configuration.id) node_data = NodeData(job_configuration.to_cluster_config()).add_core().done() zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file() - start_task = spark_job_operations.generate_cluster_start_task( + start_task = spark_job_operations._generate_cluster_start_task( + core_job_operations, zip_resource_files, job_configuration.id, job_configuration.gpu_enabled, @@ -82,10 +83,10 @@ def submit_job(spark_job_operations, job_configuration: models.JobConfiguration) application_tasks = [] for application in job_configuration.applications: application_tasks.append((application, - spark_job_operations.generate_application_task(job_configuration.id, + spark_job_operations._generate_application_task(core_job_operations, job_configuration.id, application))) - job_manager_task = 
generate_job_manager_task(spark_job_operations, job_configuration, application_tasks) +    job_manager_task = generate_job_manager_task(core_job_operations, job_configuration, application_tasks) software_metadata_key = "spark" @@ -96,7 +97,7 @@ def submit_job(spark_job_operations, job_configuration: models.JobConfiguration) job_configuration.max_dedicated_nodes, job_configuration.max_low_pri_nodes) - job = super(type(spark_job_operations), spark_job_operations).submit( + job = core_job_operations.submit( job_configuration=job_configuration, start_task=start_task, job_manager_task=job_manager_task, diff --git a/aztk/spark/client/job/helpers/wait_until_complete.py b/aztk/spark/client/job/helpers/wait_until_complete.py index 0f58ea3b..5fcc8ae3 100644 --- a/aztk/spark/client/job/helpers/wait_until_complete.py +++ b/aztk/spark/client/job/helpers/wait_until_complete.py @@ -7,16 +7,16 @@ from aztk.utils import helpers -def _wait_until_job_finished(spark_job_operations, job_id): - job_state = spark_job_operations.batch_client.job_schedule.get(job_id).state +def _wait_until_job_finished(core_job_operations, job_id): + job_state = core_job_operations.batch_client.job_schedule.get(job_id).state while job_state != batch_models.JobScheduleState.completed: time.sleep(3) - job_state = spark_job_operations.batch_client.job_schedule.get(job_id).state + job_state = core_job_operations.batch_client.job_schedule.get(job_id).state -def wait_until_job_finished(spark_job_operations, job_id): +def wait_until_job_finished(core_job_operations, job_id): try: - _wait_until_job_finished(spark_job_operations, job_id) + _wait_until_job_finished(core_job_operations, job_id) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/operations.py b/aztk/spark/client/job/operations.py index a8869ac7..a9a1d1b7 100644 --- a/aztk/spark/client/job/operations.py +++ b/aztk/spark/client/job/operations.py @@ -6,14 +6,25 @@ stop_application, submit, wait_until_complete) -class JobOperations(CoreJobOperations, SparkBaseOperations): +class JobOperations(SparkBaseOperations): +    """Spark JobOperations object + +    Attributes: +        _core_job_operations (:obj:`aztk.client.job.CoreJobOperations`): +    """ + +    def __init__(self, context): +        self._core_job_operations = CoreJobOperations(context) +        # self._spark_base_cluster_operations = SparkBaseOperations() + +    def list(self): """List all jobs. Returns: :obj:`List[Job]`: List of aztk.models.Job objects each representing the state and configuration of the job. """ -        return list.list_jobs(self) +        return list.list_jobs(self._core_job_operations) def delete(self, id, keep_logs: bool = False): """Delete a job. @@ -25,7 +36,7 @@ def delete(self, id, keep_logs: bool = False): Returns: :obj:`bool`: True if the deletion process was successful. """ -        return delete.delete(self, id, keep_logs) +        return delete.delete(self._core_job_operations, self, id, keep_logs) def get(self, id): """Get details about the state of a job. @@ -36,7 +47,7 @@ def get(self, id): Returns: :obj:`aztk.spark.models.job`: A job object representing the state and configuration of the job. """ -        return get.get_job(self, id) +        return get.get_job(self._core_job_operations, id) def get_application(self, id, application_name): """Get information on a submitted application @@ -60,7 +71,7 @@ def get_application_log(self, id, application_name): Returns: :obj:`aztk.spark.models.ApplicationLog`: a model representing the output of the application. 
""" - return get_application_log.get_job_application_log(self, id, application_name) + return get_application_log.get_job_application_log(self._core_job_operations, self, id, application_name) def list_applications(self, id): """List all application defined as a part of a job @@ -71,7 +82,7 @@ def list_applications(self, id): Returns: :obj:`List[aztk.spark.models.Application]`: a list of all applications defined as a part of the job """ - return list_applications.list_applications(self, id) + return list_applications.list_applications(self._core_job_operations, id) def stop(self, id): """Stop a submitted job @@ -82,7 +93,7 @@ def stop(self, id): Returns: :obj:`None` """ - return stop.stop(self, id) + return stop.stop(self._core_job_operations, id) def stop_application(self, id, application_name): """Stops a submitted application @@ -94,7 +105,7 @@ def stop_application(self, id, application_name): Returns: :obj:`bool`: True if the stop was successful, else False """ - return stop_application.stop_app(self, id, application_name) + return stop_application.stop_app(self._core_job_operations, id, application_name) def submit(self, job_configuration: models.JobConfiguration): """Submit a job @@ -109,8 +120,8 @@ def submit(self, job_configuration: models.JobConfiguration): Returns: :obj:`aztk.spark.models.Job`: Model representing the state of the job. """ - return submit.submit_job(self, job_configuration) + return submit.submit_job(self._core_job_operations, self, job_configuration) #TODO: rename to something better or make this a parameter of submit def wait_until_job_finished(self, id): - wait_until_complete.wait_until_job_finished(self, id) + wait_until_complete.wait_until_job_finished(self._core_job_operations, id) diff --git a/aztk/spark/models/models.py b/aztk/spark/models/models.py index c1ae1105..26c44b8d 100644 --- a/aztk/spark/models/models.py +++ b/aztk/spark/models/models.py @@ -17,10 +17,10 @@ def __init__(self, version: str, environment: str = None, environment_version: s class Cluster(aztk.models.Cluster): - def __init__(self, pool: batch_models.CloudPool = None, nodes: batch_models.ComputeNodePaged = None): - super().__init__(pool, nodes) + def __init__(self, cluster: aztk.models.Cluster): + super().__init__(cluster.pool, cluster.nodes) self.master_node_id = self.__get_master_node_id() - self.gpu_enabled = helpers.is_gpu_enabled(pool.vm_size) + self.gpu_enabled = helpers.is_gpu_enabled(cluster.pool.vm_size) def is_pool_running_spark(self, pool: batch_models.CloudPool): if pool.metadata is None: @@ -47,7 +47,9 @@ def __get_master_node_id(self): class RemoteLogin(aztk.models.RemoteLogin): - pass + def __init__(self, remote_login: aztk.models.RemoteLogin): + super().__init__(remote_login.ip_address, remote_login.port) + class PortForwardingSpecification(aztk.models.PortForwardingSpecification): pass @@ -286,10 +288,16 @@ def __init__(self, cloud_job_schedule: batch_models.CloudJobSchedule, self.creation_time = cloud_job_schedule.creation_time self.applications = [Application(task) for task in (cloud_tasks or [])] if pool: - self.cluster = Cluster(pool, nodes) + self.cluster = Cluster(aztk.models.Cluster(pool, nodes)) else: self.cluster = None class ApplicationLog(aztk.models.ApplicationLog): - pass \ No newline at end of file + def __init__(self, application_log: aztk.models.ApplicationLog): + self.name = application_log.name + self.cluster_id = application_log.cluster_id # TODO: change to something cluster/job agnostic + self.log = application_log.log + self.total_bytes = 
application_log.total_bytes + self.application_state = application_log.application_state + self.exit_code = application_log.exit_code diff --git a/aztk/spark/utils/constants.py b/aztk/spark/utils/constants.py new file mode 100644 index 00000000..831abf84 --- /dev/null +++ b/aztk/spark/utils/constants.py @@ -0,0 +1,3 @@ +from aztk.spark import models + +SPARK_VM_IMAGE = models.VmImage(publisher='Canonical', offer='UbuntuServer', sku='16.04') diff --git a/aztk/spark/utils/util.py b/aztk/spark/utils/util.py index 17ef6173..7d72239c 100644 --- a/aztk/spark/utils/util.py +++ b/aztk/spark/utils/util.py @@ -17,18 +17,18 @@ class MasterInvalidStateError(Exception): pass -def wait_for_master_to_be_ready(client, cluster_id: str): +def wait_for_master_to_be_ready(core_operations, spark_operations, cluster_id: str): master_node_id = None start_time = datetime.datetime.now() while True: if not master_node_id: - master_node_id = client.get(cluster_id).master_node_id + master_node_id = spark_operations.get(cluster_id).master_node_id if not master_node_id: time.sleep(5) continue - master_node = client.batch_client.compute_node.get(cluster_id, master_node_id) + master_node = core_operations.batch_client.compute_node.get(cluster_id, master_node_id) if master_node.state in [batch_models.ComputeNodeState.idle, batch_models.ComputeNodeState.running]: break diff --git a/tests/integration_tests/spark/sdk/job/test_job.py b/tests/integration_tests/spark/sdk/job/test_job.py index 0350c6cb..00d04b62 100644 --- a/tests/integration_tests/spark/sdk/job/test_job.py +++ b/tests/integration_tests/spark/sdk/job/test_job.py @@ -19,12 +19,12 @@ def test_submit_job(): app1 = aztk.spark.models.ApplicationConfiguration( name="pipy100", application="./examples/src/main/python/pi.py", - application_args=[100] + application_args=[10] ) app2 = aztk.spark.models.ApplicationConfiguration( name="pipy101", application="./examples/src/main/python/pi.py", - application_args=[100] + application_args=[10] ) job_configuration = aztk.spark.models.JobConfiguration( id=test_id+base_job_id, @@ -54,12 +54,12 @@ def test_list_jobs(): app1 = aztk.spark.models.ApplicationConfiguration( name="pipy100", application="./examples/src/main/python/pi.py", - application_args=[100] + application_args=[10] ) app2 = aztk.spark.models.ApplicationConfiguration( name="pipy101", application="./examples/src/main/python/pi.py", - application_args=[100] + application_args=[10] ) job_configuration = aztk.spark.models.JobConfiguration( id=test_id+base_job_id, @@ -93,12 +93,12 @@ def test_list_applications(): app1 = aztk.spark.models.ApplicationConfiguration( name="pipy100", application="./examples/src/main/python/pi.py", - application_args=[100] + application_args=[10] ) app2 = aztk.spark.models.ApplicationConfiguration( name="pipy101", application="./examples/src/main/python/pi.py", - application_args=[100] + application_args=[10] ) job_configuration = aztk.spark.models.JobConfiguration( id=test_id+base_job_id, @@ -133,12 +133,12 @@ def test_get_job(): app1 = aztk.spark.models.ApplicationConfiguration( name="pipy100", application="./examples/src/main/python/pi.py", - application_args=[100] + application_args=[10] ) app2 = aztk.spark.models.ApplicationConfiguration( name="pipy101", application="./examples/src/main/python/pi.py", - application_args=[100] + application_args=[10] ) job_configuration = aztk.spark.models.JobConfiguration( id=test_id+base_job_id, @@ -172,7 +172,7 @@ def test_get_application(): app1 = aztk.spark.models.ApplicationConfiguration( 
name="pipy100", application="./examples/src/main/python/pi.py", - application_args=[100] + application_args=[10] ) job_configuration = aztk.spark.models.JobConfiguration( id=test_id+base_job_id, @@ -203,7 +203,7 @@ def test_get_application_log(): app1 = aztk.spark.models.ApplicationConfiguration( name="pipy100", application="./examples/src/main/python/pi.py", - application_args=[100] + application_args=[10] ) job_configuration = aztk.spark.models.JobConfiguration( id=test_id+base_job_id, @@ -239,7 +239,7 @@ def test_delete_job(): app1 = aztk.spark.models.ApplicationConfiguration( name="pipy100", application="./examples/src/main/python/pi.py", - application_args=[100] + application_args=[10] ) job_configuration = aztk.spark.models.JobConfiguration( id=test_id+base_job_id, @@ -271,5 +271,5 @@ def test_delete_job(): def clean_up_job(job_id): try: spark_client.job.delete(job_id) - except (BatchErrorException, AztkError): + except Exception: pass diff --git a/tests/integration_tests/spark/sdk/job/test_job_deprecated.py b/tests/integration_tests/spark/sdk/job/test_job_deprecated.py index 36e0542e..0a463283 100644 --- a/tests/integration_tests/spark/sdk/job/test_job_deprecated.py +++ b/tests/integration_tests/spark/sdk/job/test_job_deprecated.py @@ -296,5 +296,5 @@ def test_delete_job(): def clean_up_job(job_id): try: spark_client.delete_job(job_id) - except (BatchErrorException, AztkError): + except Exception: pass From 2d0700cd79e70e97fd419f2feb07e1dffb20b003 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Mon, 30 Jul 2018 10:35:46 -0700 Subject: [PATCH 42/52] fix models bugs --- aztk/spark/client/client.py | 9 +++++---- aztk/spark/helpers/get_log.py | 21 +++++++++++---------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/aztk/spark/client/client.py b/aztk/spark/client/client.py index d799f866..40c610a2 100644 --- a/aztk/spark/client/client.py +++ b/aztk/spark/client/client.py @@ -4,16 +4,17 @@ import aztk from aztk import error +from aztk import models as base_models from aztk.client import CoreClient from aztk.internal.cluster_data import NodeData from aztk.spark import models from aztk.spark.client.cluster import ClusterOperations from aztk.spark.client.job import JobOperations +from aztk.spark.helpers import cluster_diagnostic_helper from aztk.spark.helpers import create_cluster as create_cluster_helper from aztk.spark.helpers import get_log as get_log_helper from aztk.spark.helpers import job_submission as job_submit_helper from aztk.spark.helpers import submit as cluster_submit_helper -from aztk.spark.helpers import cluster_diagnostic_helper from aztk.spark.utils import util from aztk.utils import azure_api, deprecated, helpers @@ -94,10 +95,10 @@ def wait_until_applications_done(self, cluster_id: str): # NOT IMPLEMENTED @deprecated("0.10.0") def wait_until_cluster_is_ready(self, cluster_id: str): # NOT IMPLEMENTED try: - util.wait_for_master_to_be_ready(self, cluster_id) + util.wait_for_master_to_be_ready(self.cluster._core_cluster_operations, self.cluster, cluster_id) pool = self.batch_client.pool.get(cluster_id) nodes = self.batch_client.compute_node.list(pool_id=cluster_id) - return models.Cluster(pool, nodes) + return models.Cluster(base_models.Cluster(pool, nodes)) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) @@ -169,7 +170,7 @@ def cluster_ssh_into_master(self, password=None, port_forward_list=None, internal=False): - return self.cluster.ssh_into_node(cluster_id, node_id, username, ssh_key, password, 
port_forward_list, internal) + return self.cluster._core_cluster_operations.ssh_into_node(cluster_id, node_id, username, ssh_key, password, port_forward_list, internal) ''' job submission diff --git a/aztk/spark/helpers/get_log.py b/aztk/spark/helpers/get_log.py index 6444ea8a..032e64a3 100644 --- a/aztk/spark/helpers/get_log.py +++ b/aztk/spark/helpers/get_log.py @@ -1,13 +1,13 @@ import time -import azure.batch.models as batch_models + import azure +import azure.batch.models as batch_models import azure.batch.models.batch_error as batch_error from aztk import error -from aztk.utils import helpers -from aztk.utils import constants +from aztk import models as base_models from aztk.spark import models - +from aztk.utils import constants, helpers output_file = constants.TASK_WORKING_DIR + \ "/" + constants.SPARK_SUBMIT_LOGS_FILE @@ -53,14 +53,14 @@ def get_log_from_storage(blob_client, container_name, application_name, task): blob = blob_client.get_blob_to_text(container_name, application_name + '/' + constants.SPARK_SUBMIT_LOGS_FILE) except azure.common.AzureMissingResourceHttpError: raise error.AztkError("Logs not found in your storage account. They were either deleted or never existed.") - - return models.ApplicationLog( + base_model = base_models.ApplicationLog( name=application_name, cluster_id=container_name, application_state=task.state._value_, log=blob.content, total_bytes=blob.properties.content_length, - exit_code = task.execution_info.exit_code) + exit_code=task.execution_info.exit_code) + return models.ApplicationLog(base_model) def get_log(batch_client, blob_client, cluster_id: str, application_name: str, tail=False, current_bytes: int = 0): @@ -85,19 +85,20 @@ def get_log(batch_client, blob_client, cluster_id: str, application_name: str, t stream = batch_client.file.get_from_task( job_id, task_id, output_file, batch_models.FileGetFromTaskOptions(ocp_range=ocp_range)) content = helpers.read_stream_as_string(stream) - - return models.ApplicationLog( + base_model = base_models.ApplicationLog( name=application_name, cluster_id=cluster_id, application_state=task.state._value_, log=content, total_bytes=target_bytes, exit_code=task.execution_info.exit_code) + return models.ApplicationLog(base_model) else: - return models.ApplicationLog( + base_model = base_models.ApplicationLog( name=application_name, cluster_id=cluster_id, application_state=task.state._value_, log='', total_bytes=target_bytes, exit_code=task.execution_info.exit_code) + return models.ApplicationLog(base_model) From be31eac5f8db8a43611a0d13215f646e61b6ae8b Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Mon, 30 Jul 2018 11:06:32 -0700 Subject: [PATCH 43/52] fix create_user bug --- aztk/spark/client/cluster/operations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aztk/spark/client/cluster/operations.py b/aztk/spark/client/cluster/operations.py index 03c8e73e..568f0669 100644 --- a/aztk/spark/client/cluster/operations.py +++ b/aztk/spark/client/cluster/operations.py @@ -89,7 +89,7 @@ def create_user(self, id: str, username: str, password: str = None, ssh_key: str Returns: :obj:`None` """ - return create_user.create_user(self, id, username, ssh_key, password) + return create_user.create_user(self._core_cluster_operations, self, id, username, ssh_key, password) def get_application_status(self, id: str, application_name: str): """Get the status of a submitted application From f179b0dc2c064fb93eac725fd1fd1e450d6c72e7 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Mon, 30 Jul 2018 11:07:16 -0700 
Subject: [PATCH 44/52] update sdk_example.py --- examples/sdk/sdk_example.py | 105 +++++++++++++----------------------- 1 file changed, 38 insertions(+), 67 deletions(-) diff --git a/examples/sdk/sdk_example.py b/examples/sdk/sdk_example.py index 8359e30a..86065dcb 100644 --- a/examples/sdk/sdk_example.py +++ b/examples/sdk/sdk_example.py @@ -3,7 +3,7 @@ from aztk.error import AztkError # set your secrets -secrets_confg = aztk.spark.models.SecretsConfiguration( +secrets_configuration = aztk.spark.models.SecretsConfiguration( service_principal=aztk.spark.models.ServicePrincipalConfiguration( tenant_id=".onmicrosoft.com", client_id="", @@ -11,105 +11,76 @@ batch_account_resource_id="", storage_account_resource_id="", ), - ssh_pub_key="" -) + ssh_pub_key="") # set path to root of repository to reference files -ROOT_PATH = os.path.normpath(os.path.join(os.path.dirname(__file__), '..', '..')) +ROOT_PATH = os.path.abspath(os.path.normpath(os.path.join(os.path.dirname(__file__), '..', '..'))) # create a client -client = aztk.spark.Client(secrets_confg) +client = aztk.spark.Client(secrets_configuration) # list available clusters -clusters = client.list_clusters() - -# define a custom script -custom_script = aztk.spark.models.CustomScript( - name="simple.sh", - script=os.path.join(ROOT_PATH, 'custom-scripts', 'simple.sh'), - run_on="all-nodes") +clusters = client.cluster.list() # define spark configuration -spark_conf = aztk.spark.models.SparkConfiguration( - spark_defaults_conf=os.path.join(ROOT_PATH, 'config', 'spark-defaults.conf'), - spark_env_sh=os.path.join(ROOT_PATH, 'config', 'spark-env.sh'), - core_site_xml=os.path.join(ROOT_PATH, 'config', 'core-site.xml'), - jars=[os.path.join(ROOT_PATH, 'config', 'jars', jar) for jar in os.listdir(os.path.join(ROOT_PATH, 'config', 'jars'))] -) +configuration_file_path = os.path.join(ROOT_PATH, 'aztk_cli', 'config') +spark_configuration = aztk.spark.models.SparkConfiguration( + spark_defaults_conf=os.path.join(configuration_file_path, 'spark-defaults.conf'), + spark_env_sh=os.path.join(configuration_file_path, 'spark-env.sh'), + core_site_xml=os.path.join(configuration_file_path, 'core-site.xml'), + jars=[ + os.path.join(configuration_file_path, 'jars', jar) + for jar in os.listdir(os.path.join(configuration_file_path, 'jars')) + ]) # configure my cluster -cluster_config = aztk.spark.models.ClusterConfiguration( - cluster_id="sdk-test", +cluster_configuration = aztk.spark.models.ClusterConfiguration( + cluster_id="sdk-test1", toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), size=2, vm_size="standard_f2", - custom_scripts=[custom_script], - spark_configuration=spark_conf -) + spark_configuration=spark_configuration) # create a cluster, and wait until it is ready try: - cluster = client.create_cluster(cluster_config) - cluster = client.wait_until_cluster_is_ready(cluster.id) + cluster = client.cluster.create(cluster_configuration, wait=True) except AztkError as e: - print(e.message) - sys.exit() + raise e -# get details of specific cluster -cluster = client.get_cluster(cluster_config.cluster_id) +# get details of the cluster +cluster = client.cluster.get(cluster.id) # # create a user for the cluster -client.create_user(cluster.id, "sdk_example_user", "example_password") +client.cluster.create_user(cluster.id, "sdk_example_user", "example_password") -# create some apps to run +# define a Spark application to run app1 = aztk.spark.models.ApplicationConfiguration( name="pipy1", application=os.path.join(ROOT_PATH, 'examples', 'src', 'main', 'python', 
'pi.py'), - application_args="10" -) - -app2 = aztk.spark.models.ApplicationConfiguration( - name="pipy2", - application=os.path.join(ROOT_PATH, 'examples', 'src', 'main', 'python', 'pi.py'), - application_args="20" -) - -app3 = aztk.spark.models.ApplicationConfiguration( - name="pipy3", - application=os.path.join(ROOT_PATH, 'examples', 'src', 'main', 'python', 'pi.py'), - application_args="30" -) + application_args="10") -# submit an app and wait until it is finished running -client.submit(cluster.id, app1) -client.wait_until_application_done(cluster.id, app1.name) +# submit the application to the cluster +client.cluster.submit(cluster.id, app1) -# get logs for app, print to console -app1_logs = client.get_application_log(cluster_id=cluster_config.cluster_id, application_name=app1.name) -print(app1_logs.log) -# submit some other apps to the cluster in parallel -client.submit_all_applications(cluster.id, [app2, app3]) -# get status of app -status = client.get_application_status(cluster_config.cluster_id, app2.name) +# get status of application +status = client.cluster.get_application_status(cluster_configuration.cluster_id, app1.name) # stream logs of app, print to console as it runs current_bytes = 0 while True: - app2_logs = client.get_application_log( - cluster_id=cluster_config.cluster_id, - application_name=app2.name, - tail=True, - current_bytes=current_bytes) + app1_logs = client.get_application_log( + cluster_id=cluster_configuration.cluster_id, application_name=app1.name, tail=True, current_bytes=current_bytes) - print(app2_logs.log, end="") + print(app1_logs.log, end="") - if app2_logs.application_state == 'completed': + if app1_logs.application_state == 'completed': break - current_bytes = app2_logs.total_bytes + current_bytes = app1_logs.total_bytes time.sleep(1) -# wait until all jobs finish, then delete the cluster -client.wait_until_applications_done(cluster.id) +# alternatively, get entire log for application, print to console +app1_logs = client.get_application_log(cluster_id=cluster_configuration.cluster_id, application_name=app1.name) +print(app1_logs.log) + +# delete the cluster client.delete_cluster(cluster.id) From 19865af6d067a8c92fba355f23f66b014b3334a9 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Mon, 30 Jul 2018 11:29:56 -0700 Subject: [PATCH 45/52] fix create user argument issue --- aztk/client/base/helpers/create_user_on_cluster.py | 4 ++-- aztk/spark/client/cluster/helpers/create_user.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/aztk/client/base/helpers/create_user_on_cluster.py b/aztk/client/base/helpers/create_user_on_cluster.py index 4b308349..0764a509 100644 --- a/aztk/client/base/helpers/create_user_on_cluster.py +++ b/aztk/client/base/helpers/create_user_on_cluster.py @@ -2,10 +2,10 @@ #TODO: remove nodes param -def create_user_on_cluster(base_client, id, nodes, username, ssh_pub_key=None, password=None): +def create_user_on_cluster(base_operations, id, nodes, username, ssh_pub_key=None, password=None): with concurrent.futures.ThreadPoolExecutor() as executor: futures = { - executor.submit(base_client.create_user_on_node, id, node.id, username, ssh_pub_key, password): node + executor.submit(base_operations.create_user_on_node, id, node.id, username, ssh_pub_key, password): node for node in nodes } concurrent.futures.wait(futures) diff --git a/aztk/spark/client/cluster/helpers/create_user.py b/aztk/spark/client/cluster/helpers/create_user.py index 4cac446e..48ea22f6 100644 ---
a/aztk/spark/client/cluster/helpers/create_user.py +++ b/aztk/spark/client/cluster/helpers/create_user.py @@ -10,6 +10,6 @@ def create_user(core_cluster_operations, spark_cluster_operations, cluster_id: s master_node_id = cluster.master_node_id if not master_node_id: raise error.ClusterNotReadyError("The master has not yet been picked, a user cannot be added.") - core_cluster_operations.create_user_on_cluster(username, cluster.id, cluster.nodes, ssh_key, password) + core_cluster_operations.create_user_on_cluster(cluster.id, cluster.nodes, username, ssh_key, password) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) From 96f46ee259ca5bff8d80bdcee1e97b9e042d59d1 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Mon, 30 Jul 2018 11:30:09 -0700 Subject: [PATCH 46/52] update sdk_example.py --- examples/sdk/sdk_example.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/sdk/sdk_example.py b/examples/sdk/sdk_example.py index 86065dcb..f7f047d6 100644 --- a/examples/sdk/sdk_example.py +++ b/examples/sdk/sdk_example.py @@ -1,4 +1,7 @@ -import sys, os, time +import os +import sys +import time + import aztk.spark from aztk.error import AztkError @@ -35,7 +38,7 @@ # configure my cluster cluster_configuration = aztk.spark.models.ClusterConfiguration( - cluster_id="sdk-test1", + cluster_id="sdk-test", toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), size=2, vm_size="standard_f2", @@ -68,8 +71,8 @@ # stream logs of app, print to console as it runs current_bytes = 0 while True: - app1_logs = client.get_application_log( - cluster_id=cluster_configuration.cluster_id, application_name=app1.name, tail=True, current_bytes=current_bytes) + app1_logs = client.cluster.get_application_log( + id=cluster_configuration.cluster_id, application_name=app1.name, tail=True, current_bytes=current_bytes) print(app1_logs.log, end="") @@ -79,8 +82,7 @@ time.sleep(1) # alternatively, get entire log for application, print to console -app1_logs = client.get_application_log(cluster_id=cluster_configuration.cluster_id, application_name=app1.name) -print(app1_logs.log) +app1_logs = client.cluster.get_application_log(id=cluster_configuration.cluster_id, application_name=app1.name) # delete the cluster -client.delete_cluster(cluster.id) +client.cluster.delete(cluster.id) From 0a734a5663792bf174acbda966112a560f84f367 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Mon, 30 Jul 2018 12:52:41 -0700 Subject: [PATCH 47/52] update doc --- docs/sdk-examples.md | 132 ++++++++++++++++++------------------------- 1 file changed, 54 insertions(+), 78 deletions(-) diff --git a/docs/sdk-examples.md b/docs/sdk-examples.md index b60a1adc..24baba2e 100644 --- a/docs/sdk-examples.md +++ b/docs/sdk-examples.md @@ -5,25 +5,26 @@ You can get the values for this by either running the [Getting Started script](getting-started) or using [Batch Labs](https://github.com/Azure/BatchLabs) ```python - import sys, os, time - import aztk.spark - from aztk.error import AztkError - - # set your secrets - secrets_confg = aztk.spark.models.SecretsConfiguration( - service_principal=aztk.spark.models.ServicePrincipalConfiguration( - tenant_id=".onmicrosoft.com", - client_id="", - credential="", - batch_account_resource_id="", - storage_account_resource_id="", - ), - ssh_pub_key="" - ) - - - # create a client - client = aztk.spark.Client(secrets_confg) +import os +import sys +import time + +import aztk.spark +from aztk.error import AztkError + +# set your secrets 
+secrets_configuration = aztk.spark.models.SecretsConfiguration( + service_principal=aztk.spark.models.ServicePrincipalConfiguration( + tenant_id=".onmicrosoft.com", + client_id="", + credential="", + batch_account_resource_id="", + storage_account_resource_id="", + ), + ssh_pub_key="") + +# create a client +client = aztk.spark.Client(secrets_configuration) ``` @@ -31,79 +32,55 @@ You can get the values for this by either running the [Getting Started script](g ```python # list available clusters -clusters = client.list_clusters() +clusters = client.cluster.list() ``` ## Create a new cluster ```python -# define a custom script -plugins = [ - aztk.spark.models.plugins.JupyterPlugin(), -] - -# define spark configuration -spark_conf = aztk.spark.models.SparkConfiguration( - spark_defaults_conf=os.path.join(ROOT_PATH, 'config', 'spark-defaults.conf'), - spark_env_sh=os.path.join(ROOT_PATH, 'config', 'spark-env.sh'), - core_site_xml=os.path.join(ROOT_PATH, 'config', 'core-site.xml'), - jars=[os.path.join(ROOT_PATH, 'config', 'jars', jar) for jar in os.listdir(os.path.join(ROOT_PATH, 'config', 'jars'))] -) +configuration_file_path = "/path/to/spark/configuration/files" +spark_configuration = aztk.spark.models.SparkConfiguration( + spark_defaults_conf=os.path.join(configuration_file_path, 'spark-defaults.conf'), + spark_env_sh=os.path.join(configuration_file_path, 'spark-env.sh'), + core_site_xml=os.path.join(configuration_file_path, 'core-site.xml'), + jars=[ + os.path.join(configuration_file_path, 'jars', jar) + for jar in os.listdir(os.path.join(configuration_file_path, 'jars')) + ]) # configure my cluster -cluster_config = aztk.spark.models.ClusterConfiguration( +cluster_configuration = aztk.spark.models.ClusterConfiguration( cluster_id="sdk-test", toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"), - size_low_priority=2, + size=2, vm_size="standard_f2", - plugins=plugins, - spark_configuration=spark_conf -) + spark_configuration=spark_configuration) # create a cluster, and wait until it is ready try: - cluster = client.create_cluster(cluster_config) - cluster = client.wait_until_cluster_is_ready(cluster.id) + cluster = client.cluster.create(cluster_configuration, wait=True) except AztkError as e: - print(e.message) - sys.exit() + raise e ``` ## Get an existing cluster ```python - cluster = client.get_cluster(cluster_config.cluster_id) +# get details of the cluster +cluster = client.cluster.get(cluster.id) ``` ## Run an application on the cluster ```python - -# create some apps to run +# define a Spark application to run app1 = aztk.spark.models.ApplicationConfiguration( name="pipy1", application=os.path.join(ROOT_PATH, 'examples', 'src', 'main', 'python', 'pi.py'), - application_args="10" -) - -app2 = aztk.spark.models.ApplicationConfiguration( - name="pipy2", - application=os.path.join(ROOT_PATH, 'examples', 'src', 'main', 'python', 'pi.py'), - application_args="20" -) - -app3 = aztk.spark.models.ApplicationConfiguration( - name="pipy3", - application=os.path.join(ROOT_PATH, 'examples', 'src', 'main', 'python', 'pi.py'), - application_args="30" -) - -# submit an app and wait until it is finished running -client.submit(cluster.id, app1) -client.wait_until_application_done(cluster.id, app1.name) + application_args="10") -# submit some other apps to the cluster in parallel -client.submit_all_applications(cluster.id, [app2, app3]) +# submit the application to the cluster +client.cluster.submit(cluster.id, app1) ```
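The application snippet above assumes a `ROOT_PATH` variable pointing at the root of an aztk checkout, which this page never defines; a minimal sketch of one way to define it, mirroring the definition in `examples/sdk/sdk_example.py` (the two `'..'` segments assume the calling script sits two directories below the repository root):

```python
import os

# resolve the repository root relative to this script, as the SDK example does
ROOT_PATH = os.path.abspath(os.path.normpath(os.path.join(os.path.dirname(__file__), '..', '..')))
```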
## Get status of app ```python -status = client.get_application_status(cluster_config.cluster_id, app2.name) +# get status of application +status = client.cluster.get_application_status(cluster_configuration.cluster_id, app1.name) ``` ## Stream logs of app, print to console as it runs ```python - +# stream logs of app, print to console as it runs current_bytes = 0 while True: - app2_logs = client.get_application_log( - cluster_id=cluster_config.cluster_id, - application_name=app2.name, - tail=True, - current_bytes=current_bytes) + app1_logs = client.cluster.get_application_log( + id=cluster_configuration.cluster_id, application_name=app1.name, tail=True, current_bytes=current_bytes) - print(app2_logs.log, end="") + print(app1_logs.log, end="") - if app2_logs.application_state == 'completed': + if app1_logs.application_state == 'completed': break - current_bytes = app2_logs.total_bytes + current_bytes = app1_logs.total_bytes time.sleep(1) - -# wait until all jobs finish, then delete the cluster -client.wait_until_applications_done(cluster.id) -client.delete_cluster(cluster.id) +``` +## Delete the cluster +```python +# delete the cluster +client.cluster.delete(cluster.id) ``` From 159566aeeee835561d5046d7f9a130484df83137 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Wed, 1 Aug 2018 17:32:30 -0700 Subject: [PATCH 48/52] use Software model instead of string --- aztk/spark/client/cluster/helpers/create.py | 3 ++- aztk/spark/client/cluster/helpers/list.py | 3 ++- aztk/spark/client/job/helpers/submit.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/aztk/spark/client/cluster/helpers/create.py b/aztk/spark/client/cluster/helpers/create.py index 2f0c68f0..2fa30c71 100644 --- a/aztk/spark/client/cluster/helpers/create.py +++ b/aztk/spark/client/cluster/helpers/create.py @@ -2,6 +2,7 @@ import azure.batch.models.batch_error as batch_error from aztk import error +from aztk import models as base_models from aztk.internal.cluster_data import NodeData from aztk.spark import models from aztk.spark.utils import constants, util @@ -51,7 +52,7 @@ def create_cluster(core_cluster_operations, spark_cluster_operations, cluster_co cluster_conf.file_shares, cluster_conf.plugins, cluster_conf.mixed_mode(), cluster_conf.worker_on_master) - software_metadata_key = "spark" + software_metadata_key = base_models.Software.spark cluster = core_cluster_operations.create(cluster_conf, software_metadata_key, start_task, constants.SPARK_VM_IMAGE) diff --git a/aztk/spark/client/cluster/helpers/list.py b/aztk/spark/client/cluster/helpers/list.py index 8fff81fe..220e3cea 100644 --- a/aztk/spark/client/cluster/helpers/list.py +++ b/aztk/spark/client/cluster/helpers/list.py @@ -1,13 +1,14 @@ import azure.batch.models.batch_error as batch_error from aztk import error +from aztk import models as base_models from aztk.spark import models from aztk.utils import helpers def list_clusters(core_cluster_operations): try: - software_metadata_key = "spark" + software_metadata_key = base_models.Software.spark return [models.Cluster(cluster) for cluster in core_cluster_operations.list(software_metadata_key)] except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/helpers/submit.py b/aztk/spark/client/job/helpers/submit.py index f1341812..050431a4 100644 --- a/aztk/spark/client/job/helpers/submit.py +++ b/aztk/spark/client/job/helpers/submit.py @@ -3,6 +3,7 @@ import yaml from aztk import error +from aztk import
models as base_models from aztk.internal.cluster_data import NodeData from aztk.spark import models from aztk.utils import helpers @@ -88,7 +89,7 @@ def submit_job(core_job_operations, spark_job_operations, job_configuration: mod job_manager_task = generate_job_manager_task(core_job_operations, job_configuration, application_tasks) - software_metadata_key = "spark" + software_metadata_key = base_models.Software.spark vm_image = models.VmImage(publisher='Canonical', offer='UbuntuServer', sku='16.04') From 42fed0fb95f97a6e45d4fc214e2cf1c042f6673b Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Thu, 2 Aug 2018 16:01:59 -0700 Subject: [PATCH 49/52] add job wait flag, add cluster application wait functions --- .../cluster/helpers/wait_for_task_to_complete.py | 12 ++++++++++++ aztk/client/cluster/operations.py | 14 +++++++++++++- aztk/spark/client/client.py | 5 +++-- aztk/spark/client/cluster/helpers/wait.py | 10 ++++++++++ aztk/spark/client/cluster/operations.py | 14 +++++++++++++- aztk/spark/client/job/helpers/submit.py | 5 ++++- aztk/spark/client/job/operations.py | 7 +++---- aztk_cli/spark/endpoints/cluster/cluster_submit.py | 2 +- 8 files changed, 59 insertions(+), 10 deletions(-) create mode 100644 aztk/client/cluster/helpers/wait_for_task_to_complete.py create mode 100644 aztk/spark/client/cluster/helpers/wait.py diff --git a/aztk/client/cluster/helpers/wait_for_task_to_complete.py b/aztk/client/cluster/helpers/wait_for_task_to_complete.py new file mode 100644 index 00000000..db84886a --- /dev/null +++ b/aztk/client/cluster/helpers/wait_for_task_to_complete.py @@ -0,0 +1,12 @@ +import time + +import azure.batch.models as batch_models + + +def wait_for_task_to_complete(core_cluster_operations, job_id: str, task_id: str): + while True: + task = core_cluster_operations.batch_client.task.get(job_id=job_id, task_id=task_id) + if task.state != batch_models.TaskState.completed: + time.sleep(2) + else: + return diff --git a/aztk/client/cluster/operations.py b/aztk/client/cluster/operations.py index 7d25e980..5ba831c2 100644 --- a/aztk/client/cluster/operations.py +++ b/aztk/client/cluster/operations.py @@ -1,7 +1,7 @@ from aztk.client.base import BaseOperations from aztk.models import ClusterConfiguration -from .helpers import copy, create, delete, get, list +from .helpers import copy, create, delete, get, list, wait_for_task_to_complete class CoreClusterOperations(BaseOperations): @@ -80,3 +80,15 @@ def list(self, software_metadata_key): :obj:`List[aztk.models.Cluster]`: list of clusters running the software defined by software_metadata_key """ return list.list_clusters(self, software_metadata_key) + + def wait(self, id, task_name): + """Wait until the task has completed + + Args: + id (:obj:`str`): the id of the job the task was submitted to + task_name (:obj:`str`): the name of the task to wait for + + Returns: + :obj:`None` + """ + return wait_for_task_to_complete.wait_for_task_to_complete(self, id, task_name) diff --git a/aztk/spark/client/client.py b/aztk/spark/client/client.py index 40c610a2..9c690b66 100644 --- a/aztk/spark/client/client.py +++ b/aztk/spark/client/client.py @@ -16,7 +16,7 @@ from aztk.spark.helpers import job_submission as job_submit_helper from aztk.spark.helpers import submit as cluster_submit_helper from aztk.spark.utils import util -from aztk.utils import azure_api, deprecated, helpers +from aztk.utils import azure_api, deprecated, deprecate, helpers class Client(CoreClient): @@ -30,7 +30,8 @@ def __init__(self, secrets_configuration: models.SecretsConfiguration = 
None, ** self.secrets_configuration = None context = None if kwargs.get("secrets_config"): - # TODO: add deprecated warning + deprecate(version="0.10.0", message="secrets_config key is deprecated in secrets.yaml", + advice="Please use secrets_configuration key instead.") context = self._get_context(kwargs.get("secrets_config")) else: context = self._get_context(secrets_configuration) diff --git a/aztk/spark/client/cluster/helpers/wait.py b/aztk/spark/client/cluster/helpers/wait.py new file mode 100644 index 00000000..5d9e3cff --- /dev/null +++ b/aztk/spark/client/cluster/helpers/wait.py @@ -0,0 +1,10 @@ +import azure.batch.models.batch_error as batch_error + +from aztk import error +from aztk.utils import helpers + +def wait_for_application_to_complete(core_cluster_operations, id, application_name): + try: + return core_cluster_operations.wait(id, application_name) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/cluster/operations.py b/aztk/spark/client/cluster/operations.py index 568f0669..75bde904 100644 --- a/aztk/spark/client/cluster/operations.py +++ b/aztk/spark/client/cluster/operations.py @@ -3,7 +3,7 @@ from aztk.spark.client.base import SparkBaseOperations from .helpers import (copy, create, create_user, delete, diagnostics, download, get, get_application_log, - get_application_status, get_remote_login_settings, list, node_run, run, submit) + get_application_status, get_remote_login_settings, list, node_run, run, submit, wait) class ClusterOperations(SparkBaseOperations): @@ -234,3 +234,15 @@ def get_remote_login_settings(self, id: str, node_id: str): :obj:`aztk.spark.models.RemoteLogin`: Object that contains the ip address and port combination to login to a node """ return get_remote_login_settings.get_remote_login_settings(self._core_cluster_operations, id, node_id) + + def wait(self, id: str, application_name: str): + """Wait until the application has completed + + Args: + id (:obj:`str`): the id of the cluster the application was submitted to + application_name (:obj:`str`): the name of the application to wait for + + Returns: + :obj:`None` + """ + return wait.wait_for_application_to_complete(self._core_cluster_operations, id, application_name) diff --git a/aztk/spark/client/job/helpers/submit.py b/aztk/spark/client/job/helpers/submit.py index 050431a4..09480c53 100644 --- a/aztk/spark/client/job/helpers/submit.py +++ b/aztk/spark/client/job/helpers/submit.py @@ -64,7 +64,7 @@ def _apply_default_for_job_config(job_conf: models.JobConfiguration): return job_conf -def submit_job(core_job_operations, spark_job_operations, job_configuration: models.JobConfiguration): +def submit_job(core_job_operations, spark_job_operations, job_configuration: models.JobConfiguration, wait: bool = False): try: job_configuration = _apply_default_for_job_config(job_configuration) job_configuration.validate() @@ -106,6 +106,9 @@ def submit_job(core_job_operations, spark_job_operations, job_configuration: mod software_metadata_key=software_metadata_key, vm_image_model=vm_image, application_metadata='\n'.join(application.name for application in (job_configuration.applications or []))) + + if wait: + spark_job_operations.wait(id=job_configuration.id) return models.Job(job) diff --git a/aztk/spark/client/job/operations.py b/aztk/spark/client/job/operations.py index a9a1d1b7..d9693ea9 100644 --- a/aztk/spark/client/job/operations.py +++ b/aztk/spark/client/job/operations.py @@ -107,7 +107,7 @@ def 
stop_application(self, id, application_name): """ return stop_application.stop_app(self._core_job_operations, id, application_name) - def submit(self, job_configuration: models.JobConfiguration): + def submit(self, job_configuration: models.JobConfiguration, wait: bool = False): """Submit a job Jobs are a cluster definition and one or many application definitions which run on the cluster. The job's @@ -120,8 +120,7 @@ def submit(self, job_configuration: models.JobConfiguration, wait: bool = False) Returns: :obj:`aztk.spark.models.Job`: Model representing the state of the job. """ - return submit.submit_job(self._core_job_operations, self, job_configuration) + return submit.submit_job(self._core_job_operations, self, job_configuration, wait) - #TODO: rename to something better or make this a parameter of submit - def wait_until_job_finished(self, id): + def wait(self, id): wait_until_complete.wait_until_job_finished(self._core_job_operations, id) diff --git a/aztk_cli/spark/endpoints/cluster/cluster_submit.py b/aztk_cli/spark/endpoints/cluster/cluster_submit.py index feeb9f56..927d7571 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_submit.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_submit.py @@ -162,7 +162,7 @@ def execute(args: typing.NamedTuple): exit_code = utils.stream_logs(client=spark_client, cluster_id=args.cluster_id, application_name=args.name) else: with utils.Spinner(): - spark_client.cluster.wait_until_application_done(cluster_id=args.cluster_id, task_id=args.name) # TODO: replace wait_until_application_done + spark_client.cluster.wait(id=args.cluster_id, application_name=args.name) application_log = spark_client.cluster.get_application_log(id=args.cluster_id, application_name=args.name) with open(os.path.abspath(os.path.expanduser(args.output)), "w", encoding="UTF-8") as f: f.write(application_log.log)
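The wait helpers added above give SDK callers two blocking patterns; a minimal sketch of both (the `client`, `job_configuration`, `cluster`, and `app1` objects are placeholders built as in the earlier SDK example, not part of this patch):

```python
# job-level: the new wait flag blocks until the whole job has completed
job = client.job.submit(job_configuration=job_configuration, wait=True)

# cluster-level: submit stays asynchronous, so block on the application explicitly
client.cluster.submit(cluster.id, app1)
client.cluster.wait(id=cluster.id, application_name=app1.name)
```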
From ecfda506d2d4508afeed9bba66fe479aad8fca9a Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Thu, 2 Aug 2018 16:19:30 -0700 Subject: [PATCH 50/52] add docs for wait, update tests --- aztk/spark/client/client.py | 2 +- aztk/spark/client/job/operations.py | 8 +++++++ .../spark/sdk/job/test_job.py | 21 +++++++------------ 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/aztk/spark/client/client.py b/aztk/spark/client/client.py index 9c690b66..db9e7d34 100644 --- a/aztk/spark/client/client.py +++ b/aztk/spark/client/client.py @@ -219,7 +219,7 @@ def stop_job_app(self, job_id, application_name): # NOT IMPLEMENTED @deprecated("0.10.0") def wait_until_job_finished(self, job_id): try: - job_submit_helper.wait_until_job_finished(self, job_id) + self.job.wait(self, job_id) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/client/job/operations.py b/aztk/spark/client/job/operations.py index d9693ea9..c639795c 100644 --- a/aztk/spark/client/job/operations.py +++ b/aztk/spark/client/job/operations.py @@ -116,6 +116,7 @@ def submit(self, job_configuration: models.JobConfiguration, wait: bool = False) Args: job_configuration (:obj:`aztk.spark.models.JobConfiguration`): Model defining the job's configuration. + wait (:obj:`bool`): If True, blocks until job is completed. Defaults to False. Returns: :obj:`aztk.spark.models.Job`: Model representing the state of the job. @@ -123,4 +124,11 @@ def submit(self, job_configuration: models.JobConfiguration, wait: bool = False) return submit.submit_job(self._core_job_operations, self, job_configuration, wait) def wait(self, id): + """Wait until the job has completed. + Args: + id (:obj:`str`): the id of the job to wait for + + Returns: + :obj:`None` + """ wait_until_complete.wait_until_job_finished(self._core_job_operations, id) diff --git a/tests/integration_tests/spark/sdk/job/test_job.py b/tests/integration_tests/spark/sdk/job/test_job.py index 00d04b62..b39bbbb6 100644 --- a/tests/integration_tests/spark/sdk/job/test_job.py +++ b/tests/integration_tests/spark/sdk/job/test_job.py @@ -36,8 +36,7 @@ def test_submit_job(): max_low_pri_nodes=0 ) try: - job = spark_client.job.submit(job_configuration=job_configuration) - spark_client.job.wait_until_job_finished(id=job_configuration.id) + job = spark_client.job.submit(job_configuration=job_configuration, wait=True) assert job.id == job_configuration.id assert job.state is not None @@ -73,8 +72,7 @@ def test_list_jobs(): worker_on_master=True ) try: - spark_client.job.submit(job_configuration=job_configuration) - spark_client.job.wait_until_job_finished(job_configuration.id) + spark_client.job.submit(job_configuration=job_configuration, wait=True) jobs = spark_client.job.list() @@ -111,8 +109,7 @@ def test_list_applications(): max_low_pri_nodes=0 ) try: - spark_client.job.submit(job_configuration=job_configuration) - spark_client.job.wait_until_job_finished(job_configuration.id) + spark_client.job.submit(job_configuration=job_configuration, wait=True) applications = spark_client.job.list_applications(id=job_configuration.id) @@ -152,8 +149,7 @@ def test_get_job(): worker_on_master=True ) try: - spark_client.job.submit(job_configuration=job_configuration) - spark_client.job.wait_until_job_finished(job_configuration.id) + spark_client.job.submit(job_configuration=job_configuration, wait=True) job = spark_client.job.get(id=job_configuration.id) assert job.id == job_configuration.id @@ -185,8 +181,7 @@ def test_get_application(): max_low_pri_nodes=0 ) try: - spark_client.job.submit(job_configuration=job_configuration) - spark_client.job.wait_until_job_finished(job_configuration.id) + spark_client.job.submit(job_configuration=job_configuration, wait=True) application = spark_client.job.get_application(id=job_configuration.id, application_name=app1.name) assert isinstance(application, aztk.spark.models.Application) assert application.exit_code == 0 @@ -216,8 +211,7 @@ def test_get_application_log(): max_low_pri_nodes=0 ) try: - spark_client.job.submit(job_configuration=job_configuration) - spark_client.job.wait_until_job_finished(job_configuration.id) + spark_client.job.submit(job_configuration=job_configuration, wait=True) application_log = spark_client.job.get_application_log(id=job_configuration.id, application_name=app1.name) @@ -253,8 +247,7 @@ def test_delete_job(): worker_on_master=True ) try: - spark_client.job.submit(job_configuration=job_configuration) - spark_client.job.wait_until_job_finished(job_configuration.id) + spark_client.job.submit(job_configuration=job_configuration, wait=True) spark_client.job.delete(job_configuration.id) assert job_configuration.id not in spark_client.job.list() try: From a27e3718497a91f6f0ca2f56d423f05f3ba415be Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Thu, 2 Aug 2018 16:54:43 -0700 Subject: [PATCH 51/52] fix bug --- aztk/spark/client/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git
a/aztk/spark/client/client.py b/aztk/spark/client/client.py index db9e7d34..8db8c349 100644 --- a/aztk/spark/client/client.py +++ b/aztk/spark/client/client.py @@ -219,7 +219,7 @@ def stop_job_app(self, job_id, application_name): # NOT IMPLEMENTED @deprecated("0.10.0") def wait_until_job_finished(self, job_id): try: - self.job.wait(self, job_id) + self.job.wait(job_id) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) From 6c6767cd857efda1ec1fd16d048835b396f16062 Mon Sep 17 00:00:00 2001 From: Jake Freck Date: Fri, 3 Aug 2018 12:12:05 -0700 Subject: [PATCH 52/52] add clientrequesterror catch to fix tests --- aztk/node_scripts/install/pick_master.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/aztk/node_scripts/install/pick_master.py b/aztk/node_scripts/install/pick_master.py index 66cb8909..fecd104f 100644 --- a/aztk/node_scripts/install/pick_master.py +++ b/aztk/node_scripts/install/pick_master.py @@ -1,10 +1,11 @@ """ This is the code that all nodes will run in their start task to try to allocate the master """ - import azure.batch.batch_service_client as batch import azure.batch.models as batchmodels import azure.batch.models.batch_error as batcherror +from msrest.exceptions import ClientRequestError + from core import config MASTER_NODE_METADATA_KEY = "_spark_master_node" @@ -36,7 +37,7 @@ def try_assign_self_as_master(client: batch.BatchServiceClient, pool: batchmodel if_match=pool.e_tag, )) return True - except batcherror.BatchErrorException: + except (batcherror.BatchErrorException, ClientRequestError): print("Couldn't assign itself as master of the pool because the pool was modified since the last get.") return False
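For context on this final patch: `try_assign_self_as_master` races every node in the pool to write the `_spark_master_node` metadata entry, passing the pool's `e_tag` as an optimistic-concurrency check, so a lost race (`BatchErrorException`) and a transient network failure (`ClientRequestError`) are now both treated as a failed attempt rather than a crash of the start task. A minimal sketch of the retry loop a caller might wrap around it (the `get_master_node_id` helper, `config.pool_id`, and the attempt count are assumptions, not shown in this patch):

```python
def find_master(client: batch.BatchServiceClient) -> bool:
    for _ in range(5):
        # re-fetch the pool on every attempt so the next if_match uses a fresh e_tag
        pool = client.pool.get(config.pool_id)
        master = get_master_node_id(pool)    # assumed helper reading MASTER_NODE_METADATA_KEY
        if master:
            return True    # another node already won the election
        if try_assign_self_as_master(client, pool):
            return True    # this node won the e_tag race
    return False
```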