Feature: Readthedocs support (#497)
timotheeguerin authored Apr 26, 2018
1 parent a00dbb7 commit e361c3b
Showing 34 changed files with 626 additions and 850 deletions.
1 change: 1 addition & 0 deletions .editorconfig
@@ -3,6 +3,7 @@ indent_style = space
indent_size = 4
insert_final_newline = true
trim_trailing_whitespace = true
end_of_line = lf

[*.{json,yml,yaml}]
indent_size = 2
4 changes: 4 additions & 0 deletions .gitignore
@@ -39,3 +39,7 @@ tmp/

# PyTest
.cache/


# Built docs
docs/_build/
1 change: 0 additions & 1 deletion .vscode/settings.json
@@ -15,6 +15,5 @@
"python.formatting.yapfArgs": [
"--style=.style.yapf"
],
"python.venvPath": "${workspaceFolder}/ENV",
"python.pythonPath": "${workspaceFolder}\\ENV\\Scripts\\python.exe"
}
2 changes: 1 addition & 1 deletion aztk/client.py
@@ -61,7 +61,7 @@ def __delete_pool_and_job(self, pool_id: str, keep_logs: bool = False):

if pool_exists:
self.batch_client.pool.delete(pool_id)

if not keep_logs:
cluster_data = self._get_cluster_data(pool_id)
cluster_data.delete_container(pool_id)
4 changes: 1 addition & 3 deletions aztk/error.py
@@ -4,10 +4,8 @@
"""



class AztkError(Exception):
def __init__(self, message: str=None):
super().__init__(message)
pass

class ClusterNotReadyError(AztkError):
pass
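The cleanup above keeps `AztkError` as the common base class. Below is a short, hedged sketch of how such a hierarchy is typically consumed; the `wait_for_cluster` helper is hypothetical, and only the two exception classes come from the diff.

```python
from aztk.error import AztkError, ClusterNotReadyError

def wait_for_cluster(get_state) -> None:
    # get_state is a caller-supplied callable returning the cluster state;
    # raising ClusterNotReadyError here is purely illustrative.
    if get_state() != "ready":
        raise ClusterNotReadyError("cluster is still provisioning")

try:
    wait_for_cluster(lambda: "provisioning")
except ClusterNotReadyError as error:
    print(f"not ready yet: {error}")
except AztkError as error:
    # ClusterNotReadyError subclasses AztkError, so a broad aztk handler still catches it
    print(f"aztk failure: {error}")
```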
6 changes: 3 additions & 3 deletions aztk/internal/cluster_data/__init__.py
@@ -1,3 +1,3 @@
from .blob_data import *
from .node_data import *
from .cluster_data import *
from .blob_data import BlobData
from .node_data import NodeData
from .cluster_data import ClusterData
5 changes: 2 additions & 3 deletions aztk/internal/cluster_data/cluster_data.py
@@ -1,7 +1,6 @@
import yaml
import logging
import yaml
import azure.common
from azure.storage.blob import BlockBlobService
from .node_data import NodeData
from .blob_data import BlobData

@@ -15,7 +14,7 @@ class ClusterData:
APPLICATIONS_DIR = "applications"
CLUSTER_CONFIG_FILE = "config.yaml"

def __init__(self, blob_client: BlockBlobService, cluster_id: str):
def __init__(self, blob_client, cluster_id: str):
self.blob_client = blob_client
self.cluster_id = cluster_id
self._ensure_container()
9 changes: 6 additions & 3 deletions aztk/internal/cluster_data/node_data.py
@@ -5,7 +5,7 @@
from pathlib import Path
from typing import List
import yaml
from aztk.spark import models
from aztk import models
from aztk.utils import constants, file_utils, secure_utils
from aztk.error import InvalidCustomScriptError

@@ -61,10 +61,12 @@ def add_files(self, file_paths: List[str], zip_dir, binary: bool = True):
for file in file_paths:
self.add_file(file, zip_dir, binary)

def add_dir(self, path: str, dest: str = None, exclude: List[str] = []):
def add_dir(self, path: str, dest: str = None, exclude: List[str] = None):
"""
Zip all the files in the given directory into the zip file handler
"""
exclude = exclude or []

for base, _, files in os.walk(path):
relative_folder = os.path.relpath(base, path)
for file in files:
@@ -156,7 +158,8 @@ def _add_plugins(self):
def _add_node_scripts(self):
self.add_dir(os.path.join(ROOT_PATH, NODE_SCRIPT_FOLDER), NODE_SCRIPT_FOLDER, exclude=['*.pyc*'])

def _includeFile(self, filename: str, exclude: List[str] = []) -> bool:
def _includeFile(self, filename: str, exclude: List[str]) -> bool:
exclude = exclude or []
for pattern in exclude:
if fnmatch.fnmatch(filename, pattern):
return False
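The change from `exclude: List[str] = []` to `exclude: List[str] = None` (paired with `exclude = exclude or []`) avoids Python's shared-mutable-default pitfall. A minimal standalone sketch of that pattern follows; the function names are hypothetical, not the actual aztk code.

```python
from typing import List, Optional

def add_patterns_bad(pattern: str, exclude: List[str] = []) -> List[str]:
    # The default list is created once, at function-definition time, so every
    # call that omits `exclude` mutates the same shared object.
    exclude.append(pattern)
    return exclude

def add_patterns_good(pattern: str, exclude: Optional[List[str]] = None) -> List[str]:
    # `exclude = exclude or []` builds a fresh list per call, as in the diff above.
    exclude = exclude or []
    exclude.append(pattern)
    return exclude

print(add_patterns_bad("*.pyc"))    # ['*.pyc']
print(add_patterns_bad("*.log"))    # ['*.pyc', '*.log']  <- state leaked from the first call
print(add_patterns_good("*.pyc"))   # ['*.pyc']
print(add_patterns_good("*.log"))   # ['*.log']           <- each call is independent
```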
1 change: 0 additions & 1 deletion aztk/spark/__init__.py
@@ -1,2 +1 @@
from .models import *
from .client import Client
20 changes: 17 additions & 3 deletions aztk/spark/client.py
@@ -15,13 +15,27 @@


class Client(BaseClient):
"""
Aztk Spark Client
This is the main entry point for using aztk for spark
Args:
secrets_config(aztk.spark.models.models.SecretsConfiguration): Configuration with all the needed credentials
"""
def __init__(self, secrets_config):
super().__init__(secrets_config)

'''
Spark client public interface
'''
def create_cluster(self, cluster_conf: models.ClusterConfiguration, wait: bool = False):
"""
Create a new aztk spark cluster
Args:
cluster_conf(aztk.spark.models.models.ClusterConfiguration): Configuration for the cluster to be created
wait(bool): Whether to wait for the cluster to be ready before returning
Returns:
aztk.spark.models.Cluster
"""
cluster_conf.validate()
cluster_data = self._get_cluster_data(cluster_conf.cluster_id)
try:
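The new docstrings document `Client` and `create_cluster` as the public entry point. The following is a hedged usage sketch built only on the signatures visible above; the `SecretsConfiguration` and `ClusterConfiguration` constructor arguments are illustrative assumptions, and real credentials would come from `.aztk/secrets.yaml`.

```python
from aztk.spark import Client, models

# Placeholder secrets object; in practice the fields mirror .aztk/secrets.yaml
# (service principal or shared keys) and the exact keyword names are assumptions.
secrets = models.SecretsConfiguration()

client = Client(secrets)

# Keyword names below are illustrative assumptions about ClusterConfiguration.
cluster_conf = models.ClusterConfiguration(
    cluster_id="example-cluster",
    vm_count=2,
    vm_size="standard_d2_v2",
)

cluster_conf.validate()  # create_cluster() calls this too, per the diff above
cluster = client.create_cluster(cluster_conf, wait=True)
print(cluster.id)  # Cluster.id is assumed to echo the cluster_id passed in
```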
2 changes: 1 addition & 1 deletion aztk/spark/models/__init__.py
@@ -1 +1 @@
from .models import *
from .models import *
12 changes: 6 additions & 6 deletions aztk/spark/models/plugins/__init__.py
@@ -1,6 +1,6 @@
from .hdfs import *
from .jupyter import *
from .jupyter_lab import *
from .rstudio_server import *
from .simple import *
from .spark_ui_proxy import *
from .hdfs import HDFSPlugin
from .jupyter import JupyterPlugin
from .jupyter_lab import JupyterLabPlugin
from .rstudio_server import RStudioServerPlugin
from .simple import SimplePlugin
from .spark_ui_proxy import SparkUIProxyPlugin
27 changes: 12 additions & 15 deletions docs/00-getting-started.md
@@ -1,14 +1,11 @@
# Azure Distributed Data Engineering Toolkit
The Azure Distributed Data Engineering Toolkit is a project that allows Spark users to easily spin up a Spark cluster in Azure.

## Getting Started
# Getting Started
The minimum requirements to get started with this package are:
- Python 3.5+, pip 9.0.1+
- An Azure account
- An Azure Batch account
- An Azure Storage account

### Cloning and installing the project
## Cloning and installing the project
1. Clone the repo
2. Make sure you are running python 3.5 or greater.
_If the default version on your machine is python 2 make sure to run the following commands with **pip3** instead of **pip**._
@@ -40,12 +37,12 @@ The minimum requirements to get started with this package are:
This will put default configuration files in your home directory, *~/*. Please note that configuration files in your current working directory will take precedence over global configuration files in your home directory.
### Setting up your accounts
## Setting up your accounts
#### Using the account setup script
A script to create and configure the Azure resources required to use `aztk` is provided. For more information and usage, see [Getting Started Script](./01-getting-started-script.md).
### Using the account setup script
A script to create and configure the Azure resources required to use `aztk` is provided. For more information and usage, see [Getting Started Script](01-getting-started-script.html).
#### Manual resource creation
### Manual resource creation
To finish setting up, you need to fill out your Azure Batch and Azure Storage secrets in *.aztk/secrets.yaml*. We'd also recommend that you enter SSH key info in this file too.
Please note that if you use ssh keys and have a non-standard ssh key file name or path, you will need to specify the location of your ssh public and private keys. To do so, set them as shown below:
@@ -64,7 +61,7 @@ If you do not already have an Azure account, go to [https://azure.microsoft.com/
Once you have one, simply log in and go to the [Azure Portal](https://portal.azure.com) to start creating your Azure Batch account and Azure Storage account.


##### Using AAD
#### Using AAD
To get the required keys for your Azure Active Directory (AAD) Service Principal, Azure Batch Account and Azure Storage Account, please follow these instructions. Note that this is the recommended path for use with AZTK, as some features require AAD and are disabled if using Shared Key authentication.

1. Register an Azure Active Directory (AAD) Application
@@ -135,7 +132,7 @@ service_principal:
storage_account_resource_id: </storage/account/resource/id>
```

#### Using Shared Keys
### Using Shared Keys
_Please note that using Shared Keys prevents the use of certain AZTK features including Mixed Mode clusters and support for VNETs._

To get the required keys for Azure Batch and Azure Storage, please follow the below instructions:
@@ -167,19 +164,19 @@ To get the required keys for Azure Batch and Azure Storage, please follow the below instructions:
- Go to the accounts in the Azure portal and copy and paste the account names, keys and other information needed into the
secrets file.

#### Storage account
### Storage account

For the Storage account, copy the name and one of the two keys:

![](./misc/Storage_secrets.png)

#### Batch account
### Batch account

For the Batch account, copy the name, the url and one of the two keys:

![](./misc/Batch_secrets.png)


## Next Steps
- [Create a cluster](./10-clusters.md)
- [Run a Spark job](./20-spark-submit.md)
- [Create a cluster](10-clusters.html)
- [Run a Spark job](./20-spark-submit.html)
4 changes: 2 additions & 2 deletions docs/01-getting-started-script.md
@@ -9,7 +9,7 @@ The script will create and configure the following resources:
- Azure Active Directory application and service principal
<!-- - Virtual network with a configured subnet -->

The script outputs all of the necessary information to use `aztk`, just copy the output into the `.aztk/secrets.yaml` file created when running `aztk spark init`.
The script outputs all of the necessary information to use `aztk`, just copy the output into the `.aztk/secrets.yaml` file created when running `aztk spark init`.

## Usage
Copy and paste the following into an [Azure Cloud Shell](https://shell.azure.com):
@@ -41,4 +41,4 @@ service_principal:

Copy the entire `service_principal` section into your `.aztk/secrets.yaml`. If you do not have a `secrets.yaml` file, you can create one in your current working directory by running `aztk spark init`.

Now you are ready to create your first `aztk` cluster. See [Creating a Cluster](./10-clusters.md#creating-a-cluster).
Now you are ready to create your first `aztk` cluster. See [Creating a Cluster](./10-clusters.html#creating-a-cluster).
12 changes: 6 additions & 6 deletions docs/10-clusters.md
@@ -4,7 +4,7 @@ In the Azure Distributed Data Engineering Toolkit, a cluster is primarily design
## Creating a Cluster
Creating a Spark cluster only takes a few simple steps after which you will be able to SSH into the master node of the cluster and interact with Spark. You will be able to view the Spark Web UI, Spark Jobs UI, submit Spark jobs (with *spark-submit*), and even interact with Spark in a Jupyter notebook.

For the advanced user, please note that the default cluster settings are preconfigured in the *.aztk/cluster.yaml* file that is generated when you run `aztk spark init`. More information on cluster config [here.](./13-configuration.md)
For the advanced user, please note that the default cluster settings are preconfigured in the *.aztk/cluster.yaml* file that is generated when you run `aztk spark init`. More information on cluster config [here.](./13-configuration.html)

### Commands
Create a Spark cluster:
@@ -33,7 +33,7 @@ You can create clusters with a mix of [low-priority](https://docs.microsoft.co
Please note, to use Mixed Mode clusters, you need to authenticate using Azure Active Directory (AAD) by configuring the Service Principal in `.aztk/secrets.yaml`. You also need to create a [Virtual Network \(VNET\)](https://azure.microsoft.com/en-us/services/virtual-network/), and provide the resource ID of a Subnet within the VNET in your `.aztk/cluster.yaml` configuration file.

#### Setting your Spark and/or Python versions
By default, the Azure Distributed Data Engineering Toolkit will use **Spark v2.2.0** and **Python v3.5.4**. However, you can set your Spark and/or Python versions by [configuring the base Docker image used by this package](./12-docker-image.md).
By default, the Azure Distributed Data Engineering Toolkit will use **Spark v2.2.0** and **Python v3.5.4**. However, you can set your Spark and/or Python versions by [configuring the base Docker image used by this package](./12-docker-image.html).

### Listing clusters
You can list all clusters currently running in your account by running
@@ -161,9 +161,9 @@ __Please be careful sharing the output of the `debug` command as secrets and app


### Interact with your Spark cluster
By default, the `aztk spark cluster ssh` command port forwards the Spark Web UI to *localhost:8080*, Spark Jobs UI to *localhost:4040*, and Spark History Server to *localhost:18080*. This can be [configured in *.aztk/ssh.yaml*](../docs/13-configuration.md##sshyaml).
By default, the `aztk spark cluster ssh` command port forwards the Spark Web UI to *localhost:8080*, Spark Jobs UI to *localhost:4040*, and Spark History Server to *localhost:18080*. This can be [configured in *.aztk/ssh.yaml*](../docs/13-configuration.html#sshyaml).

## Next Steps
- [Run a Spark job](./20-spark-submit.md)
- [Configure the Spark cluster using custom commands](./11-custom-scripts.md)
- [Bring your own Docker image or choose between a variety of our supported base images to manage your Spark and Python versions](./12-docker-image.md)
- [Run a Spark job](20-spark-submit.html)
- [Configure the Spark cluster using custom commands](11-custom-scripts.html)
- [Bring your own Docker image or choose between a variety of our supported base images to manage your Spark and Python versions](12-docker-image.html)
12 changes: 6 additions & 6 deletions docs/13-configuration.md
@@ -22,7 +22,7 @@ size: 2
# username: <username for the linux user to be created> (optional)
username: spark

# docker_repo: <name of docker image repo (for more information, see https://github.com/Azure/aztk/blob/master/docs/12-docker-image.md)>
# docker_repo: <name of docker image repo (for more information, see https://github.com/Azure/aztk/blob/master/docs/12-docker-image.html)>
docker_repo: aztk/base:spark2.2.0

# custom_script: <path to custom script to run on each node> (optional)
@@ -102,14 +102,14 @@ spark.eventLog.dir <path>
spark.history.fs.logDirectory <path>
```

Please note that the path for `spark.eventLog.dir` and `spark.history.fs.logDirectory` should most likely match so that the history server reads the logs that each Spark job writes. Also note that while the paths can be local (`file:/`), it is recommended that the paths be accessible by every node in the cluster so that the history server, which runs on the Spark master node, has access to all application logs. HDFS, WASB, ADL, or any other Hadoop API compliant storage system may be used.
Please note that the path for `spark.eventLog.dir` and `spark.history.fs.logDirectory` should most likely match so that the history server reads the logs that each Spark job writes. Also note that while the paths can be local (`file:/`), it is recommended that the paths be accessible by every node in the cluster so that the history server, which runs on the Spark master node, has access to all application logs. HDFS, WASB, ADL, or any other Hadoop API compliant storage system may be used.

If using WASB, ADL or other cloud storage services, be sure to set your keys in `.aztk/core-site.xml`. For more information, see the [Cloud Storage](./30-cloud-storage.md) documentation.
If using WASB, ADL or other cloud storage services, be sure to set your keys in `.aztk/core-site.xml`. For more information, see the [Cloud Storage](./30-cloud-storage.html) documentation.


## Configuring Spark Storage

The Spark cluster can be configured to use different cloud-supported storage offerings (such as Azure Storage Blobs, Azure Data Lake Storage, or any other supported Spark file system). More information can be found in the [Cloud Storage](./30-cloud-storage.md) documentation.
The Spark cluster can be configured to use different cloud-supported storage offerings (such as Azure Storage Blobs, Azure Data Lake Storage, or any other supported Spark file system). More information can be found in the [Cloud Storage](./30-cloud-storage.html) documentation.

## Placing JARS

@@ -129,5 +129,5 @@ Note: _This tool automatically registers several JARS for default cloud storage
## Next Steps
- [Add plugins](./15-plugins.md)
- [Set up your Cloud Storage](./30-cloud-storage.md)
- [Add plugins](./15-plugins.html)
- [Set up your Cloud Storage](./30-cloud-storage.html)
2 changes: 1 addition & 1 deletion docs/20-spark-submit.md
@@ -17,7 +17,7 @@ aztk spark cluster submit --id spark --name pipy examples/src/main/python/pi.py
NOTE: The job name (--name) must be at least 3 characters long, may only contain alphanumeric characters and hyphens (underscores are not allowed), and cannot contain uppercase letters. Each job you submit **must** have a unique name.
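
Below is a small sketch of the naming rule from the note above, expressed as a regular expression. It mirrors the documented constraints; the exact validation AZTK performs internally may differ.

```python
import re

# Documented constraints: at least 3 characters, alphanumerics and hyphens only,
# no underscores, no uppercase letters.
JOB_NAME_RE = re.compile(r"^[a-z0-9-]{3,}$")

for name in ("pipy", "pi", "Pi-Py", "pi_py"):
    print(f"{name}: {'ok' if JOB_NAME_RE.match(name) else 'invalid'}")
# pipy: ok | pi: invalid (too short) | Pi-Py: invalid (uppercase) | pi_py: invalid (underscore)
```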

## Monitoring job
If you have set up an [SSH tunnel](./10-clusters.md#ssh-and-port-forwarding) with port forwarding, you can navigate to http://localhost:8080 and http://localhost:4040 to view the progress of the job using the Spark UI
If you have set up an [SSH tunnel](./10-clusters.html#ssh-and-port-forwarding) with port forwarding, you can navigate to http://localhost:8080 and http://localhost:4040 to view the progress of the job using the Spark UI


## Getting output logs