
Bug: add spark.history.fs.logDirectory to required keys (#456)
* add spark.history.fs.logDirectory to required keys

* add spark_event_log_enabled_key to required_keys

* docs, add history server config to spark-defaults.conf

* fix bad logic

* crlf->lf
jafreck authored Apr 5, 2018
1 parent 32de752 commit 4ef3dd0
Showing 3 changed files with 56 additions and 25 deletions.
36 changes: 18 additions & 18 deletions aztk/node_scripts/install/spark.py
@@ -84,24 +84,6 @@ def start_spark_master():
print(e)


def start_history_server():
# configure the history server
spark_event_log_enabled_key = 'spark.eventLog.enabled'
spark_event_log_directory_key = 'spark.eventLog.dir'
path_to_spark_defaults_conf = os.path.join(spark_home, 'conf/spark-defaults.conf')
properties = parse_configuration_file(path_to_spark_defaults_conf)

# only enable the history server if it was enabled in the configuration file
if properties and spark_event_log_enabled_key in properties:
if spark_event_log_directory_key in properties:
configure_history_server_log_path(properties[spark_event_log_directory_key])

exe = os.path.join(spark_home, "sbin", "start-history-server.sh")
cmd = [exe]
print("Starting history server")
call(cmd)


def start_spark_worker():
wait_for_master()
exe = os.path.join(spark_home, "sbin", "start-slave.sh")
@@ -207,6 +189,24 @@ def parse_configuration_file(path_to_file: str):
print(e)


def start_history_server():
# configure the history server
spark_event_log_enabled_key = 'spark.eventLog.enabled'
spark_event_log_directory_key = 'spark.eventLog.dir'
spark_history_fs_log_directory = 'spark.history.fs.logDirectory'
path_to_spark_defaults_conf = os.path.join(spark_home, 'conf/spark-defaults.conf')
properties = parse_configuration_file(path_to_spark_defaults_conf)
required_keys = [spark_event_log_enabled_key, spark_event_log_directory_key, spark_history_fs_log_directory]

# only enable the history server if it was enabled in the configuration file
if properties:
if all(key in properties for key in required_keys):
configure_history_server_log_path(properties[spark_history_fs_log_directory])
exe = os.path.join(spark_home, "sbin", "start-history-server.sh")
print("Starting history server")
call([exe])


def configure_history_server_log_path(path_to_log_file):
# Check if the file path starts with a local file extension
# If so, create the path on disk otherwise ignore
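For context, the relocated start_history_server depends on parse_configuration_file returning a dict of the key/value pairs found in spark-defaults.conf (or a falsy value when the file cannot be read). A minimal sketch of such a parser, hypothetical rather than the actual aztk implementation, might look like:

```python
def parse_configuration_file(path_to_file: str):
    # Read "key value" pairs from a spark-defaults.conf style file,
    # skipping blank lines and comments; return None if the file is absent.
    properties = {}
    try:
        with open(path_to_file) as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                parts = line.split(None, 1)
                if len(parts) == 2:
                    properties[parts[0]] = parts[1].strip()
        return properties
    except OSError as e:
        print(e)
        return None
```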
28 changes: 22 additions & 6 deletions aztk_cli/config/spark-defaults.conf
@@ -25,18 +25,34 @@
# spark.driver.memory 5g
# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"


#####################################
# do not change the settings below
#####################################
# enable the shuffle service
spark.shuffle.service.enabled true

# enable dynamic allocation
spark.dynamicAllocation.enabled true
#####################################
# #
#####################################


# set "scratch" space for Spark
spark.local.dir /mnt/batch/tasks

# Note: Aztk pre-loads wasb jars, so loading is not necessary
# Note: aztk pre-loads wasb jars, so loading is not necessary
spark.jars /home/spark-current/jars/azure-storage-2.0.0.jar,/home/spark-current/jars/hadoop-azure-2.7.3.jar

# Note: Default filesystem master HA
spark.deploy.recoveryMode FILESYSTEM
spark.deploy.recoveryDirectory /root/

# enable spark shuffle service
spark.shuffle.service.enabled true

# enable dynamic allocation
spark.dynamicAllocation.enabled true
# enable history server
# if the following values are set, history server will be started automatically
# it is recommended to point eventLog.dir and history.fs.logDir to a location
# accessible by all nodes, like HDFS, WASB, or ADL
# spark.eventLog.enabled true
# spark.eventLog.dir <hdfs://namenode:8021/directory, wasb[s]://<BlobStorageContainerName>@<StorageAccountName>.blob.core.windows.net/<path>>
# spark.history.fs.logDirectory <hdfs://namenode:8021/directory, wasb[s]://<BlobStorageContainerName>@<StorageAccountName>.blob.core.windows.net/<path>>
17 changes: 16 additions & 1 deletion docs/13-configuration.md
@@ -11,7 +11,7 @@ This is the default cluster configuration:
# id: <id of the cluster to be created>
id: spark_cluster

# vm_size: <vm-size, see available options here: https://azure.microsoft.com/en-us/pricing/details/virtual-machines/linux/>
# vm_size: <vm-size, see available options here: https://azure.microsoft.com/pricing/details/batch//>
vm_size: standard_a2

# size: <number of dedicated nodes in the cluster, note that clusters must contain all dedicated or all low priority nodes>
@@ -76,6 +76,8 @@ Note that all of the settings in ssh.yaml will be overridden by parameters passed

The repository comes with default Spark configuration files that provision your Spark cluster just as you would locally. After running `aztk spark init` to initialize your working environment, you can view and edit these files at `.aztk/spark-defaults.conf`, `.aztk/spark-env.sh` and `.aztk/core-site.xml`. Please note that you can bring your own Spark configuration files by copying your `spark-defaults.conf`, `spark-env.sh` and `core-site.xml` into your `.aztk/` directory.

If using `aztk` job submission, please note that both `spark.shuffle.service.enabled` and `spark.dynamicAllocation.enabled` must be set to true so that the number of executors registered with an application can scale as nodes in the job's cluster come online.
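Both flags ship enabled in the default `spark-defaults.conf` shown above; if you maintain your own configuration file, keep these two lines:

```
spark.shuffle.service.enabled true
spark.dynamicAllocation.enabled true
```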

The following settings available in `spark-defaults.conf` and `spark-env.sh` are not supported:

`spark-env.sh`:
@@ -92,6 +94,19 @@ The following settings available in `spark-defaults.conf` and `spark-env.sh` are

Also note that this toolkit pre-loads wasb jars, so loading them elsewhere is not necessary.

### History Server
If you want to use Spark's history server, please set the following values in your `.aztk/spark-defaults.conf` file:
```
spark.eventLog.enabled true
spark.eventLog.dir <path>
spark.history.fs.logDirectory <path>
```

Please note that `spark.eventLog.dir` and `spark.history.fs.logDirectory` should generally point to the same location so that the history server reads the logs that each Spark job writes. Also note that while the paths can be local (`file:/`), it is recommended that they be accessible by every node in the cluster, so that the history server, which runs on the Spark master node, has access to all application logs. HDFS, WASB, ADL, or any other Hadoop API compliant storage system may be used.
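For example, a WASB-backed setup (the container and storage account names here are hypothetical) would point both settings at the same location:

```
spark.eventLog.enabled true
spark.eventLog.dir wasbs://sparklogs@myaccount.blob.core.windows.net/spark-events
spark.history.fs.logDirectory wasbs://sparklogs@myaccount.blob.core.windows.net/spark-events
```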

If using WASB, ADL or other cloud storage services, be sure to set your keys in `.aztk/core-site.xml`. For more information, see the [Cloud Storage](./30-cloud-storage.md) documentation.
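As a rough illustration (the property name follows the standard hadoop-azure convention, and the account name and key are placeholders), a WASB account key entry in `core-site.xml` looks like:

```xml
<property>
  <!-- placeholders: substitute your storage account name and key -->
  <name>fs.azure.account.key.MYSTORAGEACCOUNT.blob.core.windows.net</name>
  <value>MY_STORAGE_ACCOUNT_KEY</value>
</property>
```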


## Configuring Spark Storage

The Spark cluster can be configured to use different supported cloud storage offerings (such as Azure Storage Blobs, Azure Data Lake Storage, or any other supported Spark file system). More information can be found in the [Cloud Storage](./30-cloud-storage.md) documentation.
Expand Down
