Add AWS S3 support for stable/spark-history-server (helm#9615)
* Add AWS S3 support for stable/spark-history-server

Signed-off-by: Will Son <[email protected]>

* Fix typo and formatting

Signed-off-by: Will Son <[email protected]>
dongwoo1005 authored and k8s-ci-robot committed Nov 30, 2018
1 parent a25c621 commit f6d6e8d
Showing 5 changed files with 86 additions and 4 deletions.
2 changes: 1 addition & 1 deletion stable/spark-history-server/Chart.yaml
@@ -1,5 +1,5 @@
name: spark-history-server
version: 0.2.1
version: 0.3.0
appVersion: 2.4.0
description: A Helm chart for Spark History Server
home: https://spark.apache.org
42 changes: 41 additions & 1 deletion stable/spark-history-server/README.md
@@ -12,7 +12,7 @@
$ kubectl -n <history-server-namespace> create configmap hdfs-site --from-file=hdfs-site.xml && kubectl -n <history-server-namespace> create configmap core-site --from-file=core-site.xml
```

* Secret (Only if using GCS)
* Secret (Only if using GCS or S3 without IAM-based authentication)

If using GCS as storage, follow the preparatory steps below:

@@ -36,6 +36,18 @@

Then install the chart to enable the history server pod to read from the GCS bucket.

Similarly, if using S3 as storage, follow the preparatory steps below:
```bash
$ aws s3 mb s3://your-spark-event-log-directory # default bucket is s3://spark-hs/
$ aws iam list-access-keys --user-name your-user-name --output text | awk '{print $2}' >> aws-access-key
$ echo "your-aws-secret-key" >> aws-secret-key
```

Then create a secret:
```bash
$ kubectl create secret generic aws-secrets --from-file=aws-access-key --from-file=aws-secret-key
```
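
With the secret in place, the chart can be installed with S3 enabled. A minimal sketch (the bucket name, namespace, and key-based authentication overrides are illustrative placeholders; with IAM-based authentication the `s3.enableIAM` and `s3.secret` overrides can be dropped):
```bash
$ helm install stable/spark-history-server \
    --namespace history-server \
    --set s3.enableS3=true \
    --set s3.enableIAM=false \
    --set s3.secret=aws-secrets \
    --set s3.logDirectory=s3a://your-spark-event-log-directory/
```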

* PVC (Only if using PVC)

If you are using a PVC as the backing storage for Spark history events, then you'll need to create the PVC before installing the chart. On the Google Kubernetes Engine (GKE), the recommended underlying PersistentVolume is NFS. You can also use Portworx or Gluster. All three options provide sharing capabilities that would allow both the history server pod and the Spark job pods to mount the same PVC.
@@ -94,6 +106,12 @@ Note that the default image `lightbend/spark-history-server` is built using this
| gcs.secret |Pre-mounted secret name for GCS connection|history-secrets|
| gcs.key |The JSON key file name|sparkonk8s.json|
| gcs.logDirectory |The GCS log directory that starts with "gs://"|gs://spark-hs/|
| s3.enableS3 | Whether to use S3 storage | false |
| s3.enableIAM | Whether to use IAM-based authentication or fall back to using the AWS access key ID and secret access key | true |
| s3.secret | Pre-mounted secret name for the S3 connection. Omit if using IAM-based authentication | aws-secrets |
| s3.accessKeyName | The file name that contains the AWS access key ID. Omit if using IAM-based authentication | aws-access-key |
| s3.secretKeyName | The file name that contains the AWS secret access key. Omit if using IAM-based authentication | aws-secret-key |
| s3.logDirectory | The S3 log directory that starts with "s3a://" | s3a://spark-hs/ |

Note that only when `pvc.enablePVC` is set to `true`, the following settings take effect:

@@ -108,6 +126,14 @@ Similarly, only when `gcs.enableGCS` is `true`, the following settings take effect:
* gcs.key
* gcs.logDirectory

Similarly, only when `s3.enableS3` is `true`, the following settings take effect (see the sketch after this list):

* s3.enableIAM
* s3.secret
* s3.accessKeyName
* s3.secretKeyName
* s3.logDirectory
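
For example, with IAM-based authentication a minimal values override is just the following (a sketch; the bucket name is a placeholder):
```yaml
s3:
  enableS3: true
  # enableIAM defaults to true, so secret, accessKeyName, and secretKeyName are omitted
  logDirectory: s3a://your-spark-event-log-directory/
```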

And only when `pvc.enablePVC`, `gcs.enableGCS`, and `s3.enableS3` are all `false` is HDFS used, in which case the settings below are in effect:

* hdfs.logDirectory
@@ -186,3 +212,17 @@ bin/spark-submit \

Note that the image for your Spark job (i.e. `spark.kubernetes.container.image`, `spark.kubernetes.driver.container.image` and `spark.kubernetes.executor.container.image`) needs to have the [GCS connector](https://cloud.google.com/dataproc/docs/concepts/connectors/cloud-storage) dependency, which is included in `lightbend/spark-history-server:2.4.0`, otherwise the `gs://` scheme won't be recognized.

##### S3

In the case of S3, IAM-based authentication is recommended; the IAM role should have access equivalent to the `AmazonS3FullAccess` policy. To write event logs to S3, you need to provide configs as below:
```bash
--conf spark.eventLog.enabled=true \
--conf spark.eventLog.dir=s3a://spark-hs/
```
As with GCS, note that the image for your Spark job needs to include the necessary dependencies, `hadoop-aws-2.7.5.jar` and `aws-java-sdk-1.7.4.jar`; otherwise the `s3a://` scheme won't be recognized.

When not using IAM-based authentication, you need to provide additional configs for authentication as below:
```bash
--conf spark.hadoop.fs.s3a.access.key=your-AWS-access-key-ID \
--conf spark.hadoop.fs.s3a.secret.key=your-AWS-secret-access-key
```
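
Putting the pieces together, a complete `spark-submit` invocation that writes event logs to S3 without IAM-based authentication might look like the following sketch, modeled on the GCS example above (the API server address, image, keys, and example jar are placeholders; the image must include the `hadoop-aws` and `aws-java-sdk` jars noted earlier):
```bash
bin/spark-submit \
  --master k8s://https://<k8s-apiserver-host>:<k8s-apiserver-port> \
  --deploy-mode cluster \
  --name spark-pi \
  --class org.apache.spark.examples.SparkPi \
  --conf spark.kubernetes.container.image=your-repo/your-spark-image:2.4.0 \
  --conf spark.eventLog.enabled=true \
  --conf spark.eventLog.dir=s3a://spark-hs/ \
  --conf spark.hadoop.fs.s3a.access.key=your-AWS-access-key-ID \
  --conf spark.hadoop.fs.s3a.secret.key=your-AWS-secret-access-key \
  local:///opt/spark/examples/jars/spark-examples_2.11-2.4.0.jar
```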
5 changes: 5 additions & 0 deletions stable/spark-history-server/templates/configmap.yaml
@@ -10,6 +10,7 @@ metadata:
data:
enablePVC: {{ .Values.pvc.enablePVC | quote }}
enableGCS: {{ .Values.gcs.enableGCS | quote }}
enableS3: {{ .Values.s3.enableS3 | quote }}
{{- range $key, $val := .Values.environment }}
{{ $key }}: {{ $val | quote }}
{{- end }}
@@ -21,6 +22,10 @@ data:
{{- range $key, $val := .Values.gcs }}
{{ $key }}: {{ $val | quote }}
{{- end }}
{{- else if .Values.s3.enableS3 }}
{{- range $key, $val := .Values.s3 }}
{{ $key }}: {{ $val | quote }}
{{- end }}
{{- else }}
{{- range $key, $val := .Values.hdfs }}
{{ $key }}: {{ $val | quote }}
30 changes: 28 additions & 2 deletions stable/spark-history-server/templates/deployment.yaml
@@ -44,6 +44,15 @@ spec:
export SPARK_HISTORY_OPTS="$SPARK_HISTORY_OPTS \
-Dspark.hadoop.google.cloud.auth.service.account.json.keyfile=/etc/secrets/$key \
-Dspark.history.fs.logDirectory=$logDirectory";
elif [ "$enableS3" == "true" ]; then
export SPARK_HISTORY_OPTS="$SPARK_HISTORY_OPTS \
    -Dspark.history.fs.logDirectory=$logDirectory \
-Dspark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem";
if [ "$enableIAM" == "false" ]; then
export SPARK_HISTORY_OPTS="$SPARK_HISTORY_OPTS \
-Dspark.hadoop.fs.s3a.access.key=$(cat /etc/secrets/${accessKeyName}) \
-Dspark.hadoop.fs.s3a.secret.key=$(cat /etc/secrets/${secretKeyName})";
fi;
else
export SPARK_HISTORY_OPTS="$SPARK_HISTORY_OPTS \
-Dspark.history.fs.logDirectory=$logDirectory";
@@ -60,31 +69,48 @@ spec:
httpGet:
path: /
port: historyport
volumeMounts:
{{- if .Values.pvc.enablePVC }}
volumeMounts:
- name: data
mountPath: /mnt/{{ .Values.pvc.eventsDir }}
{{- else if .Values.gcs.enableGCS }}
volumeMounts:
- name: secrets-volume
mountPath: /etc/secrets
{{- else if .Values.s3.enableS3 }}
{{- if (not .Values.s3.enableIAM) }}
volumeMounts:
- name: secrets-volume
mountPath: /etc/secrets
{{- end }}
{{- else }}
volumeMounts:
- name: core-site
mountPath: /etc/hadoop/core-site.xml
subPath: core-site.xml
- name: hdfs-site
mountPath: /etc/hadoop/hdfs-site.xml
subPath: hdfs-site.xml
{{- end }}
volumes:
{{- if .Values.pvc.enablePVC }}
volumes:
- name: data
persistentVolumeClaim:
claimName: {{ .Values.pvc.existingClaimName }}
{{- else if .Values.gcs.enableGCS }}
volumes:
- name: secrets-volume
secret:
secretName: {{ .Values.gcs.secret }}
{{- else if .Values.s3.enableS3 }}
{{- if (not .Values.s3.enableIAM) }}
volumes:
- name: secrets-volume
secret:
secretName: {{ .Values.s3.secret }}
{{- end }}
{{- else }}
volumes:
- name: hdfs-site
configMap:
name: {{ .Values.hdfs.hdfsSiteConfigMap }}
11 changes: 11 additions & 0 deletions stable/spark-history-server/values.yaml
@@ -52,3 +52,14 @@ hdfs:
coreSiteConfigMap: core-site
logDirectory: hdfs://hdfs/history/
HADOOP_CONF_DIR: /etc/hadoop

s3:
enableS3: false
enableIAM: true
# Omit for IAM role-based or provider-based authentication.
secret: aws-secrets
# accessKeyName is the name of the file containing the AWS access key ID. Omit for IAM role-based or provider-based authentication.
accessKeyName: aws-access-key
# secretKeyName is the name of the file containing the AWS secret access key. Omit for IAM role-based or provider-based authentication.
secretKeyName: aws-secret-key
logDirectory: s3a://spark-hs/