This repository has been archived by the owner on Jun 3, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathconfig.yml
102 lines (82 loc) · 3.43 KB
/
config.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Settings for generating the benchmark input data.
dataGeneration:
  gensort:
    # Whether to generate data for the sort benchmark.
    enabled: true
  tpcds:
    # Whether to generate data for the TPC-DS benchmark.
    enabled: true
    # NOTE(review): nesting under "tpcds" is reconstructed from key order;
    # confirm against the configuration schema.
    tempWorkingDir: /tmp/spark-benchmark/datagen-working-dir
  # Whether to overwrite any previously generated data.
  # If set to false, the program will fail with an exception.
  overwriteData: false
  # The number of threads used for data generation.
  parallelism: 5
# Settings for running the benchmarks.
benchmarks:
  # Optional. Defaults to current timestamp.
  experimentName: experiment-1
  gensort:
    # Whether to run the sort benchmark.
    enabled: false
  tpcds:
    # Whether to run the TPC-DS benchmark suite.
    enabled: false
  # The number of iterations to run each benchmark.
  iterations: 10
# Data sizes, in gigabytes, for which data is generated and the benchmarks are run.
dataScalesGb:
  - 10
  - 1024
# Hadoop filesystem definitions and Hadoop configuration overrides.
hadoop:
  # A map of a filesystem name to its settings. The names are used to
  # reference the filesystem in the settings below.
  # Supported types are "azure", "s3a" and "simple".
  # Use "simple" for HDFS and local file systems.
  filesystems:
    azure:
      accessKey: __PLACEHOLDER__
      accountName: __PLACEHOLDER__
      containerName: spark-benchmark
      type: azure
    local:
      baseUri: 'file:///home/palantir/spark-benchmark'
      type: simple
    s3a:
      baseUri: 's3a://spark-benchmark/'
      accessKey: __PLACEHOLDER__  # Optional
      secretKey: __PLACEHOLDER__  # Optional
      type: s3a
  # The filesystem to be used to generate data, read the inputs from and
  # write the query results to.
  defaultFilesystem: s3a
  # The filesystem to be used to write the query duration related metrics to.
  # We recommend using a "simple" filesystem for this.
  # This is optional and will default to "defaultFilesystem" if empty.
  metricsFileSystem: local
  hadoopConf:
    hadoop.tmp.dir: /scratch/hadoop-tmp
    # S3 specific settings to commit data consistently.
    fs.s3a.committer.name: directory
    fs.s3a.committer.staging.conflict-mode: append
  hadoopConfDirs:
    # Directory containing core-site.xml, hdfs-site.xml and yarn-site.xml
    # if necessary.
    - /path/to/hadoop/conf
# Spark master selection, executor sizing and Spark configuration overrides.
spark:
  # Run local spark by default.
  master: local[4]
  # If running a distributed benchmark:
  # executorCores: 2
  # executorInstances: 48
  # executorMemory: 8G
  sparkConf:
    # If running a distributed benchmark on YARN:
    # master: yarn
    # spark.yarn.executor.memoryOverhead: 2G
    # spark.executorEnv.JAVA_HOME: /path/to/java/home
    # spark.yarn.jars: service/lib/*
    # spark.yarn.stagingDir: 'hdfs://my-hdfs/user/palantir'
    spark.hadoop.hadoop.tmp.dir: /scratch/hadoop-tmp
    # Use more reliable committer settings.
    spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version: 2
    spark.sql.parquet.output.committer.class: org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter
    spark.sql.sources.commitProtocolClass: org.apache.spark.internal.io.cloud.PathOutputCommitProtocol
    # S3 specific settings to commit data consistently.
    spark.hadoop.mapreduce.outputcommitter.factory.scheme.s3a: org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory
    # Use adaptive query execution for partitioning generated data better,
    # as well as performing joins more efficiently.
    spark.sql.adaptive.enabled: true
    # Set a very high number, so that the benchmark can continue even if
    # certain queries fail.
    spark.yarn.max.executor.failures: 1000