feat!: add support for Virtual Dataproc cluster running on GKE cluster (#570)

- [ ] Regenerate this pull request now.

Committer: @Padmaar
PiperOrigin-RevId: 429111624

Source-Link: googleapis/googleapis@da999a2

Source-Link: googleapis/googleapis-gen@99c5b3e
Copy-Tag: eyJwIjoiLmdpdGh1Yi8uT3dsQm90LnlhbWwiLCJoIjoiOTljNWIzZTk4YmFhMWRlOTM3NzZhYTRiNWNkNGM3MzYxMzUzZTRmNiJ9
gcf-owl-bot[bot] authored Feb 17, 2022
1 parent 4d125ff commit 51c045d
Showing 8 changed files with 3,958 additions and 516 deletions.
@@ -90,8 +90,7 @@ message CreateBatchRequest {
// Optional. The ID to use for the batch, which will become the final component of
// the batch's resource name.
//
// This value must be 4-63 characters. Valid characters
// are /[a-z][0-9]-/.
// This value must be 4-63 characters. Valid characters are `/[a-z][0-9]-/`.
string batch_id = 3 [(google.api.field_behavior) = OPTIONAL];

// Optional. A unique ID used to identify the request. If the service
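The `batch_id` constraint above (4-63 characters from the class `/[a-z][0-9]-/`) is easy to check on the client before sending a request. A small sketch of such a check, under the assumption that the character class means lowercase letters, digits, and hyphens:

```python
import re

# Interpretation of the documented batch_id rule: 4-63 characters drawn from
# lowercase letters, digits, and hyphens. The server-side rule may be stricter
# (for example around leading/trailing hyphens), so treat this as a sanity
# check only.
BATCH_ID_RE = re.compile(r"[a-z0-9-]{4,63}")

def is_plausible_batch_id(batch_id: str) -> bool:
    """Return True if batch_id matches the documented character/length rule."""
    return BATCH_ID_RE.fullmatch(batch_id) is not None

assert is_plausible_batch_id("spark-batch-001")
assert not is_plausible_batch_id("ab")           # too short
assert not is_plausible_batch_id("Spark_Batch")  # uppercase and underscore rejected
```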
@@ -1,4 +1,4 @@
// Copyright 2021 Google LLC
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -167,6 +167,15 @@ message Cluster {
// when clusters are updated.
ClusterConfig config = 3 [(google.api.field_behavior) = OPTIONAL];

// Optional. The virtual cluster config, used when creating a Dataproc cluster that
// does not directly control the underlying compute resources, for example,
// when creating a [Dataproc-on-GKE
// cluster](https://cloud.google.com/dataproc/docs/concepts/jobs/dataproc-gke#create-a-dataproc-on-gke-cluster).
// Note that Dataproc may set default values, and values may change when
// clusters are updated. Exactly one of config or virtualClusterConfig must be
// specified.
VirtualClusterConfig virtual_cluster_config = 10 [(google.api.field_behavior) = OPTIONAL];

// Optional. The labels to associate with this cluster.
// Label **keys** must contain 1 to 63 characters, and must conform to
// [RFC 1035](https://www.ietf.org/rfc/rfc1035.txt).
@@ -275,33 +284,56 @@ message ClusterConfig {

// Optional. Metastore configuration.
MetastoreConfig metastore_config = 20 [(google.api.field_behavior) = OPTIONAL];

// Optional. BETA. The Kubernetes Engine config for Dataproc clusters deployed to
// Kubernetes. Setting this is considered mutually exclusive with Compute
// Engine-based options such as `gce_cluster_config`, `master_config`,
// `worker_config`, `secondary_worker_config`, and `autoscaling_config`.
GkeClusterConfig gke_cluster_config = 21 [(google.api.field_behavior) = OPTIONAL];
}

// The GKE config for this cluster.
message GkeClusterConfig {
// A full, namespace-isolated deployment target for an existing GKE cluster.
message NamespacedGkeDeploymentTarget {
// Optional. The target GKE cluster to deploy to.
// Format: 'projects/{project}/locations/{location}/clusters/{cluster_id}'
string target_gke_cluster = 1 [
(google.api.field_behavior) = OPTIONAL,
(google.api.resource_reference) = {
type: "container.googleapis.com/Cluster"
}
];

// Optional. A namespace within the GKE cluster to deploy into.
string cluster_namespace = 2 [(google.api.field_behavior) = OPTIONAL];
// Dataproc cluster config for a cluster that does not directly control the
// underlying compute resources, such as a [Dataproc-on-GKE
// cluster](https://cloud.google.com/dataproc/docs/concepts/jobs/dataproc-gke#create-a-dataproc-on-gke-cluster).
message VirtualClusterConfig {
// Optional. A Cloud Storage bucket used to stage job
// dependencies, config files, and job driver console output.
// If you do not specify a staging bucket, Cloud
// Dataproc will determine a Cloud Storage location (US,
// ASIA, or EU) for your cluster's staging bucket according to the
// Compute Engine zone where your cluster is deployed, and then create
// and manage this project-level, per-location bucket (see
// [Dataproc staging and temp
// buckets](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/staging-bucket)).
// **This field requires a Cloud Storage bucket name, not a `gs://...` URI to
// a Cloud Storage bucket.**
string staging_bucket = 1 [(google.api.field_behavior) = OPTIONAL];

// Optional. A Cloud Storage bucket used to store ephemeral cluster and jobs data,
// such as Spark and MapReduce history files.
// If you do not specify a temp bucket,
// Dataproc will determine a Cloud Storage location (US,
// ASIA, or EU) for your cluster's temp bucket according to the
// Compute Engine zone where your cluster is deployed, and then create
// and manage this project-level, per-location bucket. The default bucket has
// a TTL of 90 days, but you can use any TTL (or none) if you specify a
// bucket (see
// [Dataproc staging and temp
// buckets](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/staging-bucket)).
// **This field requires a Cloud Storage bucket name, not a `gs://...` URI to
// a Cloud Storage bucket.**
string temp_bucket = 2 [(google.api.field_behavior) = OPTIONAL];

oneof infrastructure_config {
// Required. The configuration for running the Dataproc cluster on Kubernetes.
KubernetesClusterConfig kubernetes_cluster_config = 6 [(google.api.field_behavior) = REQUIRED];
}

// Optional. A target for the deployment.
NamespacedGkeDeploymentTarget namespaced_gke_deployment_target = 1 [(google.api.field_behavior) = OPTIONAL];
// Optional. Configuration of auxiliary services used by this cluster.
AuxiliaryServicesConfig auxiliary_services_config = 7 [(google.api.field_behavior) = OPTIONAL];
}

// Auxiliary services configuration for a Cluster.
message AuxiliaryServicesConfig {
// Optional. The Hive Metastore configuration for this workload.
MetastoreConfig metastore_config = 1 [(google.api.field_behavior) = OPTIONAL];

// Optional. The Spark History Server configuration for the workload.
SparkHistoryServerConfig spark_history_server_config = 2 [(google.api.field_behavior) = OPTIONAL];
}
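The new `virtual_cluster_config` field and the `VirtualClusterConfig` message above replace the removed in-ClusterConfig GKE settings. Below is a minimal sketch of a create-cluster call using them, assuming a Python `google-cloud-dataproc` client generated from these protos (older releases do not know the `virtual_cluster_config` field); the project, region, bucket, namespace, and GKE cluster path are placeholder values:

```python
from google.cloud import dataproc_v1

# Placeholder identifiers; replace with real project/region/GKE resources.
project_id = "my-project"
region = "us-central1"

client = dataproc_v1.ClusterControllerClient(
    client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
)

# Exactly one of `config` or `virtual_cluster_config` may be set on a Cluster;
# here only the virtual cluster config (Dataproc-on-GKE) is set.
cluster = {
    "project_id": project_id,
    "cluster_name": "my-gke-backed-cluster",
    "virtual_cluster_config": {
        "staging_bucket": "my-staging-bucket",  # bucket name, not a gs:// URI
        "kubernetes_cluster_config": {
            "kubernetes_namespace": "dataproc-ns",
            "gke_cluster_config": {
                "gke_cluster_target": (
                    f"projects/{project_id}/locations/{region}/clusters/my-gke-cluster"
                ),
                # node_pool_target omitted; Dataproc constructs a default one.
            },
        },
    },
}

operation = client.create_cluster(
    project_id=project_id, region=region, cluster=cluster
)
result = operation.result()  # blocks until the long-running operation completes
print(result.cluster_name)
```

Because `config` and `virtual_cluster_config` are mutually exclusive, the Compute Engine fields (`gce_cluster_config`, `master_config`, and so on) are omitted entirely here.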

// Endpoint config for this cluster
@@ -660,8 +692,8 @@ message DiskConfig {
// Optional. Interface type of local SSDs (default is "scsi").
// Valid values: "scsi" (Small Computer System Interface),
// "nvme" (Non-Volatile Memory Express).
// See [SSD Interface
// types](https://cloud.google.com/compute/docs/disks/local-ssd#performance).
// See [local SSD
// performance](https://cloud.google.com/compute/docs/disks/local-ssd#performance).
string local_ssd_interface = 4 [(google.api.field_behavior) = OPTIONAL];
}
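For context on the `local_ssd_interface` field above, here is a sketch of a worker `InstanceGroupConfig` fragment that requests NVMe local SSDs; the machine type and counts are placeholders, and the chosen machine type and image must actually support NVMe:

```python
# Fragment of an InstanceGroupConfig dict for primary workers.
worker_config = {
    "num_instances": 2,
    "machine_type_uri": "n1-standard-4",
    "disk_config": {
        "boot_disk_size_gb": 500,
        "num_local_ssds": 2,
        "local_ssd_interface": "nvme",  # default is "scsi"
    },
}
```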

@@ -692,6 +724,10 @@ message ClusterStatus {
CREATING = 1;

// The cluster is currently running and healthy. It is ready for use.
//
// **Note:** The cluster state changes from "creating" to "running" status
// after the master node(s), first two primary worker nodes (and the last
// primary worker node if primary workers > 2) are running.
RUNNING = 2;

// The cluster encountered an error. It is not ready for use.
@@ -1,4 +1,4 @@
// Copyright 2021 Google LLC
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -108,6 +108,179 @@ message RuntimeInfo {
string diagnostic_output_uri = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// The cluster's GKE config.
message GkeClusterConfig {
// Optional. A target GKE cluster to deploy to. It must be in the same project and
// region as the Dataproc cluster (the GKE cluster can be zonal or regional).
// Format: 'projects/{project}/locations/{location}/clusters/{cluster_id}'
string gke_cluster_target = 2 [
(google.api.field_behavior) = OPTIONAL
];

// Optional. GKE NodePools where workloads will be scheduled. At least one node pool
// must be assigned the 'default' role. Each role can be given to only a
// single NodePoolTarget. All NodePools must have the same location settings.
// If a nodePoolTarget is not specified, Dataproc constructs a default
// nodePoolTarget.
repeated GkeNodePoolTarget node_pool_target = 3 [(google.api.field_behavior) = OPTIONAL];
}

// The configuration for running the Dataproc cluster on Kubernetes.
message KubernetesClusterConfig {
// Optional. A namespace within the Kubernetes cluster to deploy into. If this namespace
// does not exist, it is created. If it exists, Dataproc
// verifies that another Dataproc VirtualCluster is not installed
// into it. If not specified, the name of the Dataproc Cluster is used.
string kubernetes_namespace = 1 [(google.api.field_behavior) = OPTIONAL];

oneof config {
// Required. The configuration for running the Dataproc cluster on GKE.
GkeClusterConfig gke_cluster_config = 2 [(google.api.field_behavior) = REQUIRED];
}

// Optional. The software configuration for this Dataproc cluster running on Kubernetes.
KubernetesSoftwareConfig kubernetes_software_config = 3 [(google.api.field_behavior) = OPTIONAL];
}

// The software configuration for this Dataproc cluster running on Kubernetes.
message KubernetesSoftwareConfig {
// The components that should be installed in this Dataproc cluster. The key
// must be a string from the KubernetesComponent enumeration. The value is
// the version of the software to be installed.
// At least one entry must be specified.
map<string, string> component_version = 1;

// The properties to set on daemon config files.
//
// Property keys are specified in `prefix:property` format, for example
// `spark:spark.kubernetes.container.image`. The following are supported
// prefixes and their mappings:
//
// * spark: `spark-defaults.conf`
//
// For more information, see [Cluster
// properties](https://cloud.google.com/dataproc/docs/concepts/cluster-properties).
map<string, string> properties = 2;
}
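A sketch of a `KubernetesSoftwareConfig` value in dict form; the component key must be a name from the KubernetesComponent enumeration and the version string is a placeholder, while the property key is the one cited in the comment above:

```python
kubernetes_software_config = {
    # Key must be a KubernetesComponent enum name; the value is the software
    # version to install. Both shown here are illustrative placeholders.
    "component_version": {"SPARK": "3.1"},
    # The "spark:" prefix maps to spark-defaults.conf, per the comment above.
    "properties": {
        "spark:spark.kubernetes.container.image": "gcr.io/my-project/my-spark-image:latest",
    },
}
```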

// GKE NodePools that Dataproc workloads run on.
message GkeNodePoolTarget {
// `Role` specifies whose tasks will run on the NodePool. The roles can be
// specific to workloads. Exactly one GkeNodePoolTarget within the
// VirtualCluster must have the 'default' role, which is used to run all workloads
// that are not associated with a NodePool.
enum Role {
// Role is unspecified.
ROLE_UNSPECIFIED = 0;

// Any roles that are not directly assigned to a NodePool run on the
// `default` role's NodePool.
DEFAULT = 1;

// Run controllers and webhooks.
CONTROLLER = 2;

// Run spark driver.
SPARK_DRIVER = 3;

// Run spark executors.
SPARK_EXECUTOR = 4;
}

// Required. The target GKE NodePool.
// Format:
// 'projects/{project}/locations/{location}/clusters/{cluster}/nodePools/{node_pool}'
string node_pool = 1 [
(google.api.field_behavior) = REQUIRED
];

// Required. The roles to assign to this GKE NodePool.
repeated Role roles = 2 [(google.api.field_behavior) = REQUIRED];

// Optional. The configuration for the GKE NodePool.
//
// If specified, Dataproc attempts to create a NodePool with the
// specified shape. If one with the same name already exists, it is
// verified against all specified fields. If a field differs, the
// virtual cluster creation will fail.
//
// If omitted, any NodePool with the specified name is used. If a
// NodePool with the specified name does not exist, Dataproc creates a NodePool
// with default values.
GkeNodePoolConfig node_pool_config = 3 [(google.api.field_behavior) = OPTIONAL];
}
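A sketch of a `node_pool_target` list that follows the rules above: exactly one target carries the `DEFAULT` role, and no role is repeated across targets; the project, location, and node pool names are placeholders:

```python
node_pool_base = (
    "projects/my-project/locations/us-central1/clusters/my-gke-cluster/nodePools"
)

node_pool_target = [
    {
        # Exactly one target must carry the DEFAULT role; it runs any workload
        # that is not matched by a more specific role below.
        "node_pool": f"{node_pool_base}/dp-default",
        "roles": ["DEFAULT"],
    },
    {
        "node_pool": f"{node_pool_base}/dp-drivers",
        "roles": ["SPARK_DRIVER", "CONTROLLER"],
    },
    {
        "node_pool": f"{node_pool_base}/dp-executors",
        "roles": ["SPARK_EXECUTOR"],
    },
]
```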

// The configuration of a GKE NodePool used by a [Dataproc-on-GKE
// cluster](https://cloud.google.com/dataproc/docs/concepts/jobs/dataproc-gke#create-a-dataproc-on-gke-cluster).
message GkeNodePoolConfig {
// Parameters that describe cluster nodes.
message GkeNodeConfig {
// Optional. The name of a Compute Engine [machine
// type](https://cloud.google.com/compute/docs/machine-types).
string machine_type = 1 [(google.api.field_behavior) = OPTIONAL];

// Optional. Whether the nodes are created as [preemptible VM
// instances](https://cloud.google.com/compute/docs/instances/preemptible).
bool preemptible = 10 [(google.api.field_behavior) = OPTIONAL];

// Optional. The number of local SSD disks to attach to the node, which is limited by
// the maximum number of disks allowable per zone (see [Adding Local
// SSDs](https://cloud.google.com/compute/docs/disks/local-ssd)).
int32 local_ssd_count = 7 [(google.api.field_behavior) = OPTIONAL];

// Optional. A list of [hardware
// accelerators](https://cloud.google.com/compute/docs/gpus) to attach to
// each node.
repeated GkeNodePoolAcceleratorConfig accelerators = 11 [(google.api.field_behavior) = OPTIONAL];

// Optional. [Minimum CPU
// platform](https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform)
// to be used by this instance. The instance may be scheduled on the
// specified or a newer CPU platform. Specify the friendly names of CPU
// platforms, such as "Intel Haswell" or "Intel Sandy Bridge".
string min_cpu_platform = 13 [(google.api.field_behavior) = OPTIONAL];
}

// A GkeNodeConfigAcceleratorConfig represents a Hardware Accelerator request
// for a NodePool.
message GkeNodePoolAcceleratorConfig {
// The number of accelerator cards exposed to an instance.
int64 accelerator_count = 1;

// The accelerator type resource name (see GPUs on Compute Engine).
string accelerator_type = 2;
}

// GkeNodePoolAutoscaling contains information the cluster autoscaler needs to
// adjust the size of the node pool to the current cluster usage.
message GkeNodePoolAutoscalingConfig {
// The minimum number of nodes in the NodePool. Must be >= 0 and <=
// max_node_count.
int32 min_node_count = 2;

// The maximum number of nodes in the NodePool. Must be >= min_node_count.
// **Note:** Quota must be sufficient to scale up the cluster.
int32 max_node_count = 3;
}

// Optional. The node pool configuration.
GkeNodeConfig config = 2 [(google.api.field_behavior) = OPTIONAL];

// Optional. The list of Compute Engine
// [zones](https://cloud.google.com/compute/docs/zones#available) where
// NodePool's nodes will be located.
//
// **Note:** Currently, only one zone may be specified.
//
// If a location is not specified during NodePool creation, Dataproc will
// choose a location.
repeated string locations = 13 [(google.api.field_behavior) = OPTIONAL];

// Optional. The autoscaler configuration for this NodePool. The autoscaler is enabled
// only when a valid configuration is present.
GkeNodePoolAutoscalingConfig autoscaling = 4 [(google.api.field_behavior) = OPTIONAL];
}
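A sketch of a `GkeNodePoolConfig` in dict form for one of the targets above, combining a node shape, a single zone, and autoscaling bounds; the machine type, GPU type, zone, and limits are placeholders:

```python
node_pool_config = {
    "config": {
        "machine_type": "n1-standard-8",
        "preemptible": False,
        "local_ssd_count": 1,
        "accelerators": [
            {"accelerator_count": 1, "accelerator_type": "nvidia-tesla-t4"},
        ],
        "min_cpu_platform": "Intel Haswell",
    },
    # Only one zone may currently be specified.
    "locations": ["us-central1-a"],
    # The autoscaler is enabled only when a valid configuration is present.
    "autoscaling": {"min_node_count": 0, "max_node_count": 10},
}
```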

// Cluster components that can be activated.
enum Component {
// Unspecified component. Specifying this will cause Cluster creation to fail.