feat!: add support for Virtual Dataproc cluster running on GKE cluster (#570)

- [ ] Regenerate this pull request now.

Committer: @Padmaar
PiperOrigin-RevId: 429111624

Source-Link: googleapis/googleapis@da999a2

Source-Link: googleapis/googleapis-gen@99c5b3e
Copy-Tag: eyJwIjoiLmdpdGh1Yi8uT3dsQm90LnlhbWwiLCJoIjoiOTljNWIzZTk4YmFhMWRlOTM3NzZhYTRiNWNkNGM3MzYxMzUzZTRmNiJ9
gcf-owl-bot[bot] authored Feb 17, 2022
1 parent 4d125ff commit 51c045d
Showing 8 changed files with 3,958 additions and 516 deletions.
@@ -90,8 +90,7 @@ message CreateBatchRequest {
// Optional. The ID to use for the batch, which will become the final component of
// the batch's resource name.
//
// This value must be 4-63 characters. Valid characters
// are /[a-z][0-9]-/.
// This value must be 4-63 characters. Valid characters are `/[a-z][0-9]-/`.
string batch_id = 3 [(google.api.field_behavior) = OPTIONAL];

// Optional. A unique ID used to identify the request. If the service
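The `batch_id` constraint above (4-63 characters from the class `/[a-z][0-9]-/`) is easy to check on the client before sending a request. A small sketch of such a check, under the assumption that the character class means lowercase letters, digits, and hyphens:

```python
import re

# Interpretation of the documented batch_id rule: 4-63 characters drawn from
# lowercase letters, digits, and hyphens. The server-side rule may be stricter
# (for example around leading/trailing hyphens), so treat this as a sanity
# check only.
BATCH_ID_RE = re.compile(r"[a-z0-9-]{4,63}")

def is_plausible_batch_id(batch_id: str) -> bool:
    """Return True if batch_id matches the documented character/length rule."""
    return BATCH_ID_RE.fullmatch(batch_id) is not None

assert is_plausible_batch_id("spark-batch-001")
assert not is_plausible_batch_id("ab")           # too short
assert not is_plausible_batch_id("Spark_Batch")  # uppercase and underscore rejected
```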
@@ -1,4 +1,4 @@
// Copyright 2021 Google LLC
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -167,6 +167,15 @@ message Cluster {
// when clusters are updated.
ClusterConfig config = 3 [(google.api.field_behavior) = OPTIONAL];

// Optional. The virtual cluster config, used when creating a Dataproc cluster that
// does not directly control the underlying compute resources, for example,
// when creating a [Dataproc-on-GKE
// cluster](https://cloud.google.com/dataproc/docs/concepts/jobs/dataproc-gke#create-a-dataproc-on-gke-cluster).
// Note that Dataproc may set default values, and values may change when
// clusters are updated. Exactly one of config or virtualClusterConfig must be
// specified.
VirtualClusterConfig virtual_cluster_config = 10 [(google.api.field_behavior) = OPTIONAL];

// Optional. The labels to associate with this cluster.
// Label **keys** must contain 1 to 63 characters, and must conform to
// [RFC 1035](https://www.ietf.org/rfc/rfc1035.txt).
@@ -275,33 +284,56 @@ message ClusterConfig {

// Optional. Metastore configuration.
MetastoreConfig metastore_config = 20 [(google.api.field_behavior) = OPTIONAL];

// Optional. BETA. The Kubernetes Engine config for Dataproc clusters deployed to
// Kubernetes. Setting this is considered mutually exclusive with Compute
// Engine-based options such as `gce_cluster_config`, `master_config`,
// `worker_config`, `secondary_worker_config`, and `autoscaling_config`.
GkeClusterConfig gke_cluster_config = 21 [(google.api.field_behavior) = OPTIONAL];
}

// The GKE config for this cluster.
message GkeClusterConfig {
// A full, namespace-isolated deployment target for an existing GKE cluster.
message NamespacedGkeDeploymentTarget {
// Optional. The target GKE cluster to deploy to.
// Format: 'projects/{project}/locations/{location}/clusters/{cluster_id}'
string target_gke_cluster = 1 [
(google.api.field_behavior) = OPTIONAL,
(google.api.resource_reference) = {
type: "container.googleapis.com/Cluster"
}
];

// Optional. A namespace within the GKE cluster to deploy into.
string cluster_namespace = 2 [(google.api.field_behavior) = OPTIONAL];
// Dataproc cluster config for a cluster that does not directly control the
// underlying compute resources, such as a [Dataproc-on-GKE
// cluster](https://cloud.google.com/dataproc/docs/concepts/jobs/dataproc-gke#create-a-dataproc-on-gke-cluster).
message VirtualClusterConfig {
// Optional. A Cloud Storage bucket used to stage job
// dependencies, config files, and job driver console output.
// If you do not specify a staging bucket, Cloud
// Dataproc will determine a Cloud Storage location (US,
// ASIA, or EU) for your cluster's staging bucket according to the
// Compute Engine zone where your cluster is deployed, and then create
// and manage this project-level, per-location bucket (see
// [Dataproc staging and temp
// buckets](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/staging-bucket)).
// **This field requires a Cloud Storage bucket name, not a `gs://...` URI to
// a Cloud Storage bucket.**
string staging_bucket = 1 [(google.api.field_behavior) = OPTIONAL];

// Optional. A Cloud Storage bucket used to store ephemeral cluster and jobs data,
// such as Spark and MapReduce history files.
// If you do not specify a temp bucket,
// Dataproc will determine a Cloud Storage location (US,
// ASIA, or EU) for your cluster's temp bucket according to the
// Compute Engine zone where your cluster is deployed, and then create
// and manage this project-level, per-location bucket. The default bucket has
// a TTL of 90 days, but you can use any TTL (or none) if you specify a
// bucket (see
// [Dataproc staging and temp
// buckets](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/staging-bucket)).
// **This field requires a Cloud Storage bucket name, not a `gs://...` URI to
// a Cloud Storage bucket.**
string temp_bucket = 2 [(google.api.field_behavior) = OPTIONAL];

oneof infrastructure_config {
// Required. The configuration for running the Dataproc cluster on Kubernetes.
KubernetesClusterConfig kubernetes_cluster_config = 6 [(google.api.field_behavior) = REQUIRED];
}

// Optional. A target for the deployment.
NamespacedGkeDeploymentTarget namespaced_gke_deployment_target = 1 [(google.api.field_behavior) = OPTIONAL];
// Optional. Configuration of auxiliary services used by this cluster.
AuxiliaryServicesConfig auxiliary_services_config = 7 [(google.api.field_behavior) = OPTIONAL];
}

// Auxiliary services configuration for a Cluster.
message AuxiliaryServicesConfig {
// Optional. The Hive Metastore configuration for this workload.
MetastoreConfig metastore_config = 1 [(google.api.field_behavior) = OPTIONAL];

// Optional. The Spark History Server configuration for the workload.
SparkHistoryServerConfig spark_history_server_config = 2 [(google.api.field_behavior) = OPTIONAL];
}
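The new `virtual_cluster_config` field and the `VirtualClusterConfig` message above replace the removed in-ClusterConfig GKE settings. Below is a minimal sketch of a create-cluster call using them, assuming a Python `google-cloud-dataproc` client generated from these protos (older releases do not know the `virtual_cluster_config` field); the project, region, bucket, namespace, and GKE cluster path are placeholder values:

```python
from google.cloud import dataproc_v1

# Placeholder identifiers; replace with real project/region/GKE resources.
project_id = "my-project"
region = "us-central1"

client = dataproc_v1.ClusterControllerClient(
    client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
)

# Exactly one of `config` or `virtual_cluster_config` may be set on a Cluster;
# here only the virtual cluster config (Dataproc-on-GKE) is set.
cluster = {
    "project_id": project_id,
    "cluster_name": "my-gke-backed-cluster",
    "virtual_cluster_config": {
        "staging_bucket": "my-staging-bucket",  # bucket name, not a gs:// URI
        "kubernetes_cluster_config": {
            "kubernetes_namespace": "dataproc-ns",
            "gke_cluster_config": {
                "gke_cluster_target": (
                    f"projects/{project_id}/locations/{region}/clusters/my-gke-cluster"
                ),
                # node_pool_target omitted; Dataproc constructs a default one.
            },
        },
    },
}

operation = client.create_cluster(
    project_id=project_id, region=region, cluster=cluster
)
result = operation.result()  # blocks until the long-running operation completes
print(result.cluster_name)
```

Because `config` and `virtual_cluster_config` are mutually exclusive, the Compute Engine fields (`gce_cluster_config`, `master_config`, and so on) are omitted entirely here.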

// Endpoint config for this cluster
@@ -660,8 +692,8 @@ message DiskConfig {
// Optional. Interface type of local SSDs (default is "scsi").
// Valid values: "scsi" (Small Computer System Interface),
// "nvme" (Non-Volatile Memory Express).
// See [SSD Interface
// types](https://cloud.google.com/compute/docs/disks/local-ssd#performance).
// See [local SSD
// performance](https://cloud.google.com/compute/docs/disks/local-ssd#performance).
string local_ssd_interface = 4 [(google.api.field_behavior) = OPTIONAL];
}
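For context on the `local_ssd_interface` field above, here is a sketch of a worker `InstanceGroupConfig` fragment that requests NVMe local SSDs; the machine type and counts are placeholders, and the chosen machine type and image must actually support NVMe:

```python
# Fragment of an InstanceGroupConfig dict for primary workers.
worker_config = {
    "num_instances": 2,
    "machine_type_uri": "n1-standard-4",
    "disk_config": {
        "boot_disk_size_gb": 500,
        "num_local_ssds": 2,
        "local_ssd_interface": "nvme",  # default is "scsi"
    },
}
```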

@@ -692,6 +724,10 @@ message ClusterStatus {
CREATING = 1;

// The cluster is currently running and healthy. It is ready for use.
//
// **Note:** The cluster state changes from "creating" to "running" status
// after the master node(s), first two primary worker nodes (and the last
// primary worker node if primary workers > 2) are running.
RUNNING = 2;

// The cluster encountered an error. It is not ready for use.
@@ -1,4 +1,4 @@
// Copyright 2021 Google LLC
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -108,6 +108,179 @@ message RuntimeInfo {
string diagnostic_output_uri = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// The cluster's GKE config.
message GkeClusterConfig {
// Optional. A target GKE cluster to deploy to. It must be in the same project and
// region as the Dataproc cluster (the GKE cluster can be zonal or regional).
// Format: 'projects/{project}/locations/{location}/clusters/{cluster_id}'
string gke_cluster_target = 2 [
(google.api.field_behavior) = OPTIONAL
];

// Optional. GKE NodePools where workloads will be scheduled. At least one node pool
// must be assigned the 'default' role. Each role can be given to only a
// single NodePoolTarget. All NodePools must have the same location settings.
// If a nodePoolTarget is not specified, Dataproc constructs a default
// nodePoolTarget.
repeated GkeNodePoolTarget node_pool_target = 3 [(google.api.field_behavior) = OPTIONAL];
}

// The configuration for running the Dataproc cluster on Kubernetes.
message KubernetesClusterConfig {
// Optional. A namespace within the Kubernetes cluster to deploy into. If this namespace
// does not exist, it is created. If it exists, Dataproc
// verifies that another Dataproc VirtualCluster is not installed
// into it. If not specified, the name of the Dataproc Cluster is used.
string kubernetes_namespace = 1 [(google.api.field_behavior) = OPTIONAL];

oneof config {
// Required. The configuration for running the Dataproc cluster on GKE.
GkeClusterConfig gke_cluster_config = 2 [(google.api.field_behavior) = REQUIRED];
}

// Optional. The software configuration for this Dataproc cluster running on Kubernetes.
KubernetesSoftwareConfig kubernetes_software_config = 3 [(google.api.field_behavior) = OPTIONAL];
}

// The software configuration for this Dataproc cluster running on Kubernetes.
message KubernetesSoftwareConfig {
// The components that should be installed in this Dataproc cluster. The key
// must be a string from the KubernetesComponent enumeration. The value is
// the version of the software to be installed.
// At least one entry must be specified.
map<string, string> component_version = 1;

// The properties to set on daemon config files.
//
// Property keys are specified in `prefix:property` format, for example
// `spark:spark.kubernetes.container.image`. The following are supported
// prefixes and their mappings:
//
// * spark: `spark-defaults.conf`
//
// For more information, see [Cluster
// properties](https://cloud.google.com/dataproc/docs/concepts/cluster-properties).
map<string, string> properties = 2;
}
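A sketch of a `KubernetesSoftwareConfig` value in dict form; the component key must be a name from the KubernetesComponent enumeration and the version string is a placeholder, while the property key is the one cited in the comment above:

```python
kubernetes_software_config = {
    # Key must be a KubernetesComponent enum name; the value is the software
    # version to install. Both shown here are illustrative placeholders.
    "component_version": {"SPARK": "3.1"},
    # The "spark:" prefix maps to spark-defaults.conf, per the comment above.
    "properties": {
        "spark:spark.kubernetes.container.image": "gcr.io/my-project/my-spark-image:latest",
    },
}
```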

// GKE NodePools that Dataproc workloads run on.
message GkeNodePoolTarget {
// `Role` specifies whose tasks will run on the NodePool. The roles can be
// specific to workloads. Exactly one GkeNodePoolTarget within the
// VirtualCluster must have the 'default' role, which is used to run all workloads
// that are not associated with a NodePool.
enum Role {
// Role is unspecified.
ROLE_UNSPECIFIED = 0;

// Any roles that are not directly assigned to a NodePool run on the
// `default` role's NodePool.
DEFAULT = 1;

// Run controllers and webhooks.
CONTROLLER = 2;

// Run spark driver.
SPARK_DRIVER = 3;

// Run spark executors.
SPARK_EXECUTOR = 4;
}

// Required. The target GKE NodePool.
// Format:
// 'projects/{project}/locations/{location}/clusters/{cluster}/nodePools/{node_pool}'
string node_pool = 1 [
(google.api.field_behavior) = REQUIRED
];

// Required. The roles to assign to this GKE NodePool.
repeated Role roles = 2 [(google.api.field_behavior) = REQUIRED];

// Optional. The configuration for the GKE NodePool.
//
// If specified, Dataproc attempts to create a NodePool with the
// specified shape. If one with the same name already exists, it is
// verified against all specified fields. If a field differs, the
// virtual cluster creation will fail.
//
// If omitted, any NodePool with the specified name is used. If a
// NodePool with the specified name does not exist, Dataproc creates a NodePool
// with default values.
GkeNodePoolConfig node_pool_config = 3 [(google.api.field_behavior) = OPTIONAL];
}
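A sketch of a `node_pool_target` list that follows the rules above: exactly one target carries the `DEFAULT` role, and no role is repeated across targets; the project, location, and node pool names are placeholders:

```python
node_pool_base = (
    "projects/my-project/locations/us-central1/clusters/my-gke-cluster/nodePools"
)

node_pool_target = [
    {
        # Exactly one target must carry the DEFAULT role; it runs any workload
        # that is not matched by a more specific role below.
        "node_pool": f"{node_pool_base}/dp-default",
        "roles": ["DEFAULT"],
    },
    {
        "node_pool": f"{node_pool_base}/dp-drivers",
        "roles": ["SPARK_DRIVER", "CONTROLLER"],
    },
    {
        "node_pool": f"{node_pool_base}/dp-executors",
        "roles": ["SPARK_EXECUTOR"],
    },
]
```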

// The configuration of a GKE NodePool used by a [Dataproc-on-GKE
// cluster](https://cloud.google.com/dataproc/docs/concepts/jobs/dataproc-gke#create-a-dataproc-on-gke-cluster).
message GkeNodePoolConfig {
// Parameters that describe cluster nodes.
message GkeNodeConfig {
// Optional. The name of a Compute Engine [machine
// type](https://cloud.google.com/compute/docs/machine-types).
string machine_type = 1 [(google.api.field_behavior) = OPTIONAL];

// Optional. Whether the nodes are created as [preemptible VM
// instances](https://cloud.google.com/compute/docs/instances/preemptible).
bool preemptible = 10 [(google.api.field_behavior) = OPTIONAL];

// Optional. The number of local SSD disks to attach to the node, which is limited by
// the maximum number of disks allowable per zone (see [Adding Local
// SSDs](https://cloud.google.com/compute/docs/disks/local-ssd)).
int32 local_ssd_count = 7 [(google.api.field_behavior) = OPTIONAL];

// Optional. A list of [hardware
// accelerators](https://cloud.google.com/compute/docs/gpus) to attach to
// each node.
repeated GkeNodePoolAcceleratorConfig accelerators = 11 [(google.api.field_behavior) = OPTIONAL];

// Optional. [Minimum CPU
// platform](https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform)
// to be used by this instance. The instance may be scheduled on the
// specified or a newer CPU platform. Specify the friendly names of CPU
// platforms, such as "Intel Haswell" or "Intel Sandy Bridge".
string min_cpu_platform = 13 [(google.api.field_behavior) = OPTIONAL];
}

// A GkeNodeConfigAcceleratorConfig represents a Hardware Accelerator request
// for a NodePool.
message GkeNodePoolAcceleratorConfig {
// The number of accelerator cards exposed to an instance.
int64 accelerator_count = 1;

// The accelerator type resource name (see GPUs on Compute Engine).
string accelerator_type = 2;
}

// GkeNodePoolAutoscaling contains information the cluster autoscaler needs to
// adjust the size of the node pool to the current cluster usage.
message GkeNodePoolAutoscalingConfig {
// The minimum number of nodes in the NodePool. Must be >= 0 and <=
// max_node_count.
int32 min_node_count = 2;

// The maximum number of nodes in the NodePool. Must be >= min_node_count.
// **Note:** Quota must be sufficient to scale up the cluster.
int32 max_node_count = 3;
}

// Optional. The node pool configuration.
GkeNodeConfig config = 2 [(google.api.field_behavior) = OPTIONAL];

// Optional. The list of Compute Engine
// [zones](https://cloud.google.com/compute/docs/zones#available) where
// NodePool's nodes will be located.
//
// **Note:** Currently, only one zone may be specified.
//
// If a location is not specified during NodePool creation, Dataproc will
// choose a location.
repeated string locations = 13 [(google.api.field_behavior) = OPTIONAL];

// Optional. The autoscaler configuration for this NodePool. The autoscaler is enabled
// only when a valid configuration is present.
GkeNodePoolAutoscalingConfig autoscaling = 4 [(google.api.field_behavior) = OPTIONAL];
}
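A sketch of a `GkeNodePoolConfig` in dict form for one of the targets above, combining a node shape, a single zone, and autoscaling bounds; the machine type, GPU type, zone, and limits are placeholders:

```python
node_pool_config = {
    "config": {
        "machine_type": "n1-standard-8",
        "preemptible": False,
        "local_ssd_count": 1,
        "accelerators": [
            {"accelerator_count": 1, "accelerator_type": "nvidia-tesla-t4"},
        ],
        "min_cpu_platform": "Intel Haswell",
    },
    # Only one zone may currently be specified.
    "locations": ["us-central1-a"],
    # The autoscaler is enabled only when a valid configuration is present.
    "autoscaling": {"min_node_count": 0, "max_node_count": 10},
}
```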

// Cluster components that can be activated.
enum Component {
// Unspecified component. Specifying this will cause Cluster creation to fail.