diff --git a/.github/workflows/devnet-deploy.yml b/.github/workflows/devnet-deploy.yml index 127f49b3143..e3b2537fc60 100644 --- a/.github/workflows/devnet-deploy.yml +++ b/.github/workflows/devnet-deploy.yml @@ -3,6 +3,10 @@ name: Deploy devnet on: workflow_dispatch: inputs: + cluster: + description: The cluster to deploy to, e.g. aztec-gke-private + required: true + default: "aztec-gke-private" namespace: description: The namespace to deploy to, e.g. smoke required: true @@ -40,7 +44,7 @@ env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} CONTRACT_S3_BUCKET: s3://static.aztec.network - CLUSTER_NAME: aztec-gke + CLUSTER_NAME: ${{ inputs.cluster }} REGION: us-west1-a NAMESPACE: ${{ inputs.namespace }} AZTEC_DOCKER_IMAGE: ${{ inputs.aztec_docker_image }} @@ -50,6 +54,7 @@ jobs: uses: ./.github/workflows/network-deploy.yml with: namespace: ${{ github.event.inputs.namespace }} + cluster: ${{ github.event.inputs.cluster }} values_file: release-devnet.yaml aztec_docker_image: ${{ github.event.inputs.aztec_docker_image }} deployment_mnemonic_secret_name: ${{ github.event.inputs.deployment_mnemonic_secret_name }} diff --git a/.github/workflows/network-deploy.yml b/.github/workflows/network-deploy.yml index d21fa650890..04cdb157380 100644 --- a/.github/workflows/network-deploy.yml +++ b/.github/workflows/network-deploy.yml @@ -3,6 +3,10 @@ name: Aztec Network Deployment on: workflow_call: inputs: + cluster: + description: The cluster to deploy to, e.g. aztec-gke-private + required: true + type: string namespace: description: The namespace to deploy to, e.g. smoke required: true @@ -50,6 +54,10 @@ on: required: true workflow_dispatch: inputs: + cluster: + description: The cluster to deploy to, e.g. aztec-gke-private + required: true + type: string namespace: description: The namespace to deploy to, e.g. 
smoke required: true @@ -103,10 +111,10 @@ jobs: DEPLOYMENT_MNEMONIC_SECRET_NAME: ${{ inputs.deployment_mnemonic_secret_name }} DEPLOYMENT_SALT: ${{ inputs.deployment_salt }} CHART_PATH: ./spartan/aztec-network - CLUSTER_NAME: aztec-gke + CLUSTER_NAME: ${{ inputs.cluster }} REGION: us-west1-a TF_STATE_BUCKET: aztec-terraform - GKE_CLUSTER_CONTEXT: gke_testnet-440309_us-west1-a_aztec-gke + GKE_CLUSTER_CONTEXT: "gke_testnet-440309_us-west1-a_${{ inputs.cluster }}" steps: - name: Checkout code diff --git a/.github/workflows/network-test.yml b/.github/workflows/network-test.yml index 8ed37dfa51d..dc1dcf57ee1 100644 --- a/.github/workflows/network-test.yml +++ b/.github/workflows/network-test.yml @@ -3,6 +3,10 @@ name: Aztec Network Test on: workflow_dispatch: inputs: + cluster: + description: The cluster to deploy to, e.g. aztec-gke-private + required: true + default: "aztec-gke-private" namespace: description: The namespace to deploy to, e.g. smoke required: true @@ -22,10 +26,10 @@ jobs: NAMESPACE: ${{ inputs.namespace }} TEST: ${{ inputs.test }} CHART_PATH: ./spartan/aztec-network - CLUSTER_NAME: aztec-gke + CLUSTER_NAME: ${{ inputs.cluster }} REGION: us-west1-a PROJECT_ID: testnet-440309 - GKE_CLUSTER_CONTEXT: gke_testnet-440309_us-west1-a_aztec-gke + GKE_CLUSTER_CONTEXT: "gke_testnet-440309_us-west1-a_${{ inputs.cluster }}" steps: - name: Checkout code diff --git a/.github/workflows/nightly-masternet-deploy.yml b/.github/workflows/nightly-masternet-deploy.yml index b2f937c891a..16ae44102be 100644 --- a/.github/workflows/nightly-masternet-deploy.yml +++ b/.github/workflows/nightly-masternet-deploy.yml @@ -28,6 +28,7 @@ jobs: uses: ./.github/workflows/network-deploy.yml with: ref: master + cluster: aztec-gke-private namespace: masternet values_file: rc-1.yaml aztec_docker_image: aztecprotocol/aztec@${{ needs.get-latest-commit.outputs.commit }} diff --git a/spartan/terraform/gke-cluster/cluster/main.tf b/spartan/terraform/gke-cluster/cluster/main.tf new file mode 
100644 index 00000000000..00baa168529 --- /dev/null +++ b/spartan/terraform/gke-cluster/cluster/main.tf @@ -0,0 +1,270 @@ +# Create a GKE cluster +resource "google_container_cluster" "primary" { + name = var.cluster_name + location = var.zone + + initial_node_count = 1 + # Remove default node pool after cluster creation + remove_default_node_pool = true + deletion_protection = true + + # Kubernetes version + min_master_version = "latest" + + # Network configuration + network = "default" + subnetwork = "default" + + # Master auth configuration + master_auth { + client_certificate_config { + issue_client_certificate = false + } + } +} + +# Create 2 core node pool with local ssd +resource "google_container_node_pool" "aztec_nodes_2core_ssd" { + name = "${var.cluster_name}-2core-ssd" + location = var.zone + cluster = google_container_cluster.primary.name # implicit dependency: cluster is created before this pool + + # Enable autoscaling + autoscaling { + min_node_count = 0 + max_node_count = 256 + } + + # Node configuration + node_config { + machine_type = "n2d-standard-2" + ephemeral_storage_local_ssd_config { + local_ssd_count = 1 + } + + service_account = var.service_account + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + + labels = { + env = "production" + + } + tags = ["aztec-gke-node", "aztec"] + } +} + +# Create 4 core node pool with local ssd +resource "google_container_node_pool" "aztec_nodes_4core_ssd" { + name = "${var.cluster_name}-4core-ssd" + location = var.zone + cluster = google_container_cluster.primary.name # implicit dependency: cluster is created before this pool + + # Enable autoscaling + autoscaling { + min_node_count = 0 + max_node_count = 256 + } + + # Node configuration + node_config { + machine_type = "n2d-standard-4" + ephemeral_storage_local_ssd_config { + local_ssd_count = 1 + } + + service_account = var.service_account + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + + labels = { + env = "production" + + } + tags = ["aztec-gke-node", "aztec"] + } +} + +# Create node pool for simulated aztec nodes (validators, prover nodes, boot nodes) 
+resource "google_container_node_pool" "aztec_nodes_2core" { + name = "${var.cluster_name}-2core" + location = var.zone + cluster = google_container_cluster.primary.name # implicit dependency: cluster is created before this pool + + # Enable autoscaling + autoscaling { + min_node_count = 0 + max_node_count = 256 + } + + # Node configuration + node_config { + machine_type = "t2d-standard-2" + + service_account = var.service_account + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + + labels = { + env = "production" + } + tags = ["aztec-gke-node", "aztec"] + } +} + +# Create node pool for aztec nodes (validators, prover nodes, boot nodes) +resource "google_container_node_pool" "aztec_nodes_4core" { + name = "${var.cluster_name}-4core" + location = var.zone + cluster = google_container_cluster.primary.name # implicit dependency: cluster is created before this pool + + # Enable autoscaling + autoscaling { + min_node_count = 0 + max_node_count = 256 + } + + # Node configuration + node_config { + machine_type = "t2d-standard-4" + + service_account = var.service_account + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + + labels = { + env = "production" + } + tags = ["aztec-gke-node", "aztec"] + } + + # Management configuration + management { + auto_repair = true + auto_upgrade = true + } +} + +# Create 8 core nodes. 
Usually for proven bots +resource "google_container_node_pool" "aztec_nodes-8core" { + name = "${var.cluster_name}-8core" + location = var.zone + cluster = google_container_cluster.primary.name # implicit dependency: cluster is created before this pool + + # Enable autoscaling + autoscaling { + min_node_count = 0 + max_node_count = 256 + } + + # Node configuration + node_config { + machine_type = "t2d-standard-8" + + service_account = var.service_account + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + + labels = { + env = "production" + } + tags = ["aztec-gke-node", "aztec"] + } + + # Management configuration + management { + auto_repair = true + auto_upgrade = true + } +} + +# Create spot instance node pool with autoscaling +resource "google_container_node_pool" "spot_nodes_32core" { + name = "${var.cluster_name}-32core-spot" + location = var.zone + cluster = google_container_cluster.primary.name # implicit dependency: cluster is created before this pool + + # Enable autoscaling + autoscaling { + min_node_count = 0 + max_node_count = 1500 + } + + # Node configuration + node_config { + machine_type = "t2d-standard-32" + spot = true + + service_account = var.service_account + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + + labels = { + env = "production" + pool = "spot" + } + tags = ["aztec-gke-node", "spot"] + + # Spot instance termination handler + taint { + key = "cloud.google.com/gke-spot" + value = "true" + effect = "NO_SCHEDULE" + } + } + + # Management configuration + management { + auto_repair = true + auto_upgrade = true + } +} + +# Create low core count spot instance node pool with autoscaling +resource "google_container_node_pool" "spot_nodes_2core" { + name = "${var.cluster_name}-2core-spot" + location = var.zone + cluster = google_container_cluster.primary.name # implicit dependency: cluster is created before this pool + + # Enable autoscaling + autoscaling { + min_node_count = 0 + max_node_count = 1500 + } + + # Node configuration + node_config { + machine_type = "t2d-standard-2" + spot = true + + service_account = var.service_account + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + + labels = { + env = 
"production" + pool = "spot" + } + tags = ["aztec-gke-node", "spot"] + + # Spot instance termination handler + taint { + key = "cloud.google.com/gke-spot" + value = "true" + effect = "NO_SCHEDULE" + } + } + + # Management configuration + management { + auto_repair = true + auto_upgrade = true + } +} diff --git a/spartan/terraform/gke-cluster/cluster/outputs.tf b/spartan/terraform/gke-cluster/cluster/outputs.tf new file mode 100644 index 00000000000..5f7edd9630e --- /dev/null +++ b/spartan/terraform/gke-cluster/cluster/outputs.tf @@ -0,0 +1,9 @@ +output "cluster_endpoint" { + description = "GKE Cluster Endpoint" + value = google_container_cluster.primary.endpoint +} + +output "kubernetes_cluster_name" { + description = "GKE Cluster Name" + value = google_container_cluster.primary.name +} diff --git a/spartan/terraform/gke-cluster/cluster/variables.tf b/spartan/terraform/gke-cluster/cluster/variables.tf new file mode 100644 index 00000000000..cf492537eb9 --- /dev/null +++ b/spartan/terraform/gke-cluster/cluster/variables.tf @@ -0,0 +1,27 @@ +variable "project" { + description = "Google Cloud project ID, passed in by the root module" + type = string + default = "testnet-440309" +} + +variable "region" { + description = "Google Cloud region, passed in by the root module" + type = string + default = "us-west1" +} + +variable "zone" { + description = "Zone used as the location of the cluster and of every node pool" + type = string + default = "us-west1-a" +} + +variable "cluster_name" { + description = "Name of the GKE cluster; also the prefix of every node pool name" + type = string +} + +variable "service_account" { + description = "Email of the service account assigned to the nodes of every node pool" + type = string +} diff --git a/spartan/terraform/gke-cluster/firewall.tf b/spartan/terraform/gke-cluster/firewall.tf index 0dc4b406ce3..22b4fadb5d3 100644 --- a/spartan/terraform/gke-cluster/firewall.tf +++ b/spartan/terraform/gke-cluster/firewall.tf @@ -8,7 +8,7 @@ resource "google_compute_firewall" "udp_ingress" { } direction = "INGRESS" source_ranges = ["0.0.0.0/0"] - target_tags = ["gke-node", "aztec-gke-node"] + target_tags = ["aztec-gke-node"] } # Create egress firewall rules for UDP @@ -21,7 +21,7 @@ resource "google_compute_firewall" "udp_egress" { } direction = "EGRESS" destination_ranges = ["0.0.0.0/0"] - target_tags = ["gke-node", "aztec-gke-node"] + target_tags = ["aztec-gke-node"] } # Create ingress firewall rules for TCP @@ -34,7 +34,7 @@ resource 
"google_compute_firewall" "tcp_ingress" { } direction = "INGRESS" source_ranges = ["0.0.0.0/0"] - target_tags = ["gke-node", "aztec-gke-node"] + target_tags = ["aztec-gke-node"] } # Create egress firewall rules for TCP @@ -47,5 +47,5 @@ resource "google_compute_firewall" "tcp_egress" { } direction = "EGRESS" destination_ranges = ["0.0.0.0/0"] - target_tags = ["gke-node", "aztec-gke-node"] + target_tags = ["aztec-gke-node"] } diff --git a/spartan/terraform/gke-cluster/iam.tf b/spartan/terraform/gke-cluster/iam.tf new file mode 100644 index 00000000000..3943b24b7ec --- /dev/null +++ b/spartan/terraform/gke-cluster/iam.tf @@ -0,0 +1,38 @@ +# Create the service account +resource "google_service_account" "gke_sa" { + account_id = "aztec-gke-nodes-sa" + display_name = "Aztec GKE Nodes Service Account" + description = "Service account for aztec GKE nodes" +} + +# Add IAM roles to the service account +resource "google_project_iam_member" "gke_sa_roles" { + for_each = toset([ + "roles/logging.logWriter", + "roles/monitoring.metricWriter", + "roles/monitoring.viewer", + "roles/artifactregistry.reader" + ]) + project = var.project + role = each.key + member = "serviceAccount:${google_service_account.gke_sa.email}" +} + +# Create a new service account for Helm +resource "google_service_account" "helm_sa" { + account_id = "helm-sa" + display_name = "Helm Service Account" + description = "Service account for Helm operations" +} + +# Add IAM roles to the Helm service account +resource "google_project_iam_member" "helm_sa_roles" { + for_each = toset([ + "roles/container.admin", + "roles/storage.admin", + "roles/secretmanager.admin" + ]) + project = var.project + role = each.key + member = "serviceAccount:${google_service_account.helm_sa.email}" +} diff --git a/spartan/terraform/gke-cluster/main.tf b/spartan/terraform/gke-cluster/main.tf index 008f7a59874..7baca828d34 100644 --- a/spartan/terraform/gke-cluster/main.tf +++ b/spartan/terraform/gke-cluster/main.tf @@ -18,302 +18,22 @@ 
provider "google" { region = var.region } -# Create the service account -resource "google_service_account" "gke_sa" { - account_id = "aztec-gke-nodes-sa" - display_name = "Aztec GKE Nodes Service Account" - description = "Service account for aztec GKE nodes" -} - -# Add IAM roles to the service account -resource "google_project_iam_member" "gke_sa_roles" { - for_each = toset([ - "roles/logging.logWriter", - "roles/monitoring.metricWriter", - "roles/monitoring.viewer", - "roles/artifactregistry.reader" - ]) - project = var.project - role = each.key - member = "serviceAccount:${google_service_account.gke_sa.email}" -} - -# Create a new service account for Helm -resource "google_service_account" "helm_sa" { - account_id = "helm-sa" - display_name = "Helm Service Account" - description = "Service account for Helm operations" -} +module "gke_cluster_private" { + source = "./cluster" -# Add IAM roles to the Helm service account -resource "google_project_iam_member" "helm_sa_roles" { - for_each = toset([ - "roles/container.admin", - "roles/storage.admin", - "roles/secretmanager.admin" - ]) - project = var.project - role = each.key - member = "serviceAccount:${google_service_account.helm_sa.email}" + cluster_name = "aztec-gke-private" + project = var.project + region = var.region + zone = var.zone + service_account = google_service_account.gke_sa.email } -# Create a GKE cluster -resource "google_container_cluster" "primary" { - name = var.cluster_name - location = var.zone - - initial_node_count = 1 - # Remove default node pool after cluster creation - remove_default_node_pool = true - - # Kubernetes version - min_master_version = "latest" +module "gke_cluster_public" { + source = "./cluster" - # Network configuration - network = "default" - subnetwork = "default" - - # Master auth configuration - master_auth { - client_certificate_config { - issue_client_certificate = false - } - } -} - -# Create primary node pool with autoscaling -resource "google_container_node_pool" 
"primary_nodes" { - name = "primary-node-pool" - location = var.zone - cluster = google_container_cluster.primary.name - - # Enable autoscaling - autoscaling { - min_node_count = 1 - max_node_count = 2 - } - - # Node configuration - node_config { - machine_type = "t2d-standard-32" - - service_account = google_service_account.gke_sa.email - oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" - ] - - labels = { - env = "production" - } - tags = ["aztec-gke-node"] - } - - # Management configuration - management { - auto_repair = true - auto_upgrade = true - } -} - -# Create 2 core node pool with local ssd -resource "google_container_node_pool" "aztec_nodes_2core_ssd" { - name = "aztec-nodes-2core-ssd" - location = var.zone - cluster = google_container_cluster.primary.name - - # Enable autoscaling - autoscaling { - min_node_count = 1 - max_node_count = 256 - } - - # Node configuration - node_config { - machine_type = "n2d-standard-2" - ephemeral_storage_local_ssd_config { - local_ssd_count = 1 - } - - service_account = google_service_account.gke_sa.email - oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" - ] - - labels = { - env = "production" - - } - tags = ["aztec-gke-node", "aztec"] - } -} - -# Create 4 core node pool with local ssd -resource "google_container_node_pool" "aztec_nodes_4core_ssd" { - name = "aztec-nodes-4core-ssd" - location = var.zone - cluster = google_container_cluster.primary.name - - # Enable autoscaling - autoscaling { - min_node_count = 1 - max_node_count = 256 - } - - # Node configuration - node_config { - machine_type = "n2d-standard-4" - ephemeral_storage_local_ssd_config { - local_ssd_count = 1 - } - - service_account = google_service_account.gke_sa.email - oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" - ] - - labels = { - env = "production" - - } - tags = ["aztec-gke-node", "aztec"] - } -} - -# Create node pool for simulated aztec nodes (validators, prover nodes, boot nodes) 
-resource "google_container_node_pool" "aztec_nodes_simulated" { - name = "aztec-node-pool-simulated" - location = var.zone - cluster = google_container_cluster.primary.name - - # Enable autoscaling - autoscaling { - min_node_count = 1 - max_node_count = 256 - } - - # Node configuration - node_config { - machine_type = "t2d-standard-2" - - service_account = google_service_account.gke_sa.email - oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" - ] - - labels = { - env = "production" - } - tags = ["aztec-gke-node", "aztec"] - } -} - -# Create node pool for aztec nodes (validators, prover nodes, boot nodes) -resource "google_container_node_pool" "aztec_nodes" { - name = "aztec-node-pool" - location = var.zone - cluster = google_container_cluster.primary.name - - # Enable autoscaling - autoscaling { - min_node_count = 1 - max_node_count = 256 - } - - # Node configuration - node_config { - machine_type = "t2d-standard-4" - - service_account = google_service_account.gke_sa.email - oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" - ] - - labels = { - env = "production" - } - tags = ["aztec-gke-node", "aztec"] - } - - # Management configuration - management { - auto_repair = true - auto_upgrade = true - } -} - -# Create 8 core nodes. 
Usually for proven bots -resource "google_container_node_pool" "aztec_nodes-8core" { - name = "aztec-nodes-8core" - location = var.zone - cluster = google_container_cluster.primary.name - - # Enable autoscaling - autoscaling { - min_node_count = 1 - max_node_count = 256 - } - - # Node configuration - node_config { - machine_type = "t2d-standard-8" - - service_account = google_service_account.gke_sa.email - oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" - ] - - labels = { - env = "production" - } - tags = ["aztec-gke-node", "aztec"] - } - - # Management configuration - management { - auto_repair = true - auto_upgrade = true - } -} - -# Create spot instance node pool with autoscaling -resource "google_container_node_pool" "spot_nodes" { - name = "aztec-spot-node-pool" - location = var.zone - cluster = google_container_cluster.primary.name - - # Enable autoscaling - autoscaling { - min_node_count = 0 - max_node_count = 1500 - } - - # Node configuration - node_config { - machine_type = "t2d-standard-32" - spot = true - - service_account = google_service_account.gke_sa.email - oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" - ] - - labels = { - env = "production" - pool = "spot" - } - tags = ["aztec-gke-node", "spot"] - - # Spot instance termination handler - taint { - key = "cloud.google.com/gke-spot" - value = "true" - effect = "NO_SCHEDULE" - } - } - - # Management configuration - management { - auto_repair = true - auto_upgrade = true - } + cluster_name = "aztec-gke-public" + project = var.project + region = var.region + zone = var.zone + service_account = google_service_account.gke_sa.email } diff --git a/spartan/terraform/gke-cluster/outputs.tf b/spartan/terraform/gke-cluster/outputs.tf index befaa28092e..5a027b23444 100644 --- a/spartan/terraform/gke-cluster/outputs.tf +++ b/spartan/terraform/gke-cluster/outputs.tf @@ -1,7 +1,3 @@ -output "cluster_endpoint" { - value = google_container_cluster.primary.endpoint -} - output 
"service_account_email" { value = google_service_account.gke_sa.email } @@ -10,8 +6,3 @@ output "region" { description = "Google cloud region" value = var.region } - -output "kubernetes_cluster_name" { - description = "GKE Cluster Name" - value = google_container_cluster.primary.name -} diff --git a/spartan/terraform/gke-cluster/variables.tf b/spartan/terraform/gke-cluster/variables.tf index 83e1925cbd4..b115475dcb6 100644 --- a/spartan/terraform/gke-cluster/variables.tf +++ b/spartan/terraform/gke-cluster/variables.tf @@ -10,6 +10,3 @@ variable "zone" { default = "us-west1-a" } -variable "cluster_name" { - default = "aztec-gke" -}