From 07fcf34f6b4c2e3424252aaca5f8b370b6c75bef Mon Sep 17 00:00:00 2001
From: Dalton Hubble <dghubble@gmail.com>
Date: Sun, 15 Apr 2018 00:50:43 -0700
Subject: [PATCH] Switch GCP network lb to global TCP proxy lb

* Allow multi-controller clusters on Google Cloud
* GCP regional network load balancers have a long-standing open
bug in which requests originating from a backend instance
are routed to the instance itself, regardless of whether
the health check passes or not. As a result, only the 0th
controller node registers. We've recommended just using
single-master GCP clusters for a while
* https://issuetracker.google.com/issues/67366622
* Workaround issue by switching to a GCP TCP Proxy load
balancer. TCP proxy lb routes traffic to a backend service
(global) of instance group backends. In our case, spread
controllers across 3 zones (all regions have 3+ zones) and
organize them in 3 zonal unmanaged instance groups that
serve as backends. Allows multi-controller cluster creation
* GCP network load balancers only allowed legacy HTTP health
checks so kubelet 10255 was checked as an approximation of
controller health. Replace with TCP apiserver health checks
to detect unhealthy or unresponsive apiservers.
* Drawbacks: GCP provision time increases, tailed logs now
time out (similar tradeoff in AWS), controllers only span 3
zones instead of the exact number in the region
* Workaround in Typhoon has been known and posted for 5 months,
but there still appears to be no better alternative. It's
probably time to support multi-master and accept the downsides
---
 CHANGES.md                                    |   8 ++
 docs/topics/performance.md                    |   2 +-
 .../container-linux/kubernetes/apiserver.tf   | 105 ++++++++++++------
 .../container-linux/kubernetes/controllers.tf |  13 ++-
 .../container-linux/kubernetes/ssh.tf         |   2 +-
 5 files changed, 90 insertions(+), 40 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index f47e81795..395fc2d4d 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -4,6 +4,14 @@ Notable changes between versions.
 
 ## Latest
 
+#### Google Cloud
+
+* Add support for multi-controller clusters (i.e. multi-master) ([#54](https://github.com/poseidon/typhoon/issues/54), [#190](https://github.com/poseidon/typhoon/pull/190))
+  * Switch from network load balancer to TCP proxy load balancer. Avoids a bug in Google network load balancers that limited clusters to only bootstrapping one controller node.
+  * Add health check for apiserver pods on controllers. Replace approximating kubelet health checks
+
+## v1.10.1
+
 * Kubernetes [v1.10.1](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG-1.10.md#v1101)
 * Enable etcd v3.3 metrics endpoint ([#175](https://github.com/poseidon/typhoon/pull/175))
 * Use `k8s.gcr.io` instead of `gcr.io/google_containers` ([#180](https://github.com/poseidon/typhoon/pull/180))
diff --git a/docs/topics/performance.md b/docs/topics/performance.md
index 68ff03d4c..5b1fd6473 100644
--- a/docs/topics/performance.md
+++ b/docs/topics/performance.md
@@ -9,7 +9,7 @@ Provisioning times vary based on the platform. Sampling the time to create (appl
 | AWS           | 6 min | 5 min   |
 | Bare-Metal    | 10-14 min | NA  |
 | Digital Ocean | 3 min 30 sec | 20 sec |
-| Google Cloud  | 4 min | 4 min 30 sec |
+| Google Cloud  | 7 min | 4 min 30 sec |
 
 Notes:
 
diff --git a/google-cloud/container-linux/kubernetes/apiserver.tf b/google-cloud/container-linux/kubernetes/apiserver.tf
index f7f41dba5..573db376d 100644
--- a/google-cloud/container-linux/kubernetes/apiserver.tf
+++ b/google-cloud/container-linux/kubernetes/apiserver.tf
@@ -1,10 +1,5 @@
-# Static IPv4 address for the Network Load Balancer
-resource "google_compute_address" "controllers-ip" {
-  name = "${var.cluster_name}-controllers-ip"
-}
-
-# DNS record for the Network Load Balancer
-resource "google_dns_record_set" "controllers" {
+# TCP Proxy load balancer DNS record
+resource "google_dns_record_set" "apiserver" {
   # DNS Zone name where record should be created
   managed_zone = "${var.dns_zone_name}"
 
@@ -13,44 +8,88 @@ resource "google_dns_record_set" "controllers" {
   type = "A"
   ttl  = 300
 
-  # IPv4 address of controllers' network load balancer
-  rrdatas = ["${google_compute_address.controllers-ip.address}"]
+  # IPv4 address of apiserver TCP Proxy load balancer
+  rrdatas = ["${google_compute_global_address.apiserver-ipv4.address}"]
 }
 
-# Network Load Balancer for controllers
-resource "google_compute_forwarding_rule" "controller-https-rule" {
-  name       = "${var.cluster_name}-controller-https-rule"
-  ip_address = "${google_compute_address.controllers-ip.address}"
-  port_range = "443"
-  target     = "${google_compute_target_pool.controllers.self_link}"
+# Static IPv4 address for the TCP Proxy Load Balancer
+resource "google_compute_global_address" "apiserver-ipv4" {
+  name = "${var.cluster_name}-apiserver-ip"
+  ip_version = "IPV4"
 }
 
-# Target pool of instances for the controller(s) Network Load Balancer
-resource "google_compute_target_pool" "controllers" {
-  name = "${var.cluster_name}-controller-pool"
+# Forward IPv4 TCP traffic to the TCP proxy load balancer
+resource "google_compute_global_forwarding_rule" "apiserver" {
+  name = "${var.cluster_name}-apiserver"
+  ip_address = "${google_compute_global_address.apiserver-ipv4.address}"
+  ip_protocol = "TCP"
+  port_range = "443"
+  target = "${google_compute_target_tcp_proxy.apiserver.self_link}"
+}
 
-  instances = [
-    "${google_compute_instance.controllers.*.self_link}",
-  ]
+# Global TCP Proxy Load Balancer for apiservers
+resource "google_compute_target_tcp_proxy" "apiserver" {
+  name = "${var.cluster_name}-apiserver"
+  description = "Distribute TCP load across ${var.cluster_name} controllers"
+  backend_service = "${google_compute_backend_service.apiserver.self_link}"
+}
 
-  health_checks = [
-    "${google_compute_http_health_check.kubelet.name}",
-  ]
+# Global backend service backed by unmanaged instance groups
+resource "google_compute_backend_service" "apiserver" {
+  name = "${var.cluster_name}-apiserver"
+  description = "${var.cluster_name} apiserver service"
 
+  protocol = "TCP"
+  port_name = "apiserver"
   session_affinity = "NONE"
+  timeout_sec = "60"
+
+  # controller(s) spread across zonal instance groups
+  backend {
+    group = "${google_compute_instance_group.controllers.0.self_link}"
+  }
+  backend {
+    group = "${google_compute_instance_group.controllers.1.self_link}"
+  }
+  backend {
+    group = "${google_compute_instance_group.controllers.2.self_link}"
+  }
+
+  health_checks = ["${google_compute_health_check.apiserver.self_link}"]
+}
+
+# Instance group of heterogeneous (unmanaged) controller instances
+resource "google_compute_instance_group" "controllers" {
+  count = "${length(local.zones)}"
+
+  name = "${format("%s-controllers-%s", var.cluster_name, element(local.zones, count.index))}"
+  zone = "${element(local.zones, count.index)}"
+
+  named_port {
+    name = "apiserver"
+    port = "443"
+  }
+
+  # add instances in the zone into the instance group
+  instances = [
+    "${matchkeys(google_compute_instance.controllers.*.self_link,
+      google_compute_instance.controllers.*.zone,
+      list(element(local.zones, count.index)))}"
+  ]
 }
 
-# Kubelet HTTP Health Check
-resource "google_compute_http_health_check" "kubelet" {
-  name        = "${var.cluster_name}-kubelet-health"
-  description = "Health check Kubelet health host port"
+# TCP health check for apiserver
+resource "google_compute_health_check" "apiserver" {
+  name = "${var.cluster_name}-apiserver-tcp-health"
+  description = "TCP health check for kube-apiserver"
 
-  timeout_sec        = 5
+  timeout_sec = 5
   check_interval_sec = 5
 
-  healthy_threshold   = 2
-  unhealthy_threshold = 4
+  healthy_threshold = 1
+  unhealthy_threshold = 3
 
-  port         = 10255
-  request_path = "/healthz"
+  tcp_health_check {
+    port  = "443"
+  }
 }
diff --git a/google-cloud/container-linux/kubernetes/controllers.tf b/google-cloud/container-linux/kubernetes/controllers.tf
index 9acce2dc4..37e780e0a 100644
--- a/google-cloud/container-linux/kubernetes/controllers.tf
+++ b/google-cloud/container-linux/kubernetes/controllers.tf
@@ -19,12 +19,19 @@ data "google_compute_zones" "all" {
   region = "${var.region}"
 }
 
+locals {
+  # TCP proxy load balancers require a fixed number of zonal backends. Spread
+  # controllers over up to 3 zones, since all GCP regions have at least 3.
+  zones = "${slice(data.google_compute_zones.all.names, 0, 3)}"
+  controllers_ipv4_public = ["${google_compute_instance.controllers.*.network_interface.0.access_config.0.assigned_nat_ip}"]
+}
+
 # Controller instances
 resource "google_compute_instance" "controllers" {
   count = "${var.controller_count}"
 
   name         = "${var.cluster_name}-controller-${count.index}"
-  zone         = "${element(data.google_compute_zones.all.names, count.index)}"
+  zone         = "${element(local.zones, count.index)}"
   machine_type = "${var.controller_type}"
 
   metadata {
@@ -51,10 +58,6 @@ resource "google_compute_instance" "controllers" {
   tags           = ["${var.cluster_name}-controller"]
 }
 
-locals {
-  controllers_ipv4_public = ["${google_compute_instance.controllers.*.network_interface.0.access_config.0.assigned_nat_ip}"]
-}
-
 # Controller Container Linux Config
 data "template_file" "controller_config" {
   count = "${var.controller_count}"
diff --git a/google-cloud/container-linux/kubernetes/ssh.tf b/google-cloud/container-linux/kubernetes/ssh.tf
index 692fbb24d..bba9b9fce 100644
--- a/google-cloud/container-linux/kubernetes/ssh.tf
+++ b/google-cloud/container-linux/kubernetes/ssh.tf
@@ -66,7 +66,7 @@ resource "null_resource" "bootkube-start" {
   depends_on = [
     "module.bootkube",
     "module.workers",
-    "google_dns_record_set.controllers",
+    "google_dns_record_set.apiserver",
     "null_resource.copy-controller-secrets",
   ]