Cheap and cheerful autoscaler (#229)

Adding a simple autoscaler to collect metrics directly from the pods and scale based on concurrent requests. More in //pkg/autoscaler/doc.go. * Bring back the queue. * Wire queue between nginx and app. * Autoscaler and queue that share a stat type. * Initialize queue with autoscaler service before starting stat reporter. * Connect stat sink. * Add gorilla websocket to deps. * Build the queue with bazel and pass diget into controller through commandline parameter. * Setup env variables and service account for queue to find the autoscaler. * Create autoscaler service and deployment and connect queue. * Reconnect to autoscaler and send pod name. * Calculate 6 and 60 second QPS and scaling action. * Replace 6 and 60 with parameters. * Do actual scaling. Tune parameters. * Scale deployment in the background. * Request a full CPU for each ela pod. * Calculate QPS with floats. * Scale on concurrent requests instead of QPS. * Provide desired concurrency per process in revision spec. * Add test for queue-proxy. Fails becuase of extra autoscaler deployment. * Fix unit tests by checking ela deployment separately from autoscaler deployment. * Add autoscaler deployment env variable test. * Move core autoscaler logic into lib for unit testing. * Refactoring autoscaler for unit testing. * Autoscaler unit tests. * Autoscaler comments. * Only accept target concurrency of 1+. * Limit scale up ratio to 10x. * Fix git rebase mistakes. * Move autoscaler main to cmd. * Move queue sidecar to cmd. * Remove TargetConcurrencyPerProcess revision parameter. * Replace log with glog. * Use defaults for websocket upgrader. * Add service account and binding for autoscaler. * Update deps. * Fix incorrect usage of glog. * Const parameters. * Fix targetConcurrency typo. * Fix typo. * Inject autoscaler name to remove hardcoded value. * Pull out queue parameters into constants. * Add liscense headers to queue and autoscaler. * Cpu requests in constants. * Plumb autoscaler port through env from single constant. * Comment for autoscaler types. * Fix log statement formatting. * Add back ela-revision service account. * Report time from pod with concurrency stat. * Send only one scale request at a time with a 5 second timeout. * Move autoscaler docs to package documentation. * Include pod name in stat key. * Comment about waiting for autoscaler IP. * Add queue->autoscaler connect sleep comment. * Parse losthost url once. * Use singleton proxy in queue. * Fold autoscaler/types package into autoscaler. * Add Record and Scale function comments. * Move environment variable access into init. * Remove enableQueue nginx template parameter. * Comments and copyright headers. * Copyright header.
knative · Feb 28, 2018 · bf78448 · bf78448
1 parent 820c2a3
commit bf78448
Show file tree

Hide file tree

Showing 51 changed files with 5,947 additions and 77 deletions.
diff --git a/BUILD b/BUILD
@@ -13,6 +13,8 @@ k8s_object(
     name = "controller",
     images = {
         "ela-controller:latest": "//cmd/ela-controller:image",
+        "ela-queue:latest": "//cmd/ela-queue:image",
+        "ela-autoscaler:latest": "//cmd/ela-autoscaler:image",
     },
     template = "controller.yaml",
 )

diff --git a/Gopkg.lock b/Gopkg.lock
diff --git a/clusterrolebinding.yaml b/clusterrolebinding.yaml
@@ -24,3 +24,29 @@ roleRef:
   kind: ClusterRole
   name: cluster-admin
   apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRoleBinding
+metadata:
+  name: ela-autoscaler-write
+subjects:
+  - kind: ServiceAccount
+    name: ela-autoscaler
+    namespace: default
+roleRef:
+  kind: ClusterRole
+  name: cluster-admin
+  apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: rbac.authorization.k8s.io/v1beta1
+kind: ClusterRoleBinding
+metadata:
+  name: ela-revision-read
+subjects:
+  - kind: ServiceAccount
+    name: ela-revision
+    namespace: default
+roleRef:
+  kind: ClusterRole
+  name: cluster-admin  # TODO(josephburnett): reduce this role to read-only
+  apiGroup: rbac.authorization.k8s.io
diff --git a/cmd/ela-autoscaler/BUILD.bazel b/cmd/ela-autoscaler/BUILD.bazel
@@ -0,0 +1,32 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library")
+
+go_library(
+    name = "go_default_library",
+    srcs = ["main.go"],
+    importpath = "github.com/google/elafros/cmd/ela-autoscaler",
+    visibility = ["//visibility:private"],
+    deps = [
+        "//pkg/autoscaler:go_default_library",
+        "//vendor/github.com/golang/glog:go_default_library",
+        "//vendor/github.com/gorilla/websocket:go_default_library",
+        "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
+        "//vendor/k8s.io/client-go/kubernetes:go_default_library",
+        "//vendor/k8s.io/client-go/rest:go_default_library",
+    ],
+)
+
+go_binary(
+    name = "ela-autoscaler",
+    embed = [":go_default_library"],
+    importpath = "github.com/google/elafros/cmd/ela-autoscaler",
+    pure = "on",
+    visibility = ["//visibility:public"],
+)
+
+load("@io_bazel_rules_docker//go:image.bzl", "go_image")
+
+go_image(
+    name = "image",
+    binary = ":ela-autoscaler",
+    visibility = ["//visibility:public"],
+)
diff --git a/cmd/ela-autoscaler/main.go b/cmd/ela-autoscaler/main.go
@@ -0,0 +1,190 @@
+/*
+Copyright 2018 Google Inc. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package main
+
+import (
+	"bytes"
+	"encoding/gob"
+	"net/http"
+	"os"
+	"time"
+
+	ela_autoscaler "github.com/google/elafros/pkg/autoscaler"
+
+	"github.com/golang/glog"
+	"github.com/gorilla/websocket"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+)
+
+const (
+	// The desired number of concurrent requests for each pod.  This
+	// is the primary knob for the fast autoscaler which will try
+	// achieve a 60-second average concurrency per pod of
+	// targetConcurrency.  Another process may tune targetConcurrency
+	// to best handle the resource requirements of the revision.
+	targetConcurrency = float64(1.0)
+
+	// A big enough buffer to handle 1000 pods sending stats every 1
+	// second while we do the autoscaling computation (a few hundred
+	// milliseconds).
+	statBufferSize = 1000
+
+	// Enough buffer to store scale requests generated every 2
+	// seconds while an http request is taking the full timeout of 5
+	// second.
+	scaleBufferSize = 10
+)
+
+var (
+	upgrader          = websocket.Upgrader{}
+	kubeClient        *kubernetes.Clientset
+	statChan          = make(chan ela_autoscaler.Stat, statBufferSize)
+	scaleChan         = make(chan int32, scaleBufferSize)
+	elaNamespace      string
+	elaDeployment     string
+	elaAutoscalerPort string
+)
+
+func init() {
+	elaNamespace = os.Getenv("ELA_NAMESPACE")
+	if elaNamespace == "" {
+		glog.Fatal("No ELA_NAMESPACE provided.")
+	}
+	glog.Infof("ELA_NAMESPACE=%v", elaNamespace)
+
+	elaDeployment = os.Getenv("ELA_DEPLOYMENT")
+	if elaDeployment == "" {
+		glog.Fatal("No ELA_DEPLOYMENT provided.")
+	}
+	glog.Infof("ELA_DEPLOYMENT=%v", elaDeployment)
+
+	elaAutoscalerPort = os.Getenv("ELA_AUTOSCALER_PORT")
+	if elaAutoscalerPort == "" {
+		glog.Fatal("No ELA_AUTOSCALER_PORT provided.")
+	}
+	glog.Infof("ELA_AUTOSCALER_PORT=%v", elaAutoscalerPort)
+}
+
+func autoscaler() {
+	glog.Infof("Target concurrency: %0.2f.", targetConcurrency)
+
+	a := ela_autoscaler.NewAutoscaler(targetConcurrency)
+	ticker := time.NewTicker(2 * time.Second)
+
+	for {
+		select {
+		case <-ticker.C:
+			scale, ok := a.Scale(time.Now())
+			if ok {
+				scaleChan <- scale
+			}
+		case s := <-statChan:
+			a.Record(s)
+		}
+	}
+}
+
+func scaleSerializer() {
+	for {
+		select {
+		case desiredPodCount := <-scaleChan:
+		FastForward:
+			// Fast forward to the most recent desired pod
+			// count since the http timeout (5 sec) is more
+			// than the autoscaling rate (2 sec) and there
+			// could be multiple pending scale requests.
+			for {
+				select {
+				case p := <-scaleChan:
+					glog.Warning("Scaling is not keeping up with autoscaling requests.")
+					desiredPodCount = p
+				default:
+					break FastForward
+				}
+			}
+			scaleTo(desiredPodCount)
+		}
+	}
+}
+
+func scaleTo(podCount int32) {
+	glog.Infof("Target scale is %v", podCount)
+	dc := kubeClient.ExtensionsV1beta1().Deployments(elaNamespace)
+	deployment, err := dc.Get(elaDeployment, metav1.GetOptions{})
+	if err != nil {
+		glog.Error("Error getting Deployment %q: %s", elaDeployment, err)
+		return
+	}
+	if *deployment.Spec.Replicas == podCount {
+		glog.Info("Already at scale.")
+		return
+	}
+	deployment.Spec.Replicas = &podCount
+	_, err = dc.Update(deployment)
+	if err != nil {
+		glog.Errorf("Error updating Deployment %q: %s", elaDeployment, err)
+	}
+	glog.Info("Successfully scaled.")
+}
+
+func handler(w http.ResponseWriter, r *http.Request) {
+	conn, err := upgrader.Upgrade(w, r, nil)
+	if err != nil {
+		glog.Error(err)
+		return
+	}
+	glog.Info("New metrics source online.")
+	for {
+		messageType, msg, err := conn.ReadMessage()
+		if err != nil {
+			glog.Info("Metrics source dropping off.")
+			return
+		}
+		if messageType != websocket.BinaryMessage {
+			glog.Error("Dropping non-binary message.")
+			continue
+		}
+		dec := gob.NewDecoder(bytes.NewBuffer(msg))
+		var stat ela_autoscaler.Stat
+		err = dec.Decode(&stat)
+		if err != nil {
+			glog.Error(err)
+			continue
+		}
+		statChan <- stat
+	}
+}
+
+func main() {
+	glog.Info("Autoscaler up")
+	config, err := rest.InClusterConfig()
+	if err != nil {
+		glog.Fatal(err)
+	}
+	config.Timeout = time.Duration(5 * time.Second)
+	kc, err := kubernetes.NewForConfig(config)
+	if err != nil {
+		glog.Fatal(err)
+	}
+	kubeClient = kc
+	go autoscaler()
+	go scaleSerializer()
+	http.HandleFunc("/", handler)
+	http.ListenAndServe(":"+elaAutoscalerPort, nil)
+}
diff --git a/cmd/ela-queue/BUILD.bazel b/cmd/ela-queue/BUILD.bazel
@@ -0,0 +1,33 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library")
+
+go_library(
+    name = "go_default_library",
+    srcs = ["main.go"],
+    importpath = "github.com/google/elafros/cmd/ela-queue",
+    visibility = ["//visibility:private"],
+    deps = [
+        "//pkg/autoscaler:go_default_library",
+        "//vendor/github.com/golang/glog:go_default_library",
+        "//vendor/github.com/gorilla/websocket:go_default_library",
+        "//vendor/k8s.io/api/core/v1:go_default_library",
+        "//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
+        "//vendor/k8s.io/client-go/kubernetes:go_default_library",
+        "//vendor/k8s.io/client-go/rest:go_default_library",
+    ],
+)
+
+go_binary(
+    name = "ela-queue",
+    embed = [":go_default_library"],
+    importpath = "github.com/google/elafros/cmd/ela-queue",
+    pure = "on",
+    visibility = ["//visibility:public"],
+)
+
+load("@io_bazel_rules_docker//go:image.bzl", "go_image")
+
+go_image(
+    name = "image",
+    binary = ":ela-queue",
+    visibility = ["//visibility:public"],
+)