-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Cheap and cheerful autoscaler (#229)
Adding a simple autoscaler to collect metrics directly from the pods and scale based on concurrent requests. More in //pkg/autoscaler/doc.go. * Bring back the queue. * Wire queue between nginx and app. * Autoscaler and queue that share a stat type. * Initialize queue with autoscaler service before starting stat reporter. * Connect stat sink. * Add gorilla websocket to deps. * Build the queue with bazel and pass diget into controller through commandline parameter. * Setup env variables and service account for queue to find the autoscaler. * Create autoscaler service and deployment and connect queue. * Reconnect to autoscaler and send pod name. * Calculate 6 and 60 second QPS and scaling action. * Replace 6 and 60 with parameters. * Do actual scaling. Tune parameters. * Scale deployment in the background. * Request a full CPU for each ela pod. * Calculate QPS with floats. * Scale on concurrent requests instead of QPS. * Provide desired concurrency per process in revision spec. * Add test for queue-proxy. Fails becuase of extra autoscaler deployment. * Fix unit tests by checking ela deployment separately from autoscaler deployment. * Add autoscaler deployment env variable test. * Move core autoscaler logic into lib for unit testing. * Refactoring autoscaler for unit testing. * Autoscaler unit tests. * Autoscaler comments. * Only accept target concurrency of 1+. * Limit scale up ratio to 10x. * Fix git rebase mistakes. * Move autoscaler main to cmd. * Move queue sidecar to cmd. * Remove TargetConcurrencyPerProcess revision parameter. * Replace log with glog. * Use defaults for websocket upgrader. * Add service account and binding for autoscaler. * Update deps. * Fix incorrect usage of glog. * Const parameters. * Fix targetConcurrency typo. * Fix typo. * Inject autoscaler name to remove hardcoded value. * Pull out queue parameters into constants. * Add liscense headers to queue and autoscaler. * Cpu requests in constants. * Plumb autoscaler port through env from single constant. * Comment for autoscaler types. * Fix log statement formatting. * Add back ela-revision service account. * Report time from pod with concurrency stat. * Send only one scale request at a time with a 5 second timeout. * Move autoscaler docs to package documentation. * Include pod name in stat key. * Comment about waiting for autoscaler IP. * Add queue->autoscaler connect sleep comment. * Parse losthost url once. * Use singleton proxy in queue. * Fold autoscaler/types package into autoscaler. * Add Record and Scale function comments. * Move environment variable access into init. * Remove enableQueue nginx template parameter. * Comments and copyright headers. * Copyright header.
- Loading branch information
1 parent
820c2a3
commit bf78448
Showing
51 changed files
with
5,947 additions
and
77 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library") | ||
|
||
go_library( | ||
name = "go_default_library", | ||
srcs = ["main.go"], | ||
importpath = "github.com/google/elafros/cmd/ela-autoscaler", | ||
visibility = ["//visibility:private"], | ||
deps = [ | ||
"//pkg/autoscaler:go_default_library", | ||
"//vendor/github.com/golang/glog:go_default_library", | ||
"//vendor/github.com/gorilla/websocket:go_default_library", | ||
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", | ||
"//vendor/k8s.io/client-go/kubernetes:go_default_library", | ||
"//vendor/k8s.io/client-go/rest:go_default_library", | ||
], | ||
) | ||
|
||
go_binary( | ||
name = "ela-autoscaler", | ||
embed = [":go_default_library"], | ||
importpath = "github.com/google/elafros/cmd/ela-autoscaler", | ||
pure = "on", | ||
visibility = ["//visibility:public"], | ||
) | ||
|
||
load("@io_bazel_rules_docker//go:image.bzl", "go_image") | ||
|
||
go_image( | ||
name = "image", | ||
binary = ":ela-autoscaler", | ||
visibility = ["//visibility:public"], | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,190 @@ | ||
/* | ||
Copyright 2018 Google Inc. All Rights Reserved. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package main | ||
|
||
import ( | ||
"bytes" | ||
"encoding/gob" | ||
"net/http" | ||
"os" | ||
"time" | ||
|
||
ela_autoscaler "github.com/google/elafros/pkg/autoscaler" | ||
|
||
"github.com/golang/glog" | ||
"github.com/gorilla/websocket" | ||
|
||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
"k8s.io/client-go/kubernetes" | ||
"k8s.io/client-go/rest" | ||
) | ||
|
||
const ( | ||
// The desired number of concurrent requests for each pod. This | ||
// is the primary knob for the fast autoscaler which will try | ||
// achieve a 60-second average concurrency per pod of | ||
// targetConcurrency. Another process may tune targetConcurrency | ||
// to best handle the resource requirements of the revision. | ||
targetConcurrency = float64(1.0) | ||
|
||
// A big enough buffer to handle 1000 pods sending stats every 1 | ||
// second while we do the autoscaling computation (a few hundred | ||
// milliseconds). | ||
statBufferSize = 1000 | ||
|
||
// Enough buffer to store scale requests generated every 2 | ||
// seconds while an http request is taking the full timeout of 5 | ||
// second. | ||
scaleBufferSize = 10 | ||
) | ||
|
||
var ( | ||
upgrader = websocket.Upgrader{} | ||
kubeClient *kubernetes.Clientset | ||
statChan = make(chan ela_autoscaler.Stat, statBufferSize) | ||
scaleChan = make(chan int32, scaleBufferSize) | ||
elaNamespace string | ||
elaDeployment string | ||
elaAutoscalerPort string | ||
) | ||
|
||
func init() { | ||
elaNamespace = os.Getenv("ELA_NAMESPACE") | ||
if elaNamespace == "" { | ||
glog.Fatal("No ELA_NAMESPACE provided.") | ||
} | ||
glog.Infof("ELA_NAMESPACE=%v", elaNamespace) | ||
|
||
elaDeployment = os.Getenv("ELA_DEPLOYMENT") | ||
if elaDeployment == "" { | ||
glog.Fatal("No ELA_DEPLOYMENT provided.") | ||
} | ||
glog.Infof("ELA_DEPLOYMENT=%v", elaDeployment) | ||
|
||
elaAutoscalerPort = os.Getenv("ELA_AUTOSCALER_PORT") | ||
if elaAutoscalerPort == "" { | ||
glog.Fatal("No ELA_AUTOSCALER_PORT provided.") | ||
} | ||
glog.Infof("ELA_AUTOSCALER_PORT=%v", elaAutoscalerPort) | ||
} | ||
|
||
func autoscaler() { | ||
glog.Infof("Target concurrency: %0.2f.", targetConcurrency) | ||
|
||
a := ela_autoscaler.NewAutoscaler(targetConcurrency) | ||
ticker := time.NewTicker(2 * time.Second) | ||
|
||
for { | ||
select { | ||
case <-ticker.C: | ||
scale, ok := a.Scale(time.Now()) | ||
if ok { | ||
scaleChan <- scale | ||
} | ||
case s := <-statChan: | ||
a.Record(s) | ||
} | ||
} | ||
} | ||
|
||
func scaleSerializer() { | ||
for { | ||
select { | ||
case desiredPodCount := <-scaleChan: | ||
FastForward: | ||
// Fast forward to the most recent desired pod | ||
// count since the http timeout (5 sec) is more | ||
// than the autoscaling rate (2 sec) and there | ||
// could be multiple pending scale requests. | ||
for { | ||
select { | ||
case p := <-scaleChan: | ||
glog.Warning("Scaling is not keeping up with autoscaling requests.") | ||
desiredPodCount = p | ||
default: | ||
break FastForward | ||
} | ||
} | ||
scaleTo(desiredPodCount) | ||
} | ||
} | ||
} | ||
|
||
func scaleTo(podCount int32) { | ||
glog.Infof("Target scale is %v", podCount) | ||
dc := kubeClient.ExtensionsV1beta1().Deployments(elaNamespace) | ||
deployment, err := dc.Get(elaDeployment, metav1.GetOptions{}) | ||
if err != nil { | ||
glog.Error("Error getting Deployment %q: %s", elaDeployment, err) | ||
return | ||
} | ||
if *deployment.Spec.Replicas == podCount { | ||
glog.Info("Already at scale.") | ||
return | ||
} | ||
deployment.Spec.Replicas = &podCount | ||
_, err = dc.Update(deployment) | ||
if err != nil { | ||
glog.Errorf("Error updating Deployment %q: %s", elaDeployment, err) | ||
} | ||
glog.Info("Successfully scaled.") | ||
} | ||
|
||
func handler(w http.ResponseWriter, r *http.Request) { | ||
conn, err := upgrader.Upgrade(w, r, nil) | ||
if err != nil { | ||
glog.Error(err) | ||
return | ||
} | ||
glog.Info("New metrics source online.") | ||
for { | ||
messageType, msg, err := conn.ReadMessage() | ||
if err != nil { | ||
glog.Info("Metrics source dropping off.") | ||
return | ||
} | ||
if messageType != websocket.BinaryMessage { | ||
glog.Error("Dropping non-binary message.") | ||
continue | ||
} | ||
dec := gob.NewDecoder(bytes.NewBuffer(msg)) | ||
var stat ela_autoscaler.Stat | ||
err = dec.Decode(&stat) | ||
if err != nil { | ||
glog.Error(err) | ||
continue | ||
} | ||
statChan <- stat | ||
} | ||
} | ||
|
||
func main() { | ||
glog.Info("Autoscaler up") | ||
config, err := rest.InClusterConfig() | ||
if err != nil { | ||
glog.Fatal(err) | ||
} | ||
config.Timeout = time.Duration(5 * time.Second) | ||
kc, err := kubernetes.NewForConfig(config) | ||
if err != nil { | ||
glog.Fatal(err) | ||
} | ||
kubeClient = kc | ||
go autoscaler() | ||
go scaleSerializer() | ||
http.HandleFunc("/", handler) | ||
http.ListenAndServe(":"+elaAutoscalerPort, nil) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library") | ||
|
||
go_library( | ||
name = "go_default_library", | ||
srcs = ["main.go"], | ||
importpath = "github.com/google/elafros/cmd/ela-queue", | ||
visibility = ["//visibility:private"], | ||
deps = [ | ||
"//pkg/autoscaler:go_default_library", | ||
"//vendor/github.com/golang/glog:go_default_library", | ||
"//vendor/github.com/gorilla/websocket:go_default_library", | ||
"//vendor/k8s.io/api/core/v1:go_default_library", | ||
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library", | ||
"//vendor/k8s.io/client-go/kubernetes:go_default_library", | ||
"//vendor/k8s.io/client-go/rest:go_default_library", | ||
], | ||
) | ||
|
||
go_binary( | ||
name = "ela-queue", | ||
embed = [":go_default_library"], | ||
importpath = "github.com/google/elafros/cmd/ela-queue", | ||
pure = "on", | ||
visibility = ["//visibility:public"], | ||
) | ||
|
||
load("@io_bazel_rules_docker//go:image.bzl", "go_image") | ||
|
||
go_image( | ||
name = "image", | ||
binary = ":ela-queue", | ||
visibility = ["//visibility:public"], | ||
) |
Oops, something went wrong.