Skip to content

Commit

Permalink
Enhance metrics endpoint (#195)
Browse files Browse the repository at this point in the history
Machine controller manager metrics endpoint has now been updated to incorporate a more versatile set of metrics.
  • Loading branch information
fsniper authored and prashanth26 committed Jan 4, 2019
1 parent b63dbde commit 5554b50
Show file tree
Hide file tree
Showing 7 changed files with 619 additions and 18 deletions.
308 changes: 290 additions & 18 deletions pkg/controller/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,41 +18,313 @@ limitations under the License.
package controller

import (
v1alpha1 "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
"github.com/gardener/machine-controller-manager/pkg/metrics"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"strconv"
)

var (
machineCountDesc = prometheus.NewDesc("mcm_machine_items_total", "Count of machines currently managed by the mcm.", nil, nil)
// Describe implements the prometheus.Collector interface.
//
// It sends the descriptor of every const metric that Collect emits directly on
// its channel: the machine, machine set and machine deployment counts and the
// machine controller frozen status. The frozen-status descriptor is included
// because the Collector contract expects Describe to cover every descriptor
// that Collect may send.
func (c *controller) Describe(ch chan<- *prometheus.Desc) {
	ch <- metrics.MachineCountDesc
	ch <- metrics.MachineSetCountDesc
	ch <- metrics.MachineDeploymentCountDesc
	ch <- metrics.MachineControllerFrozenDesc
}

// ScrapeFailedCounter is a Prometheus metric, which counts errors during metrics collection.
ScrapeFailedCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "mcm_scrape_failure_total",
Help: "Total count of scrape failures.",
}, []string{"kind"})
)
// CollectMachineDeploymentMetrics collects machineDeployment related metrics.
//
// It lists the MachineDeployments in c.namespace, sends their count on ch as a
// const gauge, and then sets the per-deployment spec and status gauges of the
// metrics package. If listing the deployments or building the count metric
// fails, the scrape-failure counter is incremented with kind
// "Machinedeployment-count" and nothing further is collected.
func (c *controller) CollectMachineDeploymentMetrics(ch chan<- prometheus.Metric) {
machineDeploymentList, err := c.machineDeploymentLister.MachineDeployments(c.namespace).List(labels.Everything())
if err != nil {
metrics.ScrapeFailedCounter.With(prometheus.Labels{"kind": "Machinedeployment-count"}).Inc()
return
}
metric, err := prometheus.NewConstMetric(metrics.MachineDeploymentCountDesc, prometheus.GaugeValue, float64(len(machineDeploymentList)))
if err != nil {
metrics.ScrapeFailedCounter.With(prometheus.Labels{"kind": "Machinedeployment-count"}).Inc()
return
}
ch <- metric

// The per-deployment gauges below are set on the metric vectors of the
// metrics package; they are not sent on ch.
for _, machineDeployment := range machineDeploymentList {

mdMeta := machineDeployment.ObjectMeta
mdSpec := machineDeployment.Spec

// Info-style gauge: the value is always 1, the details are carried in the labels.
metrics.MachineDeploymentInfo.With(prometheus.Labels{
"name": mdMeta.Name,
"namespace": mdMeta.Namespace,
"createdAt": strconv.FormatInt(mdMeta.GetCreationTimestamp().Time.Unix(), 10),
"spec_strategy_type": string(mdSpec.Strategy.Type),
}).Set(float64(1))

// paused stays 0 unless the deployment spec is paused.
var paused float64
if mdSpec.Paused {
paused = 1
}
metrics.MachineDeploymentInfoSpecPaused.With(prometheus.Labels{
"name": mdMeta.Name,
"namespace": mdMeta.Namespace}).Set(paused)

metrics.MachineDeploymentInfoSpecReplicas.With(prometheus.Labels{
"name": mdMeta.Name,
"namespace": mdMeta.Namespace}).Set(float64(mdSpec.Replicas))

metrics.MachineDeploymentInfoSpecMinReadySeconds.With(prometheus.Labels{
"name": mdMeta.Name,
"namespace": mdMeta.Namespace}).Set(float64(mdSpec.MinReadySeconds))

// The rolling-update limits are only exported for the RollingUpdate strategy.
// NOTE(review): RollingUpdate, MaxSurge and MaxUnavailable are used without a
// nil check here — confirm they are always populated for this strategy type,
// and confirm what IntValue() yields for percentage values.
if mdSpec.Strategy.Type == v1alpha1.RollingUpdateMachineDeploymentStrategyType {
metrics.MachineDeploymentInfoSpecRollingUpdateMaxSurge.With(prometheus.Labels{
"name": mdMeta.Name,
"namespace": mdMeta.Namespace}).Set(float64(mdSpec.Strategy.RollingUpdate.MaxSurge.IntValue()))
metrics.MachineDeploymentInfoSpecRollingUpdateMaxUnavailable.With(prometheus.Labels{
"name": mdMeta.Name,
"namespace": mdMeta.Namespace}).Set(float64(mdSpec.Strategy.RollingUpdate.MaxUnavailable.IntValue()))
}
// The optional spec fields below are only exported when they are set (non-nil).
if mdSpec.RevisionHistoryLimit != nil {
metrics.MachineDeploymentInfoSpecRevisionHistoryLimit.With(prometheus.Labels{
"name": mdMeta.Name,
"namespace": mdMeta.Namespace}).Set(float64(int64(*mdSpec.RevisionHistoryLimit)))
}
if mdSpec.ProgressDeadlineSeconds != nil {
metrics.MachineDeploymentInfoSpecProgressDeadlineSeconds.With(prometheus.Labels{
"name": mdMeta.Name,
"namespace": mdMeta.Namespace}).Set(float64(int64(*mdSpec.ProgressDeadlineSeconds)))
}
if mdSpec.RollbackTo != nil {
metrics.MachineDeploymentInfoSpecRollbackToRevision.With(prometheus.Labels{
"name": mdMeta.Name,
"namespace": mdMeta.Namespace}).Set(float64(mdSpec.RollbackTo.Revision))
}

// NOTE(review): the next two lines look like removed-side lines of the rendered
// diff (the old init function) interleaved with the new code — confirm against
// the actual file before relying on this listing.
func init() {
prometheus.MustRegister(ScrapeFailedCounter)
for _, condition := range machineDeployment.Status.Conditions {
// Condition status encoding: True=1, False=0, Unknown=2. Any other value
// leaves status at its zero value and is therefore reported as 0.
var status float64
switch condition.Status {
case v1alpha1.ConditionTrue:
status = 1
case v1alpha1.ConditionFalse:
status = 0
case v1alpha1.ConditionUnknown:
status = 2
}

metrics.MachineDeploymentStatusCondition.With(prometheus.Labels{
"name": mdMeta.Name,
"namespace": mdMeta.Namespace,
"condition": string(condition.Type),
}).Set(status)
}

// The replica and collision gauges all share the same name/namespace label set.
statusLabels := prometheus.Labels{
"name": mdMeta.Name,
"namespace": mdMeta.Namespace,
}
metrics.MachineDeploymentStatusAvailableReplicas.With(statusLabels).Set(float64(machineDeployment.Status.AvailableReplicas))
metrics.MachineDeploymentStatusUnavailableReplicas.With(statusLabels).Set(float64(machineDeployment.Status.UnavailableReplicas))
metrics.MachineDeploymentStatusReadyReplicas.With(statusLabels).Set(float64(machineDeployment.Status.ReadyReplicas))
metrics.MachineDeploymentStatusUpdatedReplicas.With(statusLabels).Set(float64(machineDeployment.Status.UpdatedReplicas))
metrics.MachineDeploymentStatusReplicas.With(statusLabels).Set(float64(machineDeployment.Status.Replicas))

// CollisionCount is a pointer; it is only exported when set.
if machineDeployment.Status.CollisionCount != nil {
metrics.MachineDeploymentStatusCollisionCount.With(statusLabels).Set(float64(*machineDeployment.Status.CollisionCount))
}

// One series with value 1 per failed machine; the machine details are carried
// in the labels.
if machineDeployment.Status.FailedMachines != nil {
for _, failedMachine := range machineDeployment.Status.FailedMachines {
metrics.MachineDeploymentStatusFailedMachines.With(prometheus.Labels{
"name": mdMeta.Name,
"namespace": mdMeta.Namespace,
"failedMachine_name": failedMachine.Name,
"failedMachine_provider_id": failedMachine.ProviderID,
"failedMachine_last_operation_state": string(failedMachine.LastOperation.State),
"failedMachine_last_operation_machine_operation_type": string(failedMachine.LastOperation.Type),
"failedMachine_owner_ref": failedMachine.OwnerRef}).Set(float64(1))

}
}

}
}

// Describe is method required to implement the prometheus.Collect interface.
func (c *controller) Describe(ch chan<- *prometheus.Desc) {
ch <- machineCountDesc
// CollectMachineSetMetrics collects machineSet related metrics.
//
// It lists the MachineSets in c.namespace, sends their count on ch as a const
// gauge, and then sets the per-machine-set spec and status gauges of the
// metrics package. If listing the machine sets or building the count metric
// fails, the scrape-failure counter is incremented with kind
// "Machineset-count" and nothing further is collected.
func (c *controller) CollectMachineSetMetrics(ch chan<- prometheus.Metric) {
	machineSetList, err := c.machineSetLister.MachineSets(c.namespace).List(labels.Everything())
	if err != nil {
		metrics.ScrapeFailedCounter.With(prometheus.Labels{"kind": "Machineset-count"}).Inc()
		return
	}
	metric, err := prometheus.NewConstMetric(metrics.MachineSetCountDesc, prometheus.GaugeValue, float64(len(machineSetList)))
	if err != nil {
		metrics.ScrapeFailedCounter.With(prometheus.Labels{"kind": "Machineset-count"}).Inc()
		return
	}
	ch <- metric

	// The per-machine-set gauges below are set on the metric vectors of the
	// metrics package; they are not sent on ch.
	for _, machineSet := range machineSetList {
		msMeta := machineSet.ObjectMeta
		msSpec := machineSet.Spec

		// Info-style gauge: the value is always 1, the details are carried in the labels.
		metrics.MachineSetInfo.With(prometheus.Labels{
			"name":                         msMeta.Name,
			"namespace":                    msMeta.Namespace,
			"createdAt":                    strconv.FormatInt(msMeta.GetCreationTimestamp().Time.Unix(), 10),
			"spec_machine_class_api_group": msSpec.MachineClass.APIGroup,
			"spec_machine_class_kind":      msSpec.MachineClass.Kind,
			"spec_machine_class_name":      msSpec.MachineClass.Name,
		}).Set(float64(1))

		// All remaining spec and replica gauges are keyed by name and namespace only.
		msLabels := prometheus.Labels{
			"name":      msMeta.Name,
			"namespace": msMeta.Namespace,
		}

		metrics.MachineSetInfoSpecReplicas.With(msLabels).Set(float64(msSpec.Replicas))
		metrics.MachineSetInfoSpecMinReadySeconds.With(msLabels).Set(float64(msSpec.MinReadySeconds))

		for _, condition := range machineSet.Status.Conditions {
			// Condition status encoding: True=1, False=0, Unknown=2. Any other
			// value leaves status at its zero value and is reported as 0.
			var status float64
			switch condition.Status {
			case v1alpha1.ConditionTrue:
				status = 1
			case v1alpha1.ConditionFalse:
				status = 0
			case v1alpha1.ConditionUnknown:
				status = 2
			}

			metrics.MachineSetStatusCondition.With(prometheus.Labels{
				"name":      msMeta.Name,
				"namespace": msMeta.Namespace,
				"condition": string(condition.Type),
			}).Set(status)
		}

		metrics.MachineSetStatusAvailableReplicas.With(msLabels).Set(float64(machineSet.Status.AvailableReplicas))
		metrics.MachineSetStatusFullyLabelledReplicas.With(msLabels).Set(float64(machineSet.Status.FullyLabeledReplicas))
		metrics.MachineSetStatusReadyReplicas.With(msLabels).Set(float64(machineSet.Status.ReadyReplicas))
		// The total replica gauge reports Status.Replicas; it previously repeated
		// ReadyReplicas, which made it indistinguishable from the ready gauge.
		metrics.MachineSetStatusReplicas.With(msLabels).Set(float64(machineSet.Status.Replicas))

		// FailedMachines is a pointer to a slice; skip the export when it is unset.
		if machineSet.Status.FailedMachines == nil {
			continue
		}
		// One series with value 1 per failed machine; the machine details are
		// carried in the labels.
		for _, failedMachine := range *machineSet.Status.FailedMachines {
			metrics.MachineSetStatusFailedMachines.With(prometheus.Labels{
				"name":                                                msMeta.Name,
				"namespace":                                           msMeta.Namespace,
				"failedMachine_name":                                  failedMachine.Name,
				"failedMachine_provider_id":                           failedMachine.ProviderID,
				"failedMachine_last_operation_state":                  string(failedMachine.LastOperation.State),
				"failedMachine_last_operation_machine_operation_type": string(failedMachine.LastOperation.Type),
				"failedMachine_owner_ref":                             failedMachine.OwnerRef,
			}).Set(float64(1))
		}
	}
}

// Collect is method required to implement the prometheus.Collect interface.
func (c *controller) Collect(ch chan<- prometheus.Metric) {
// CollectMachineMetrics collects Machine related metrics.
//
// It lists the Machines in c.namespace, sets the per-machine info, condition
// and phase gauges of the metrics package, and finally sends the machine count
// on ch as a const gauge. If listing the machines or building the count metric
// fails, the scrape-failure counter is incremented and the function returns.
func (c *controller) CollectMachineMetrics(ch chan<- prometheus.Metric) {
// Collect the count of machines managed by the mcm.
machineList, err := c.machineLister.Machines(c.namespace).List(labels.Everything())
if err != nil {
// NOTE(review): the unqualified ScrapeFailedCounter line below looks like a
// removed-side line of the rendered diff; the metrics.ScrapeFailedCounter
// line after it appears to be its replacement — confirm against the actual file.
ScrapeFailedCounter.With(prometheus.Labels{"kind": "machine-count"}).Inc()
metrics.ScrapeFailedCounter.With(prometheus.Labels{"kind": "Machine-count"}).Inc()
return
}

// The per-machine gauges below are set on the metric vectors of the metrics
// package; they are not sent on ch.
for _, machine := range machineList {
mMeta := machine.ObjectMeta
mSpec := machine.Spec

// Info-style gauge: the value is always 1, the details are carried in the labels.
metrics.MachineInfo.With(prometheus.Labels{
"name": mMeta.Name,
"namespace": mMeta.Namespace,
"createdAt": strconv.FormatInt(mMeta.GetCreationTimestamp().Time.Unix(), 10),
"spec_provider_id": mSpec.ProviderID,
"spec_class_api_group": mSpec.Class.APIGroup,
"spec_class_kind": mSpec.Class.Kind,
"spec_class_name": mSpec.Class.Name}).Set(float64(1))

for _, condition := range machine.Status.Conditions {
// Condition status encoding: True=1, False=0, Unknown=2. Any other value
// leaves status at its zero value and is therefore reported as 0. The
// constants compared against here come from the core v1 package.
var status float64
switch condition.Status {
case v1.ConditionTrue:
status = 1
case v1.ConditionFalse:
status = 0
case v1.ConditionUnknown:
status = 2
}

metrics.MachineStatusCondition.With(prometheus.Labels{
"name": mMeta.Name,
"namespace": mMeta.Namespace,
"condition": string(condition.Type),
}).Set(status)
}

// Phase encoding: Pending=-2, Available=-1, Running=0, Terminating=1,
// Unknown=2, Failed=3.
// NOTE(review): a phase matching none of the cases keeps the zero value and is
// exported as 0, the same value as Running — confirm this is intended.
var phase float64
switch machine.Status.CurrentStatus.Phase {
case v1alpha1.MachinePending:
phase = -2
case v1alpha1.MachineAvailable:
phase = -1
case v1alpha1.MachineRunning:
phase = 0
case v1alpha1.MachineTerminating:
phase = 1
case v1alpha1.MachineUnknown:
phase = 2
case v1alpha1.MachineFailed:
phase = 3
}
metrics.MachineCSPhase.With(prometheus.Labels{
"name": mMeta.Name,
"namespace": mMeta.Namespace,
}).Set(phase)

}

// The machine count is the only metric this function sends on ch.
metric, err := prometheus.NewConstMetric(metrics.MachineCountDesc, prometheus.GaugeValue, float64(len(machineList)))
if err != nil {
metrics.ScrapeFailedCounter.With(prometheus.Labels{"kind": "Machine-count"}).Inc()
return
}
// NOTE(review): the NewConstMetric call on machineCountDesc below looks like a
// removed-side line of the rendered diff (it re-declares metric and err) —
// confirm against the actual file.
metric, err := prometheus.NewConstMetric(machineCountDesc, prometheus.GaugeValue, float64(len(machineList)))
ch <- metric

}

// CollectMachineControllerFrozenStatus collects the frozen status of the
// machine controller.
//
// It sends a const gauge on ch whose value is 1 when
// c.safetyOptions.MachineControllerFrozen is true and 0 otherwise. If the
// metric cannot be built, the scrape-failure counter is incremented and
// nothing is sent.
func (c *controller) CollectMachineControllerFrozenStatus(ch chan<- prometheus.Metric) {
// frozenStatus stays 0 unless the controller is frozen.
var frozenStatus float64
if c.safetyOptions.MachineControllerFrozen {
frozenStatus = 1
}
metric, err := prometheus.NewConstMetric(metrics.MachineControllerFrozenDesc, prometheus.GaugeValue, frozenStatus)
if err != nil {
// NOTE(review): the unqualified ScrapeFailedCounter line below looks like a
// removed-side line of the rendered diff — confirm against the actual file.
// NOTE(review): the failure is recorded under kind "Machine-count", the same
// label the machine count collector uses — confirm whether a dedicated kind
// was intended for the frozen status.
ScrapeFailedCounter.With(prometheus.Labels{"kind": "machine-count"}).Inc()
metrics.ScrapeFailedCounter.With(prometheus.Labels{"kind": "Machine-count"}).Inc()
return
}
ch <- metric
}

// Collect implements the prometheus.Collector interface by running each of the
// controller's metric collection steps against the same channel.
func (c *controller) Collect(ch chan<- prometheus.Metric) {
	// Steps run sequentially in this order: machines, machine sets,
	// machine deployments, controller frozen status.
	steps := []func(chan<- prometheus.Metric){
		c.CollectMachineMetrics,
		c.CollectMachineSetMetrics,
		c.CollectMachineDeploymentMetrics,
		c.CollectMachineControllerFrozenStatus,
	}
	for _, step := range steps {
		step(ch)
	}
}
14 changes: 14 additions & 0 deletions pkg/driver/driver_alicloud.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@ import (
"strings"

"github.com/golang/glog"
"github.com/prometheus/client_golang/prometheus"

v1alpha1 "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
"github.com/gardener/machine-controller-manager/pkg/metrics"
corev1 "k8s.io/api/core/v1"

"github.com/aliyun/alibaba-cloud-sdk-go/sdk/requests"
Expand Down Expand Up @@ -89,8 +91,10 @@ func (c *AlicloudDriver) Create() (string, string, error) {

response, err := client.RunInstances(request)
if err != nil {
metrics.APIFailedRequestCount.With(prometheus.Labels{"provider": "alicloud", "service": "ecs"}).Inc()
return "", "", err
}
metrics.APIRequestCount.With(prometheus.Labels{"provider": "alicloud", "service": "ecs"}).Inc()

instanceID := response.InstanceIdSets.InstanceIdSet[0]
machineID := c.encodeMachineID(c.AlicloudMachineClass.Spec.Region, instanceID)
Expand Down Expand Up @@ -135,6 +139,10 @@ func (c *AlicloudDriver) stopInstance(client *ecs.Client, machineID string) erro
request.ForceStop = requests.NewBoolean(true)

_, err := client.StopInstance(request)
if err != nil {
metrics.APIFailedRequestCount.With(prometheus.Labels{"provider": "alicloud", "service": "ecs"}).Inc()
}
metrics.APIRequestCount.With(prometheus.Labels{"provider": "alicloud", "service": "ecs"}).Inc()

return err
}
Expand All @@ -145,6 +153,10 @@ func (c *AlicloudDriver) deleteInstance(client *ecs.Client, machineID string) er
request.Force = requests.NewBoolean(true)

_, err := client.DeleteInstance(request)
if err != nil {
metrics.APIFailedRequestCount.With(prometheus.Labels{"provider": "alicloud", "service": "ecs"}).Inc()
}
metrics.APIRequestCount.With(prometheus.Labels{"provider": "alicloud", "service": "ecs"}).Inc()
return err
}

Expand Down Expand Up @@ -192,8 +204,10 @@ func (c *AlicloudDriver) getVMDetails(machineID string) ([]ecs.Instance, error)

response, err := client.DescribeInstances(request)
if err != nil {
metrics.APIFailedRequestCount.With(prometheus.Labels{"provider": "alicloud", "service": "ecs"}).Inc()
return nil, err
}
metrics.APIRequestCount.With(prometheus.Labels{"provider": "alicloud", "service": "ecs"}).Inc()

return response.Instances.Instance, nil
}
Expand Down
Loading

0 comments on commit 5554b50

Please sign in to comment.