Skip to content

Commit

Permalink
feat: add prometheus metrics & endpoint
Browse files Browse the repository at this point in the history
  • Loading branch information
maigl committed Jan 20, 2023
1 parent 10b4e08 commit be147b0
Show file tree
Hide file tree
Showing 315 changed files with 36,352 additions and 1,667 deletions.
65 changes: 65 additions & 0 deletions apiserver/controllers/controllers.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,12 @@ package controllers

import (
"encoding/json"
"fmt"
"io/ioutil"
"log"
"net/http"
"os"
"strings"

"garm/apiserver/params"
"garm/auth"
Expand All @@ -29,9 +32,15 @@ import (

"github.com/gorilla/websocket"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
)

func NewAPIController(r *runner.Runner, auth *auth.Authenticator, hub *wsWriter.Hub) (*APIController, error) {
id, err := r.GetControllerID()
if err != nil {
return nil, errors.Wrap(err, "getting controller ID")
}

return &APIController{
r: r,
auth: auth,
Expand All @@ -40,6 +49,7 @@ func NewAPIController(r *runner.Runner, auth *auth.Authenticator, hub *wsWriter.
ReadBufferSize: 1024,
WriteBufferSize: 16384,
},
id: id.String(),
}, nil
}

Expand All @@ -48,6 +58,8 @@ type APIController struct {
auth *auth.Authenticator
hub *wsWriter.Hub
upgrader websocket.Upgrader
// holds this controller's id
id string
}

func handleError(w http.ResponseWriter, err error) {
Expand Down Expand Up @@ -82,6 +94,22 @@ func handleError(w http.ResponseWriter, err error) {
json.NewEncoder(w).Encode(apiErr)
}

// GetControllerInfo returns the values that identify this very garm
// instance: the hostname of the machine it runs on and the controller ID
// stored on the APIController. This is very useful for debugging and
// monitoring purposes.
//
// The hostname is neither fixed nor in our control, so it is looked up on
// every call instead of being cached. If that lookup fails the error is
// logged and the hostname is returned empty; the controller ID is still
// returned, because it is already known and is enough on its own to tell
// instances apart.
func (a *APIController) GetControllerInfo() (hostname, controllerId string) {
	hostname, err := os.Hostname()
	if err != nil {
		log.Printf("error getting hostname: %q", err)
		// Only the hostname is unknown; do not discard the controller ID.
		return "", a.id
	}

	return hostname, a.id
}

func (a *APIController) authenticateHook(body []byte, headers http.Header) error {
// signature := headers.Get("X-Hub-Signature-256")
hookType := headers.Get("X-Github-Hook-Installation-Target-Type")
Expand All @@ -99,6 +127,21 @@ func (a *APIController) authenticateHook(body []byte, headers http.Header) error
return nil
}

// webhooksReceived is the prometheus counter for the total number of
// webhooks received. It is incremented from the workflow job handler, at a
// point where the webhook is not yet authenticated and we don't know if it's
// meant for us or not, so rejected webhooks are counted as well.
//
// Labels:
//   - valid:         "true" or "false"
//   - reason:        why the webhook was rejected ("owner_unknown",
//     "signature_invalid", "unknown"), empty for valid webhooks
//   - hostname:      hostname of this garm instance
//   - controller_id: controller ID of this garm instance
var webhooksReceived = prometheus.NewCounterVec(prometheus.CounterOpts{
	Name: "garm_webhooks_received",
	Help: "The total number of webhooks received",
}, []string{"valid", "reason", "hostname", "controller_id"})

// init registers the webhooksReceived counter with prometheus when the
// package is loaded. A failed registration is logged and otherwise ignored,
// so it never prevents the API server from starting.
func init() {
	if regErr := prometheus.Register(webhooksReceived); regErr != nil {
		log.Printf("error registering prometheus metric: %q", regErr)
	}
}

func (a *APIController) handleWorkflowJobEvent(w http.ResponseWriter, r *http.Request) {
defer r.Body.Close()
body, err := ioutil.ReadAll(r.Body)
Expand All @@ -110,14 +153,23 @@ func (a *APIController) handleWorkflowJobEvent(w http.ResponseWriter, r *http.Re
signature := r.Header.Get("X-Hub-Signature-256")
hookType := r.Header.Get("X-Github-Hook-Installation-Target-Type")

hostname, controllerId := a.GetControllerInfo()

if err := a.r.DispatchWorkflowJob(hookType, signature, body); err != nil {
if errors.Is(err, gErrors.ErrNotFound) {
webhooksReceived.WithLabelValues("false", "owner_unknown", hostname, controllerId).Inc()
log.Printf("got not found error from DispatchWorkflowJob. webhook not meant for us?: %q", err)
return
} else if strings.Contains(err.Error(), "signature") { // TODO: check error type
webhooksReceived.WithLabelValues("false", "signature_invalid", hostname, controllerId).Inc()
} else {
webhooksReceived.WithLabelValues("false", "unknown", hostname, controllerId).Inc()
}

handleError(w, err)
return
}
webhooksReceived.WithLabelValues("true", "", hostname, controllerId).Inc()
}

func (a *APIController) CatchAll(w http.ResponseWriter, r *http.Request) {
Expand Down Expand Up @@ -180,6 +232,19 @@ func (a *APIController) NotFoundHandler(w http.ResponseWriter, r *http.Request)
json.NewEncoder(w).Encode(apiErr)
}

// MetricsTokenHandler issues a JWT that can be used to read metrics and
// returns it to the caller as a JSON document of the form {"token": "<jwt>"}.
//
// Any failure while creating the token is reported through handleError. The
// token is encoded with encoding/json rather than interpolated into the
// response verbatim, so the body is valid JSON whatever characters the token
// contains.
func (a *APIController) MetricsTokenHandler(w http.ResponseWriter, r *http.Request) {
	token, err := a.auth.GetJWTMetricsToken(r.Context())
	if err != nil {
		handleError(w, err)
		return
	}

	// Encode before touching the response so that an encoding problem can
	// still be reported with a proper error status.
	encoded, err := json.Marshal(token)
	if err != nil {
		handleError(w, err)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusOK)
	// The status line is already sent at this point, so a failed write can
	// only be logged.
	if _, err := fmt.Fprintf(w, `{"token": %s}`, encoded); err != nil {
		log.Printf("error writing metrics token response: %q", err)
	}
}

// LoginHandler returns a jwt token
func (a *APIController) LoginHandler(w http.ResponseWriter, r *http.Request) {
var loginInfo runnerParams.PasswordLoginParams
Expand Down
99 changes: 99 additions & 0 deletions apiserver/controllers/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
package controllers

import (
"log"

"garm/auth"
"github.com/prometheus/client_golang/prometheus"
)

// GarmCollector exposes the state of garm's runner instances as prometheus
// metrics. It provides the Describe and Collect methods expected from a
// prometheus collector.
type GarmCollector struct {
	// instanceMetric describes the per-instance metric sent by Describe and
	// used to build every sample in CollectInstanceMetric.
	instanceMetric *prometheus.Desc
	// apiController gives access to the runner (for listing instances and
	// pools) and to the hostname/controller ID labels.
	apiController *APIController
}

// NewGarmCollector builds a GarmCollector backed by the given APIController.
// The collector carries a single metric description, "garm_runner_status",
// whose variable labels identify the instance, its states, its pool and the
// garm instance reporting it.
func NewGarmCollector(a *APIController) *GarmCollector {
	labels := []string{"name", "status", "runner_status", "pool", "pool_type", "hostname", "controller_id"}
	desc := prometheus.NewDesc("garm_runner_status", "Status of the runner", labels, nil)

	return &GarmCollector{
		instanceMetric: desc,
		apiController:  a,
	}
}

// Describe sends the description of the single metric this collector
// produces (the instance metric) on ch.
func (c *GarmCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- c.instanceMetric
}

// Collect sends the current metric samples on ch. All of the work is
// delegated to CollectInstanceMetric.
func (c *GarmCollector) Collect(ch chan<- prometheus.Metric) {
	c.CollectInstanceMetric(ch)
}

// CollectInstanceMetric sends one gauge sample (value 1) per runner instance
// on ch. Each sample is labelled with the instance name, its status and
// runner status, the name and type of the entity owning its pool, and the
// hostname and controller ID of this garm instance.
//
// If the instances cannot be listed nothing is sent. If the pools cannot be
// listed the samples are still sent, with empty pool labels.
func (c *GarmCollector) CollectInstanceMetric(ch chan<- prometheus.Metric) {
	// poolInfo is the owner of a pool, as exposed in the metric labels.
	type poolInfo struct {
		Name string
		Type string
	}

	adminCtx := auth.GetAdminContext()
	src := c.apiController.r

	instances, err := src.ListAllInstances(adminCtx)
	if err != nil {
		log.Printf("cannot collect metrics, listing instances: %s", err)
		return
	}

	pools, err := src.ListAllPools(adminCtx)
	if err != nil {
		// Best effort: without pools the lookup table below stays empty and
		// the pool labels are reported as empty strings.
		log.Printf("listing pools: %s", err)
	}

	// Index the pool owners by pool ID. Enterprise takes precedence over
	// organization, and repository is the fallback.
	owners := make(map[string]poolInfo, len(pools))
	for _, pool := range pools {
		var owner poolInfo
		switch {
		case pool.EnterpriseName != "":
			owner = poolInfo{Name: pool.EnterpriseName, Type: "enterprise"}
		case pool.OrgName != "":
			owner = poolInfo{Name: pool.OrgName, Type: "organization"}
		default:
			owner = poolInfo{Name: pool.RepoName, Type: "repository"}
		}
		owners[pool.ID] = owner
	}

	hostname, controllerID := c.apiController.GetControllerInfo()

	for _, instance := range instances {
		// An unknown pool ID yields the zero poolInfo, i.e. empty labels.
		owner := owners[instance.PoolID]

		sample, err := prometheus.NewConstMetric(
			c.instanceMetric,
			prometheus.GaugeValue,
			1,
			instance.Name,
			string(instance.Status),
			string(instance.RunnerStatus),
			owner.Name,
			owner.Type,
			hostname,
			controllerID,
		)
		if err != nil {
			log.Printf("cannot collect metrics, creating metric: %s", err)
			continue
		}
		ch <- sample
	}
}
4 changes: 4 additions & 0 deletions apiserver/routers/routers.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ func NewAPIRouter(han *controllers.APIController, logWriter io.Writer, authMiddl
apiRouter.Use(initMiddleware.Middleware)
apiRouter.Use(authMiddleware.Middleware)

// Metrics Token
apiRouter.Handle("/metrics-token/", http.HandlerFunc(han.MetricsTokenHandler)).Methods("GET", "OPTIONS")
apiRouter.Handle("/metrics-token", http.HandlerFunc(han.MetricsTokenHandler)).Methods("GET", "OPTIONS")

///////////
// Pools //
///////////
Expand Down
31 changes: 31 additions & 0 deletions auth/auth.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,37 @@ func (a *Authenticator) GetJWTToken(ctx context.Context) (string, error) {
return tokenString, nil
}

// GetJWTMetricsToken creates a signed JWT that grants read access to the
// metrics. The token does not belong to a real user and nothing is stored in
// the db for it: the user is the fixed string "metrics", IsAdmin is false and
// ReadMetrics is true. It is signed with HS256 using the configured secret.
func (a *Authenticator) GetJWTMetricsToken(ctx context.Context) (string, error) {
	tokenID, err := util.GetRandomString(16)
	if err != nil {
		return "", errors.Wrap(err, "generating random string")
	}

	// TODO: currently this is the same TTL as the normal Token
	// maybe we should make this configurable
	// it's usually pretty nasty if the monitoring fails because the token expired
	expiresAt := time.Now().Add(a.cfg.TimeToLive.Duration()).Unix()

	standard := jwt.StandardClaims{
		ExpiresAt: expiresAt,
		// TODO: make this configurable
		Issuer: "garm",
	}
	claims := JWTClaims{
		StandardClaims: standard,
		UserID:         "metrics",
		TokenID:        tokenID,
		IsAdmin:        false,
		ReadMetrics:    true,
	}

	signed, err := jwt.NewWithClaims(jwt.SigningMethodHS256, claims).SignedString([]byte(a.cfg.Secret))
	if err != nil {
		return "", errors.Wrap(err, "fetching token string")
	}

	return signed, nil
}

func (a *Authenticator) InitController(ctx context.Context, param params.NewUserParams) (params.User, error) {
_, err := a.store.ControllerInfo()
if err != nil {
Expand Down
9 changes: 5 additions & 4 deletions auth/jwt.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,11 @@ import (

// JWTClaims holds JWT claims
type JWTClaims struct {
UserID string `json:"user"`
TokenID string `json:"token_id"`
FullName string `json:"full_name"`
IsAdmin bool `json:"is_admin"`
UserID string `json:"user"`
TokenID string `json:"token_id"`
FullName string `json:"full_name"`
IsAdmin bool `json:"is_admin"`
ReadMetrics bool `json:"read_metrics"`
jwt.StandardClaims
}

Expand Down
64 changes: 64 additions & 0 deletions auth/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package auth

import (
"fmt"
"garm/config"
"net/http"
"strings"

"github.com/golang-jwt/jwt"
)

// MetricsMiddleware is an HTTP middleware that only lets a request through
// when it carries a valid JWT with the ReadMetrics claim set.
type MetricsMiddleware struct {
	// cfg is the JWT configuration; its Secret is the key used to verify
	// token signatures.
	cfg config.JWTAuth
}

// NewMetricsMiddleware returns a MetricsMiddleware that verifies tokens
// against the given JWT configuration.
func NewMetricsMiddleware(cfg config.JWTAuth) *MetricsMiddleware {
	middleware := MetricsMiddleware{cfg: cfg}
	return &middleware
}

// Middleware wraps next so that it is only reached by requests allowed to
// read metrics. A request is let through when its "authorization" header
// consists of exactly two space separated parts, the second part is a JWT
// signed with an HMAC method using the configured secret, the token is valid
// and its ReadMetrics claim is true. Every other request is answered with
// invalidAuthResponse and next is not called.
func (m *MetricsMiddleware) Middleware(next http.Handler) http.Handler {
	// keyFunc hands the verification key to the JWT parser, but only for
	// tokens signed with an HMAC method.
	keyFunc := func(token *jwt.Token) (interface{}, error) {
		if _, isHMAC := token.Method.(*jwt.SigningMethodHMAC); !isHMAC {
			return nil, fmt.Errorf("invalid signing method")
		}
		return []byte(m.cfg.Secret), nil
	}

	guard := func(w http.ResponseWriter, r *http.Request) {
		header := r.Header.Get("authorization")
		if header == "" {
			invalidAuthResponse(w)
			return
		}

		// Expect "<scheme> <token>".
		parts := strings.Split(header, " ")
		if len(parts) != 2 {
			invalidAuthResponse(w)
			return
		}

		claims := &JWTClaims{}
		token, err := jwt.ParseWithClaims(parts[1], claims, keyFunc)
		if err != nil || !token.Valid {
			invalidAuthResponse(w)
			return
		}

		// The signature has been verified, so the claims are fully trusted.
		if !claims.ReadMetrics {
			invalidAuthResponse(w)
			return
		}

		next.ServeHTTP(w, r.WithContext(r.Context()))
	}

	return http.HandlerFunc(guard)
}
17 changes: 17 additions & 0 deletions cmd/garm-cli/client/organizations.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,3 +159,20 @@ func (c *Client) ListOrgInstances(orgID string) ([]params.Instance, error) {
}
return response, nil
}

// CreateMetricsToken requests a metrics token from the
// "/api/v1/metrics-token" endpoint of the configured garm server and returns
// the value of the "token" field of the JSON response. Request and API
// errors are reported through the client's handleError.
func (c *Client) CreateMetricsToken() (string, error) {
	// tokenResponse mirrors the JSON document returned by the endpoint.
	type tokenResponse struct {
		Token string `json:"token"`
	}

	endpoint := fmt.Sprintf("%s/api/v1/metrics-token", c.Config.BaseURL)

	var result tokenResponse
	resp, err := c.client.R().SetResult(&result).Get(endpoint)
	if handled := c.handleError(err, resp); handled != nil {
		return "", handled
	}

	return result.Token, nil
}
Loading

0 comments on commit be147b0

Please sign in to comment.