Skip to content

Commit

Permalink
feature/add prometheus metrics (#179)
Browse files Browse the repository at this point in the history
* STORY-25143 - Add prometheus metrics to smokescreen

* STORY-25143 - Cleanup

* STORY-25143 - Fix tests to compare new metric labels

* STORY-25143 - Host prometheus endpoint on separate port

* STORY-25143 - Use value provided via command line flag

* STORY-25143 - Add prometheus timing metrics

* STORY-25143 - Fix nil map assignment and prometheus metric name sanitisation

* STORY-25143 - Cleanup comments

* STORY-25143 - Remove some repetition + add further unit testing

* STORY-25143 - Document new prometheus features in README + add port flag to prometheus config

* STORY-25143 - Make PR requested changes:
* Don't export metrics list
* Follow project sytlistic choices

* STORY-25143 - Rename only one receiver

* STORY-25143 - Add new `--expose-prometheus-metrics` flag to CLI to toggle exposing prometheus metrics

* Small cleanup of timer metrics

* Fix go module vendoring
  • Loading branch information
jmcconnell26 authored May 19, 2023
1 parent 7c83eff commit 65b5bdb
Show file tree
Hide file tree
Showing 427 changed files with 88,890 additions and 510 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ Here are the options you can give Smokescreen:
--deny-address value Add IP[:PORT] to list of blocked IPs. Repeatable.
--allow-address value Add IP[:PORT] to list of allowed IPs. Repeatable.
--egress-acl-file FILE Validate egress traffic against FILE
--expose-prometheus-metrics Exposes metrics via a Prometheus scrapable endpoint.
--prometheus-endpoint ENDPOINT Specify endpoint to host Prometheus metrics on. (default: "/metrics")
Requires `--expose-prometheus-metrics` to be set.
--prometheus-port PORT Specify port to host Prometheus metrics on. (default "9810")
Requires `--expose-prometheus-metrics` to be set.
--resolver-address ADDRESS Make DNS requests to ADDRESS (IP:port). Repeatable.
--statsd-address ADDRESS Send metrics to statsd at ADDRESS (IP:port). (default: "127.0.0.1:8200")
--tls-server-bundle-file FILE Authenticate to clients using key and certs from FILE
Expand Down
20 changes: 20 additions & 0 deletions cmd/smokescreen.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,20 @@ func NewConfiguration(args []string, logger *log.Logger) (*smokescreen.Config, e
Name: "egress-acl-file",
Usage: "Validate egress traffic against `FILE`",
},
cli.BoolFlag{
Name: "expose-prometheus-metrics",
Usage: "Expose metrics via prometheus.",
},
cli.StringFlag{
Name: "prometheus-endpoint",
Value: "/metrics",
Usage: "Expose prometheus metrics on `ENDPOINT`. Requires --expose-prometheus-metrics to be set. Defaults to \"/metrics\"",
},
cli.StringFlag{
Name: "prometheus-port",
Value: "9810",
Usage: "Expose prometheus metrics on `PORT`. Requires --expose-prometheus-metrics to be set. Defaults to \"9810\"",
},
cli.StringSliceFlag{
Name: "resolver-address",
Usage: "Make DNS requests to `ADDRESS` (IP:port). Repeatable.",
Expand Down Expand Up @@ -229,6 +243,12 @@ func NewConfiguration(args []string, logger *log.Logger) (*smokescreen.Config, e
}
}

if c.IsSet("expose-prometheus-metrics") {
if err := conf.SetupPrometheus(c.String("prometheus-endpoint"), c.String("prometheus-port")); err != nil {
return err
}
}

if c.IsSet("egress-acl-file") {
if err := conf.SetupEgressAcl(c.String("egress-acl-file")); err != nil {
return err
Expand Down
17 changes: 14 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,27 +1,38 @@
module github.com/stripe/smokescreen

go 1.17
go 1.18

require (
github.com/DataDog/datadog-go v4.5.1+incompatible
github.com/armon/go-proxyproto v0.0.0-20170620220930-48572f11356f
github.com/carlmjohnson/versioninfo v0.22.4
github.com/hashicorp/go-cleanhttp v0.0.0-20171218145408-d5fe4b57a186
github.com/patrickmn/go-cache v2.1.0+incompatible
github.com/prometheus/client_golang v1.13.0
github.com/rs/xid v1.2.1
github.com/sirupsen/logrus v1.9.0
github.com/stretchr/testify v1.8.0
github.com/stripe/goproxy v0.0.0-20220308202309-3f1dfba6d1a4
golang.org/x/net v0.7.0
gopkg.in/urfave/cli.v1 v1.20.0
gopkg.in/yaml.v2 v2.2.8
gopkg.in/yaml.v2 v2.4.0
)

require (
github.com/Microsoft/go-winio v0.4.17 // indirect
github.com/Microsoft/go-winio v0.6.1 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.1.2 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/golang/protobuf v1.5.2 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_model v0.2.0 // indirect
github.com/prometheus/common v0.37.0 // indirect
github.com/prometheus/procfs v0.8.0 // indirect
golang.org/x/mod v0.8.0 // indirect
golang.org/x/sys v0.5.0 // indirect
golang.org/x/text v0.7.0 // indirect
golang.org/x/tools v0.6.0 // indirect
google.golang.org/protobuf v1.28.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
478 changes: 461 additions & 17 deletions go.sum

Large diffs are not rendered by default.

11 changes: 10 additions & 1 deletion pkg/smokescreen/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ func (config *Config) SetupStatsdWithNamespace(addr, namespace string) error {
return nil
}

mc, err := metrics.NewMetricsClient(addr, namespace)
mc, err := metrics.NewStatsdMetricsClient(addr, namespace)
if err != nil {
return err
}
Expand All @@ -331,6 +331,15 @@ func (config *Config) SetupStatsd(addr string) error {
return config.SetupStatsdWithNamespace(addr, DefaultStatsdNamespace)
}

func (config *Config) SetupPrometheus(endpoint string, port string) error {
metricsClient, err := metrics.NewPrometheusMetricsClient(endpoint, port)
if err != nil {
return err
}
config.MetricsClient = metricsClient
return nil
}

func (config *Config) SetupEgressAcl(aclFile string) error {
if aclFile == "" {
config.EgressACL = nil
Expand Down
2 changes: 1 addition & 1 deletion pkg/smokescreen/config_loader.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ type yamlConfig struct {
Tls *yamlConfigTls
// Currently not configurable via YAML: RoleFromRequest, Log, DisabledAclPolicyActions

UnsafeAllowPrivateRanges bool `yaml:"unsafe_allow_private_ranges"`
UnsafeAllowPrivateRanges bool `yaml:"unsafe_allow_private_ranges"`
}

func (c *Config) UnmarshalYAML(unmarshal func(interface{}) error) error {
Expand Down
4 changes: 2 additions & 2 deletions pkg/smokescreen/conntrack/conn_tracker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,11 +101,11 @@ func TestConnSuccessRateTracker(t *testing.T) {
assert.InDelta(tc.expectedRate, stats.ConnSuccessRate, 0.01)
assert.Equal(tc.totalConns, stats.TotalConns)

v, err := mockMetricsClient.GetValues("cn.atpt.distinct_domains_success_rate")
v, err := mockMetricsClient.GetValues("cn.atpt.distinct_domains_success_rate", map[string]string{})
assert.NoError(err)
assert.Equal(tc.expectedRate, v[len(v)-1])

v, err = mockMetricsClient.GetValues("cn.atpt.distinct_domains")
v, err = mockMetricsClient.GetValues("cn.atpt.distinct_domains", map[string]string{})
assert.NoError(err)
assert.Equal(tc.totalConns, int(v[len(v)-1]))

Expand Down
5 changes: 2 additions & 3 deletions pkg/smokescreen/conntrack/instrumented_conn.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package conntrack

import (
"encoding/json"
"fmt"
"net"
"sync"
"sync/atomic"
Expand Down Expand Up @@ -93,8 +92,8 @@ func (ic *InstrumentedConn) Close() error {
end := time.Now()
duration := end.Sub(ic.Start).Seconds()

tags := []string{
fmt.Sprintf("role:%s", ic.Role),
tags := map[string]string{
"role": ic.Role,
}

ic.tracker.statsc.IncrWithTags("cn.close", tags, 1)
Expand Down
149 changes: 10 additions & 139 deletions pkg/smokescreen/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,9 @@ package metrics

import (
"errors"
"fmt"
"net"
"sync/atomic"
"syscall"
"time"

"github.com/DataDog/datadog-go/statsd"
)

// metrics contains all of the metric names contained within the smokescreen package.
Expand Down Expand Up @@ -43,143 +39,18 @@ var metrics = []string{
"resolver.errors_total",
}

// MetricsClient is a thin wrapper around statsd.ClientInterface. It is used to allow
// adding arbitrary tags to Smokescreen metrics.
//
// MetricsClient is not thread safe and should not be used concurrently.
type MetricsClient struct {
metricsTags map[string][]string
statsdClient statsd.ClientInterface
started atomic.Value
}

type MetricsClientInterface interface {
AddMetricTags(string, []string) error
AddMetricTags(string, map[string]string) error
Incr(string, float64) error
IncrWithTags(string, []string, float64) error
IncrWithTags(string, map[string]string, float64) error
Gauge(string, float64, float64) error
Histogram(string, float64, float64) error
HistogramWithTags(string, float64, []string, float64) error
HistogramWithTags(string, float64, map[string]string, float64) error
Timing(string, time.Duration, float64) error
TimingWithTags(string, time.Duration, float64, []string) error
StatsdClient() statsd.ClientInterface
TimingWithTags(string, time.Duration, map[string]string, float64) error
SetStarted()
}

// NewMetricsClient creates a new MetricsClient with the provided statsd address and
// namespace.
func NewMetricsClient(addr, namespace string) (*MetricsClient, error) {
c, err := statsd.New(addr)
if err != nil {
return nil, err
}
c.Namespace = namespace

// Populate the client's map to hold metric tags
metricsTags := make(map[string][]string)
for _, m := range metrics {
metricsTags[m] = []string{}
}

return &MetricsClient{
metricsTags: metricsTags,
statsdClient: c,
}, nil
}

// NewNoOpMetricsClient returns a MetricsClient with a no-op statsd client. This can
// be used when there's no statsd service available to smokescreen.
func NewNoOpMetricsClient() *MetricsClient {
// Populate the client's map to hold metric tags
metricsTags := make(map[string][]string)
for _, m := range metrics {
metricsTags[m] = []string{}
}

return &MetricsClient{
metricsTags: metricsTags,
statsdClient: &statsd.NoOpClient{},
}
}

// AddMetricTags associates the provided tags slice with a given metric. The metric must be present
// in the metrics slice.
//
// Once a metric has tags added via AddMetricTags, those tags will *always* be attached whenever
// that metric is emitted.
// For example, calling `AddMetricTags(foo, [bar])` will cause the `bar` tag to be added to
// *every* metric `foo` that is emitted for the lifetime of the MetricsClient.
//
// This function is not thread safe, and adding persitent tags should only be done while initializing
// the configuration and prior to running smokescreen.
func (mc *MetricsClient) AddMetricTags(metric string, mTags []string) error {
if mc.started.Load() != nil {
return fmt.Errorf("cannot add metrics tags after starting smokescreen")
}
if tags, ok := mc.metricsTags[metric]; ok {
mc.metricsTags[metric] = append(tags, mTags...)
return nil
}
return fmt.Errorf("unknown metric: %s", metric)
}

// GetMetricTags returns the slice of metrics associated with a given metric.
func (mc *MetricsClient) GetMetricTags(metric string) []string {
if tags, ok := mc.metricsTags[metric]; ok {
return tags
}
return nil
}

func (mc *MetricsClient) Incr(metric string, rate float64) error {
mTags := mc.GetMetricTags(metric)
return mc.statsdClient.Incr(metric, mTags, rate)
}

func (mc *MetricsClient) IncrWithTags(metric string, tags []string, rate float64) error {
mTags := mc.GetMetricTags(metric)
tags = append(tags, mTags...)
return mc.statsdClient.Incr(metric, tags, rate)
}

func (mc *MetricsClient) Gauge(metric string, value float64, rate float64) error {
mTags := mc.GetMetricTags(metric)
return mc.statsdClient.Gauge(metric, value, mTags, rate)
}

func (mc *MetricsClient) Histogram(metric string, value float64, rate float64) error {
mTags := mc.GetMetricTags(metric)
return mc.statsdClient.Histogram(metric, value, mTags, rate)
}

func (mc *MetricsClient) HistogramWithTags(metric string, value float64, tags []string, rate float64) error {
mTags := mc.GetMetricTags(metric)
tags = append(tags, mTags...)
return mc.statsdClient.Histogram(metric, value, tags, rate)
}

func (mc *MetricsClient) Timing(metric string, d time.Duration, rate float64) error {
mTags := mc.GetMetricTags(metric)
return mc.statsdClient.Timing(metric, d, mTags, rate)
}

func (mc *MetricsClient) TimingWithTags(metric string, d time.Duration, rate float64, tags []string) error {
mTags := mc.GetMetricTags(metric)
tags = append(tags, mTags...)
return mc.statsdClient.Timing(metric, d, tags, rate)
}

func (mc *MetricsClient) StatsdClient() statsd.ClientInterface {
return mc.statsdClient
}

func (mc *MetricsClient) SetStarted() {
mc.started.Store(true)
}

// MetricsClient implements MetricsClientInterface
var _ MetricsClientInterface = &MetricsClient{}

// reportConnError emits a detailed metric about a connection error, with a tag corresponding to
// the failure type. If err is not a net.Error, does nothing.
func ReportConnError(mc MetricsClientInterface, err error) {
Expand All @@ -188,17 +59,17 @@ func ReportConnError(mc MetricsClientInterface, err error) {
return
}

etag := "type:unknown"
errorTag := map[string]string{"type": "unknown"}
switch {
case e.Timeout():
etag = "type:timeout"
errorTag["type"] = "timeout"
case errors.Is(e, syscall.ECONNREFUSED):
etag = "type:refused"
errorTag["type"] = "refused"
case errors.Is(e, syscall.ECONNRESET):
etag = "type:reset"
errorTag["type"] = "reset"
case errors.Is(e, syscall.ECONNABORTED):
etag = "type:aborted"
errorTag["type"] = "aborted"
}

mc.IncrWithTags("cn.atpt.connect.err", []string{etag}, 1)
mc.IncrWithTags("cn.atpt.connect.err", errorTag, 1)
}
Loading

0 comments on commit 65b5bdb

Please sign in to comment.