Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] cluster: Secure cluster traffic via mutual TLS #1819

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions cluster/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,14 @@ package cluster

import (
"context"
"crypto/tls"
"crypto/x509"
"fmt"
"io/ioutil"
stdLog "log"
"math/rand"
"net"
"os"
"sort"
"strconv"
"strings"
Expand All @@ -27,6 +32,7 @@ import (
"github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level"
"github.com/hashicorp/memberlist"
tlsTransport "github.com/mxinden/memberlist-tls-transport"
"github.com/oklog/ulid"
"github.com/pkg/errors"

Expand Down Expand Up @@ -118,6 +124,9 @@ func Create(
tcpTimeout time.Duration,
probeTimeout time.Duration,
probeInterval time.Duration,
clusterCAFile *string,
clusterCertificate *string,
clusterCertificateKey *string,
) (*Peer, error) {
bindHost, bindPortStr, err := net.SplitHostPort(bindAddr)
if err != nil {
Expand Down Expand Up @@ -210,6 +219,46 @@ func Create(
p.setInitialFailed(resolvedPeers, bindAddr)
}

// TODO: Don't just dereference.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're handling all this differently in the node exporter tls stuff, which you should probably vendor/hack in here to keep things in line

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For sure. I will align this once prometheus/node_exporter#1277 is merged.

// Secure cluster traffic with mutual TLS when a CA bundle, a certificate and
// its private key are all configured. The flag pointers are nil-checked before
// being dereferenced, so a caller passing nil simply leaves TLS disabled.
// NOTE(review): a partial configuration (only some of the three values set)
// also leaves TLS disabled without any diagnostic — confirm this is intended.
if clusterCAFile != nil && clusterCertificate != nil && clusterCertificateKey != nil &&
	*clusterCAFile != "" && *clusterCertificate != "" && *clusterCertificateKey != "" {
	caCert, err := ioutil.ReadFile(*clusterCAFile)
	if err != nil {
		// Without the CA no peer certificate can be verified, so fail
		// instead of continuing with an empty pool.
		return nil, errors.Wrap(err, "read cluster CA file")
	}

	caCertPool := x509.NewCertPool()
	if !caCertPool.AppendCertsFromPEM(caCert) {
		// AppendCertsFromPEM reports false when no certificate was parsed.
		return nil, errors.Errorf("no PEM certificates found in cluster CA file %q", *clusterCAFile)
	}

	cert, err := tls.LoadX509KeyPair(*clusterCertificate, *clusterCertificateKey)
	if err != nil {
		return nil, errors.Wrap(err, "load cluster certificate and key")
	}

	// The same certificate and CA pool are used in both directions: for
	// accepting connections (ClientCAs) and for dialing peers (RootCAs).
	tlsConfig := &tls.Config{
		Certificates: []tls.Certificate{cert},        // certificate presented to the remote side
		ClientCAs:    caCertPool,                     // used to verify that a client certificate is signed by the CA
		ClientAuth:   tls.RequireAndVerifyClientCert, // a valid client certificate must be supplied during the handshake
		RootCAs:      caCertPool,                     // used to validate the server certificate
	}
	tlsConfig.BuildNameToCertificate()

	transportConfig := &tlsTransport.TLSTransportConfig{
		BindAddrs: []string{cfg.BindAddr},
		BindPort:  cfg.BindPort,
		Logger:    stdLog.New(os.Stderr, "", stdLog.LstdFlags),
		TLS:       tlsConfig,
	}

	transport, err := tlsTransport.NewTLSTransport(transportConfig, reg)
	if err != nil {
		// Report the failure to the caller like every other setup error
		// in this function rather than crashing the process.
		return nil, errors.Wrap(err, "create TLS transport")
	}

	cfg.Transport = transport
}

ml, err := memberlist.Create(cfg)
if err != nil {
return nil, errors.Wrap(err, "create memberlist")
Expand Down
28 changes: 17 additions & 11 deletions cmd/alertmanager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,17 +116,20 @@ func run() int {

clusterBindAddr = kingpin.Flag("cluster.listen-address", "Listen address for cluster.").
Default(defaultClusterAddr).String()
clusterAdvertiseAddr = kingpin.Flag("cluster.advertise-address", "Explicit address to advertise in cluster.").String()
peers = kingpin.Flag("cluster.peer", "Initial peers (may be repeated).").Strings()
peerTimeout = kingpin.Flag("cluster.peer-timeout", "Time to wait between peers to send notifications.").Default("15s").Duration()
gossipInterval = kingpin.Flag("cluster.gossip-interval", "Interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated across the cluster more quickly at the expense of increased bandwidth.").Default(cluster.DefaultGossipInterval.String()).Duration()
pushPullInterval = kingpin.Flag("cluster.pushpull-interval", "Interval for gossip state syncs. Setting this interval lower (more frequent) will increase convergence speeds across larger clusters at the expense of increased bandwidth usage.").Default(cluster.DefaultPushPullInterval.String()).Duration()
tcpTimeout = kingpin.Flag("cluster.tcp-timeout", "Timeout for establishing a stream connection with a remote node for a full state sync, and for stream read and write operations.").Default(cluster.DefaultTcpTimeout.String()).Duration()
probeTimeout = kingpin.Flag("cluster.probe-timeout", "Timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of RTT (round-trip time) on your network.").Default(cluster.DefaultProbeTimeout.String()).Duration()
probeInterval = kingpin.Flag("cluster.probe-interval", "Interval between random node probes. Setting this lower (more frequent) will cause the cluster to detect failed nodes more quickly at the expense of increased bandwidth usage.").Default(cluster.DefaultProbeInterval.String()).Duration()
settleTimeout = kingpin.Flag("cluster.settle-timeout", "Maximum time to wait for cluster connections to settle before evaluating notifications.").Default(cluster.DefaultPushPullInterval.String()).Duration()
reconnectInterval = kingpin.Flag("cluster.reconnect-interval", "Interval between attempting to reconnect to lost peers.").Default(cluster.DefaultReconnectInterval.String()).Duration()
peerReconnectTimeout = kingpin.Flag("cluster.reconnect-timeout", "Length of time to attempt to reconnect to a lost peer.").Default(cluster.DefaultReconnectTimeout.String()).Duration()
clusterAdvertiseAddr = kingpin.Flag("cluster.advertise-address", "Explicit address to advertise in cluster.").String()
peers = kingpin.Flag("cluster.peer", "Initial peers (may be repeated).").Strings()
peerTimeout = kingpin.Flag("cluster.peer-timeout", "Time to wait between peers to send notifications.").Default("15s").Duration()
gossipInterval = kingpin.Flag("cluster.gossip-interval", "Interval between sending gossip messages. By lowering this value (more frequent) gossip messages are propagated across the cluster more quickly at the expense of increased bandwidth.").Default(cluster.DefaultGossipInterval.String()).Duration()
pushPullInterval = kingpin.Flag("cluster.pushpull-interval", "Interval for gossip state syncs. Setting this interval lower (more frequent) will increase convergence speeds across larger clusters at the expense of increased bandwidth usage.").Default(cluster.DefaultPushPullInterval.String()).Duration()
tcpTimeout = kingpin.Flag("cluster.tcp-timeout", "Timeout for establishing a stream connection with a remote node for a full state sync, and for stream read and write operations.").Default(cluster.DefaultTcpTimeout.String()).Duration()
probeTimeout = kingpin.Flag("cluster.probe-timeout", "Timeout to wait for an ack from a probed node before assuming it is unhealthy. This should be set to 99-percentile of RTT (round-trip time) on your network.").Default(cluster.DefaultProbeTimeout.String()).Duration()
probeInterval = kingpin.Flag("cluster.probe-interval", "Interval between random node probes. Setting this lower (more frequent) will cause the cluster to detect failed nodes more quickly at the expense of increased bandwidth usage.").Default(cluster.DefaultProbeInterval.String()).Duration()
settleTimeout = kingpin.Flag("cluster.settle-timeout", "Maximum time to wait for cluster connections to settle before evaluating notifications.").Default(cluster.DefaultPushPullInterval.String()).Duration()
reconnectInterval = kingpin.Flag("cluster.reconnect-interval", "Interval between attempting to reconnect to lost peers.").Default(cluster.DefaultReconnectInterval.String()).Duration()
peerReconnectTimeout = kingpin.Flag("cluster.reconnect-timeout", "Length of time to attempt to reconnect to a lost peer.").Default(cluster.DefaultReconnectTimeout.String()).Duration()
clusterCAFile = kingpin.Flag("cluster.ca-file", "Certificate authority file to use to verify messages in cluster traffic").Default("").String()
clusterCertificate = kingpin.Flag("cluster.cert", "Certificate to use when sending cluster messages").Default("").String()
clusterCertificateKey = kingpin.Flag("cluster.cert-key", "Certificate key to use when sending cluster messages").Default("").String()
)

promlogflag.AddFlags(kingpin.CommandLine, &promlogConfig)
Expand Down Expand Up @@ -160,6 +163,9 @@ func run() int {
*tcpTimeout,
*probeTimeout,
*probeInterval,
clusterCAFile,
clusterCertificate,
clusterCertificateKey,
)
if err != nil {
level.Error(logger).Log("msg", "unable to initialize gossip mesh", "err", err)
Expand Down
42 changes: 39 additions & 3 deletions doc/design/secure-cluster-traffic.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,14 @@ Alertmanager instances use the gossip layer to:
As of today the communication between Alertmanager instances in a cluster is
sent in clear-text.

```
+--------------+ +--------------+
| | <-- best-effort communication (UDP) --> | |
| Alertmanager | | Alertmanager |
| | <-- reliable communication (TCP) --> | |
+--------------+ +--------------+
```


## Goal

Expand Down Expand Up @@ -55,9 +63,37 @@ layer, one can use mutual TLS to secure all communication. A proof-of-concept
implementation can be found here:
https://github.com/mxinden/memberlist-tls-transport.

The data gossiped between instances does not have a low-latency requirement that
TCP could not fulfill, same would apply for the relatively low data throughput
requirements of Alertmanager.
```golang
// Transport lists the operations a gossip transport has to provide, grouped
// below into best-effort and reliable communication.
type Transport interface {
// NOTE(review): presumably returns the IP and port to advertise to other
// nodes for the given address — confirm against the memberlist documentation.
FinalAdvertiseAddr(ip string, port int) (net.IP, int, error)

// Outgoing best-effort communication.
WriteTo(b []byte, addr string) (time.Time, error)

// Incoming best-effort communication.
PacketCh() <-chan *Packet

// Outgoing reliable connection setup.
DialTimeout(addr string, timeout time.Duration) (net.Conn, error)

// Incoming reliable connections.
StreamCh() <-chan net.Conn

// NOTE(review): presumably stops the transport and releases its listeners —
// confirm against the memberlist documentation.
Shutdown() error
}
```

```
+--------------+ +--------------+
| | <-- best-effort communication (TLS) --> | |
| Alertmanager | | Alertmanager |
| | <-- reliable communication (TLS) --> | |
+--------------+ +--------------+
```

The data gossiped between instances does **not** have a **low-latency**
requirement that TCP could not fulfill; the same applies to the relatively low
data throughput requirements of Alertmanager.

TCP connections could be kept alive beyond a single message to reduce latency as
well as handshake overhead costs. While this is feasible in a 3-instance
Expand Down
23 changes: 23 additions & 0 deletions examples/ha/tls/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Based on https://github.com/wolfeidau/golang-massl/
#
# Generates a CA and one certificate/key pair per node with cfssl, then runs
# `goreman start`.

# `start` names an action, not a file; declaring it phony keeps make from
# skipping it if a file called `start` ever exists.
.PHONY: start
start: certs/ca.pem certs/node1.pem certs/node1-key.pem certs/node2.pem certs/node2-key.pem
	goreman start

# `cd certs && ...` aborts the recipe if the directory cannot be entered,
# instead of running cfssl in the wrong working directory.
certs/ca.pem certs/ca-key.pem: certs/ca-csr.json
	cd certs && cfssl gencert -initca ca-csr.json | cfssljson -bare ca

certs/node1.pem certs/node1-key.pem: certs/ca-config.json certs/ca.pem certs/ca-key.pem certs/node1-csr.json
	cd certs && cfssl gencert \
		-ca=ca.pem \
		-ca-key=ca-key.pem \
		-config=ca-config.json \
		-hostname=localhost,127.0.0.1 \
		-profile=massl node1-csr.json | cfssljson -bare node1

certs/node2.pem certs/node2-key.pem: certs/ca-config.json certs/ca.pem certs/ca-key.pem certs/node2-csr.json
	cd certs && cfssl gencert \
		-ca=ca.pem \
		-ca-key=ca-key.pem \
		-config=ca-config.json \
		-hostname=localhost,127.0.0.1 \
		-profile=massl node2-csr.json | cfssljson -bare node2
3 changes: 3 additions & 0 deletions examples/ha/tls/Procfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
a1: ./../../../alertmanager --log.level=debug --storage.path=$TMPDIR/a1 --web.listen-address=:9093 --cluster.listen-address=127.0.0.1:8001 --config.file=../alertmanager.yml --cluster.ca-file=certs/ca.pem --cluster.cert=certs/node1.pem --cluster.cert-key=certs/node1-key.pem
a2: ./../../../alertmanager --log.level=debug --storage.path=$TMPDIR/a2 --web.listen-address=:9094 --cluster.listen-address=127.0.0.1:8002 --cluster.peer=127.0.0.1:8001 --config.file=../alertmanager.yml --cluster.ca-file=certs/ca.pem --cluster.cert=certs/node2.pem --cluster.cert-key=certs/node2-key.pem
wh: go run ../../webhook/echo.go
13 changes: 13 additions & 0 deletions examples/ha/tls/certs/ca-config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"signing": {
"default": {
"expiry": "8760h"
},
"profiles": {
"massl": {
"usages": ["signing", "key encipherment", "server auth", "client auth"],
"expiry": "8760h"
}
}
}
}
16 changes: 16 additions & 0 deletions examples/ha/tls/certs/ca-csr.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"CN": "massl",
"key": {
"algo": "rsa",
"size": 2048
},
"names": [
{
"C": "AU",
"L": "Melbourne",
"O": "massl",
"OU": "VIC",
"ST": "Victoria"
}
]
}
16 changes: 16 additions & 0 deletions examples/ha/tls/certs/node1-csr.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"CN": "system:server",
"key": {
"algo": "rsa",
"size": 2048
},
"names": [
{
"C": "AU",
"L": "Melbourne",
"O": "system:node1",
"OU": "massl",
"ST": "Victoria"
}
]
}
16 changes: 16 additions & 0 deletions examples/ha/tls/certs/node2-csr.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"CN": "system:server",
"key": {
"algo": "rsa",
"size": 2048
},
"names": [
{
"C": "AU",
"L": "Melbourne",
"O": "system:node2",
"OU": "massl",
"ST": "Victoria"
}
]
}
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@ require (
github.com/go-openapi/swag v0.17.2
github.com/go-openapi/validate v0.17.2
github.com/gogo/protobuf v1.2.1
github.com/hashicorp/go-sockaddr v1.0.0
github.com/hashicorp/go-sockaddr v1.0.1
github.com/hashicorp/memberlist v0.1.3
github.com/jessevdk/go-flags v0.0.0-20180331124232-1c38ed7ad0cc
github.com/kr/pretty v0.1.0 // indirect
github.com/kylelemons/godebug v0.0.0-20160406211939-eadb3ce320cb
github.com/matttproud/golang_protobuf_extensions v1.0.1
github.com/mxinden/memberlist-tls-transport v0.0.0-20190308101202-f3e0859bbdcf
github.com/oklog/oklog v0.0.0-20170918173356-f857583a70c3
github.com/oklog/ulid v0.0.0-20170117200651-66bb6560562f
github.com/pkg/errors v0.8.0
Expand Down
Loading