Skip to content

Commit

Permalink
operator: Add support for built-in-cert-rotation for all internal lok…
Browse files Browse the repository at this point in the history
…istack encryption (grafana#7064)
  • Loading branch information
periklis authored and changhyuni committed Nov 8, 2022
1 parent a447bee commit 27b8d8d
Show file tree
Hide file tree
Showing 76 changed files with 5,754 additions and 1,724 deletions.
34 changes: 33 additions & 1 deletion operator/apis/config/v1/projectconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,30 @@ import (
cfg "sigs.k8s.io/controller-runtime/pkg/config/v1alpha1"
)

// BuiltInCertManagement configures the operator's built-in facility that generates
// and rotates the TLS client and serving certificates of every LokiStack service and
// internal client. The lokistack-gateway is not covered by this facility.
type BuiltInCertManagement struct {
	// Enabled is the flag that switches the built-in certificate management
	// feature gate on or off.
	Enabled bool `json:"enabled,omitempty"`

	// CACertValidity is the total duration for which the CA certificate is valid.
	CACertValidity string `json:"caValidity,omitempty"`

	// CACertRefresh is the portion of the CA certificate validity after which a
	// rotation should happen. It can be set to at most 80% of the CA certificate
	// validity, or to exactly the CA certificate validity. Use the latter only
	// when rotation should happen solely on expiry.
	CACertRefresh string `json:"caRefresh,omitempty"`

	// CertValidity is the total duration for which all LokiStack certificates
	// are valid.
	CertValidity string `json:"certValidity,omitempty"`

	// CertRefresh is the portion of the certificate validity after which a
	// rotation should happen. It can be set to at most 80% of the certificate
	// validity, or to exactly the certificate validity. Use the latter only
	// when rotation should happen solely on expiry.
	// The refresh applies to all LokiStack certificates at once.
	CertRefresh string `json:"certRefresh,omitempty"`
}

// OpenShiftFeatureGates is the supported set of all operator features gates on OpenShift.
type OpenShiftFeatureGates struct {
// ServingCertsService enables OpenShift service-ca annotations on Services
// ServingCertsService enables OpenShift service-ca annotations on the lokistack-gateway service only
// to use the in-platform CA and generate a TLS cert/key pair per service for
// in-cluster data-in-transit encryption.
// More details: https://docs.openshift.com/container-platform/latest/security/certificate_types_descriptions/service-ca-certificates.html
Expand Down Expand Up @@ -54,6 +75,17 @@ type FeatureGates struct {
// suffix `-ca-bundle`, e.g. `lokistack-dev-ca-bundle` and the following data:
// - `service-ca.crt`: The CA signing the service certificate in `tls.crt`.
GRPCEncryption bool `json:"grpcEncryption,omitempty"`
// BuiltInCertManagement enables the built-in facility for generating and rotating
// TLS client and serving certificates for all LokiStack services and internal clients except
// for the lokistack-gateway. In detail, all internal Loki HTTP and GRPC communication is lifted
// to require mTLS. For the lokistack-gateway you need to either use the `ServingCertsService`
// on OpenShift or provide a secret with the following data:
// - `tls.crt`: The TLS server side certificate.
// - `tls.key`: The TLS key for server-side encryption.
// In addition each service requires a configmap named as the LokiStack CR with the
// suffix `-ca-bundle`, e.g. `lokistack-dev-ca-bundle` and the following data:
// - `service-ca.crt`: The CA signing the service certificate in `tls.crt`.
BuiltInCertManagement BuiltInCertManagement `json:"builtInCertManagement,omitempty"`

// LokiStackGateway enables reconciling the reverse-proxy lokistack-gateway
// component for multi-tenant authentication/authorization traffic control
Expand Down
16 changes: 16 additions & 0 deletions operator/apis/config/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions operator/apis/loki/v1/lokistack_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -761,6 +761,8 @@ const (
ReasonInvalidTenantsConfiguration LokiStackConditionReason = "InvalidTenantsConfiguration"
// ReasonMissingGatewayOpenShiftBaseDomain when the reconciler cannot lookup the OpenShift DNS base domain.
ReasonMissingGatewayOpenShiftBaseDomain LokiStackConditionReason = "MissingGatewayOpenShiftBaseDomain"
// ReasonFailedCertificateRotation when the reconciler cannot rotate any of the required TLS certificates.
ReasonFailedCertificateRotation LokiStackConditionReason = "FailedCertificateRotation"
)

// PodStatusMap defines the type for mapping pod status to pod name.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,16 @@ data:
#
httpEncryption: true
grpcEncryption: true
builtInCertManagement:
enabled: true
# CA certificate validity: 5 years
caValidity: 43830h
# CA certificate refresh at 80% of validity
caRefresh: 35064h
# Target certificate validity: 90d
certValidity: 2160h
# Target certificate refresh at 80% of validity
certRefresh: 1728h
#
# Component feature gates
#
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -984,6 +984,7 @@ spec:
- endpoints
- nodes
- pods
- secrets
- serviceaccounts
- services
verbs:
Expand Down
9 changes: 6 additions & 3 deletions operator/cmd/loki-broker/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,13 @@ func (c *config) registerFlags(f *flag.FlagSet) {
f.StringVar(&c.Namespace, "namespace", "", "Namespace to deploy to")
f.StringVar(&c.Image, "image", manifests.DefaultContainerImage, "The Loki image pull spec loation.")
// Feature flags
c.featureFlags = configv1.FeatureGates{}
c.featureFlags.OpenShift = configv1.OpenShiftFeatureGates{}
f.BoolVar(&c.featureFlags.OpenShift.ServingCertsService, "with-serving-certs-service", false, "Enable usage of serving certs service on OpenShift.")
f.BoolVar(&c.featureFlags.ServiceMonitors, "with-service-monitors", false, "Enable service monitors for all LokiStack components.")
f.BoolVar(&c.featureFlags.OpenShift.ServingCertsService, "with-serving-certs-service", false, "Enable usage of serving certs service on OpenShift.")
f.BoolVar(&c.featureFlags.BuiltInCertManagement.Enabled, "with-builtin-cert-management", false, "Enable usage built-in cert generation and rotation.")
f.StringVar(&c.featureFlags.BuiltInCertManagement.CACertValidity, "ca-cert-validity", "8760h", "CA Certificate validity duration.")
f.StringVar(&c.featureFlags.BuiltInCertManagement.CACertRefresh, "ca-cert-refresh", "7008h", "CA Certificate refresh time.")
f.StringVar(&c.featureFlags.BuiltInCertManagement.CertValidity, "target-cert-validity", "2160h", "Target Certificate validity duration.")
f.StringVar(&c.featureFlags.BuiltInCertManagement.CertRefresh, "target-cert-refresh", "1728h", "Target Certificate refresh time.")
f.BoolVar(&c.featureFlags.HTTPEncryption, "with-http-tls-services", false, "Enables TLS for all LokiStack GRPC services.")
f.BoolVar(&c.featureFlags.GRPCEncryption, "with-grpc-tls-services", false, "Enables TLS for all LokiStack HTTP services.")
f.BoolVar(&c.featureFlags.ServiceMonitorTLSEndpoints, "with-service-monitor-tls-endpoints", false, "Enable TLS endpoint for service monitors.")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ spec:
- name: manager
env:
- name: RELATED_IMAGE_LOKI
value: docker.io/grafana/loki:main-ec0bf70
value: docker.io/grafana/loki:k120-26d2989
- name: RELATED_IMAGE_GATEWAY
value: quay.io/observatorium/api:latest
10 changes: 10 additions & 0 deletions operator/config/overlays/openshift/controller_manager_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,16 @@ featureGates:
#
httpEncryption: true
grpcEncryption: true
builtInCertManagement:
enabled: true
# CA certificate validity: 5 years
caValidity: 43830h
# CA certificate refresh at 80% of validity
caRefresh: 35064h
# Target certificate validity: 90d
certValidity: 2160h
# Target certificate refresh at 80% of validity
certRefresh: 1728h
#
# Component feature gates
#
Expand Down
1 change: 1 addition & 0 deletions operator/config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ rules:
- endpoints
- nodes
- pods
- secrets
- serviceaccounts
- services
verbs:
Expand Down
113 changes: 113 additions & 0 deletions operator/controllers/loki/certrotation_controller.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
package controllers

import (
"context"
"errors"
"time"

"github.com/go-logr/logr"
configv1 "github.com/grafana/loki/operator/apis/config/v1"
lokiv1 "github.com/grafana/loki/operator/apis/loki/v1"
"github.com/grafana/loki/operator/controllers/loki/internal/lokistack"
"github.com/grafana/loki/operator/controllers/loki/internal/management/state"
"github.com/grafana/loki/operator/internal/certrotation"
"github.com/grafana/loki/operator/internal/external/k8s"
"github.com/grafana/loki/operator/internal/handlers"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// CertRotationReconciler keeps the `loki.grafana.com/certRotationRequiredAt` annotation
// up to date on every LokiStack object that is associated with owned signer, client or
// serving certificate secrets and the CA bundle configmap.
type CertRotationReconciler struct {
	// Client is the Kubernetes client used for all cluster reads and writes.
	client.Client

	// Log receives the progress and error messages emitted by Reconcile.
	Log logr.Logger

	// Scheme is the runtime scheme handed to the reconciler at construction.
	Scheme *runtime.Scheme

	// FeatureGates carries the operator feature gates; its BuiltInCertManagement
	// section supplies the rotation settings parsed on every reconciliation.
	FeatureGates configv1.FeatureGates
}

// Reconcile checks whether any certificate of the requested LokiStack has expired and,
// if so, annotates the LokiStack so that a certificate rotation is triggered.
//
// Flow:
//   - Unmanaged LokiStack resources are skipped and never requeued.
//   - The rotation settings are parsed from the BuiltInCertManagement feature gate;
//     a parse failure is returned as an error.
//   - When all certificates are still valid, the request is requeued after the
//     interval computed by expiryRetryAfter from the target certificate refresh.
//   - When a certificate expired, the LokiStack is annotated and the request is
//     requeued after the same interval.
//   - Any other failure is returned together with an explicit requeue request.
//
// For more details on the returned Result, see:
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/reconcile
func (r *CertRotationReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	retryNow := ctrl.Result{Requeue: true}

	isManaged, err := state.IsManaged(ctx, req, r.Client)
	if err != nil {
		return retryNow, err
	}
	if !isManaged {
		r.Log.Info("Skipping reconciliation for unmanaged LokiStack resource", "name", req.String())
		// Stop requeueing for unmanaged LokiStack custom resources
		return ctrl.Result{}, nil
	}

	stackName := req.String()

	rotation, err := certrotation.ParseRotation(r.FeatureGates.BuiltInCertManagement)
	if err != nil {
		return ctrl.Result{Requeue: false}, err
	}

	// Both the "still valid" and the "annotated" outcome re-check after this interval.
	interval := expiryRetryAfter(rotation.TargetCertRefresh)
	retryLater := ctrl.Result{RequeueAfter: interval}
	r.Log.Info("Checking if LokiStack certificates expired", "name", stackName, "interval", interval.String())

	checkErr := handlers.CheckCertExpiry(ctx, r.Log, req, r.Client, r.FeatureGates)

	var expiredErr *certrotation.CertExpiredError
	if !errors.As(checkErr, &expiredErr) {
		if checkErr != nil {
			// Unexpected failure while checking the certificates.
			return retryNow, checkErr
		}

		r.Log.Info("Skipping cert rotation, all LokiStack certificates still valid", "name", stackName)
		return retryLater, nil
	}

	// From here on at least one certificate is known to be expired.
	r.Log.Info("Certificate expired", "msg", expiredErr.Error())
	r.Log.Error(checkErr, "LokiStack certificates expired", "name", stackName)

	if err := lokistack.AnnotateForRequiredCertRotation(ctx, r.Client, req.Name, req.Namespace); err != nil {
		r.Log.Error(err, "failed to annotate required cert rotation", "name", stackName)
		return retryNow, err
	}

	return retryLater, nil
}

// SetupWithManager registers the reconciler as a controller on the given manager.
// It wraps the manager's controller builder in the k8s builder abstraction and
// delegates the actual wiring to buildController.
func (r *CertRotationReconciler) SetupWithManager(mgr ctrl.Manager) error {
	return r.buildController(k8s.NewCtrlBuilder(ctrl.NewControllerManagedBy(mgr)))
}

// buildController wires the reconciler into the given builder: LokiStack is the
// primary resource, Secrets are registered as owned resources, and the reconciler
// itself completes the controller.
func (r *CertRotationReconciler) buildController(bld k8s.Builder) error {
	forStacks := bld.For(&lokiv1.LokiStack{})
	withSecrets := forStacks.Owns(&corev1.Secret{})

	return withSecrets.Complete(r)
}

// expiryRetryAfter derives the interval between two certificate expiry checks from
// the certificate refresh duration: a quarter of the refresh duration when it is at
// most one day, and a fixed twelve hours when it is longer than one day.
func expiryRetryAfter(certRefresh time.Duration) time.Duration {
	const (
		oneDay  = 24 * time.Hour
		halfDay = 12 * time.Hour
	)

	if certRefresh <= oneDay {
		return certRefresh / 4
	}

	return halfDay
}
74 changes: 74 additions & 0 deletions operator/controllers/loki/certrotation_controller_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package controllers

import (
"testing"
"time"

lokiv1 "github.com/grafana/loki/operator/apis/loki/v1"
"github.com/grafana/loki/operator/internal/external/k8s/k8sfakes"
"github.com/stretchr/testify/require"
corev1 "k8s.io/api/core/v1"
)

// TestCertRotationController_RegistersCustomResource_WithDefaultPredicates verifies
// that buildController registers LokiStack as the controller's primary resource
// exactly once.
func TestCertRotationController_RegistersCustomResource_WithDefaultPredicates(t *testing.T) {
	builder := &k8sfakes.FakeBuilder{}
	// The fake returns itself so the fluent For/Owns chain keeps working.
	builder.ForReturns(builder)
	builder.OwnsReturns(builder)

	reconciler := &CertRotationReconciler{
		Client: &k8sfakes.FakeClient{},
		Scheme: scheme,
	}

	require.NoError(t, reconciler.buildController(builder))

	// Require only one For-Call for the custom resource
	require.Equal(t, 1, builder.ForCallCount())

	// Require For-call with LokiStack resource
	registered, _ := builder.ForArgsForCall(0)
	require.Equal(t, &lokiv1.LokiStack{}, registered)
}

// TestCertRotationController_RegisterOwnedResources_WithDefaultPredicates verifies
// that buildController registers Secret as the only owned resource.
func TestCertRotationController_RegisterOwnedResources_WithDefaultPredicates(t *testing.T) {
	builder := &k8sfakes.FakeBuilder{}
	// The fake returns itself so the fluent For/Owns chain keeps working.
	builder.ForReturns(builder)
	builder.OwnsReturns(builder)

	reconciler := &CertRotationReconciler{
		Client: &k8sfakes.FakeClient{},
		Scheme: scheme,
	}

	require.NoError(t, reconciler.buildController(builder))

	// Exactly one owned resource type is expected.
	require.Equal(t, 1, builder.OwnsCallCount())

	owned, _ := builder.OwnsArgsForCall(0)
	require.Equal(t, &corev1.Secret{}, owned)
}

func TestCertRotationController_ExpiryRetryAfter(t *testing.T) {
tt := []struct {
desc string
refresh time.Duration
wantDuration time.Duration
wantError bool
}{
{
desc: "multi-day refresh durarion",
refresh: 120 * time.Hour,
wantDuration: 12 * time.Hour,
},
{
desc: "less than a day refresh duration",
refresh: 10 * time.Hour,
wantDuration: 2*time.Hour + 30*time.Minute,
},
}
for _, tc := range tt {
tc := tc
t.Run(tc.desc, func(t *testing.T) {
t.Parallel()
require.Equal(t, tc.wantDuration, expiryRetryAfter(tc.refresh))
})
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package lokistack

import (
"context"
"fmt"
"time"

"github.com/ViaQ/logerr/v2/kverrors"
lokiv1 "github.com/grafana/loki/operator/apis/loki/v1"
"github.com/grafana/loki/operator/internal/external/k8s"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// certRotationRequiredAtKey is the LokiStack annotation key that
// AnnotateForRequiredCertRotation sets to the current UTC time in RFC3339 format.
const certRotationRequiredAtKey = "loki.grafana.com/certRotationRequiredAt"

// AnnotateForRequiredCertRotation sets the `loki.grafana.com/certRotationRequiredAt`
// annotation on the named LokiStack to the current UTC time in RFC3339 format,
// adding the annotation if it is missing and overwriting it otherwise.
//
// A LokiStack that cannot be found is not an error: the function returns nil
// without doing anything. Any other lookup failure and any update failure are
// returned wrapped with the object key.
func AnnotateForRequiredCertRotation(ctx context.Context, k k8s.Client, name, namespace string) error {
	key := client.ObjectKey{Name: name, Namespace: namespace}

	var stack lokiv1.LokiStack
	err := k.Get(ctx, key, &stack)
	switch {
	case apierrors.IsNotFound(err):
		// Do nothing
		return nil
	case err != nil:
		return kverrors.Wrap(err, "failed to get lokistack", "key", key)
	}

	// Work on a copy so the fetched object stays untouched.
	updated := stack.DeepCopy()
	if updated.Annotations == nil {
		updated.Annotations = map[string]string{}
	}

	now := time.Now().UTC()
	updated.Annotations[certRotationRequiredAtKey] = now.Format(time.RFC3339)

	if err := k.Update(ctx, updated); err != nil {
		msg := fmt.Sprintf("failed to update lokistack `%s` annotation", certRotationRequiredAtKey)
		return kverrors.Wrap(err, msg, "key", key)
	}

	return nil
}
Loading

0 comments on commit 27b8d8d

Please sign in to comment.